# 1. Imports & Settings

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import warnings

### Set up

In [None]:
warnings.filterwarnings("ignore")
warnings.simplefilter("ignore")

pd.set_option('display.max_columns',None)
# pd.set_option('display.max_rows',None)

pd.plotting.register_matplotlib_converters()
sns.set(style="whitegrid", palette="muted", font_scale=1.1)
plt.rcParams["figure.figsize"] = (10, 6)
%matplotlib inline

# 2. Load Data

In [None]:
# Locations of the two CSV files; left empty here as placeholders to fill in.
filepath = ""
filepath_test = ""

# Read each file into its own DataFrame.
df = pd.read_csv(filepath_or_buffer=filepath)
df_test = pd.read_csv(filepath_or_buffer=filepath_test)

# Preview the first five rows of the main frame.
df.head(5)

# 3. Quick Data Info

In [None]:
# Dimensions of the dataset as (rows, columns).
print(df.shape)
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()
# Summary statistics, transposed so each original column becomes a row.
df.describe().T

# 4. EDA

In [None]:
# Number of missing entries in each column.
missing_per_column = df.isna().sum()
print(missing_per_column)

# Number of rows flagged as duplicates of an earlier row.
duplicate_row_count = df.duplicated().sum()
print(duplicate_row_count)

In [None]:
# Target distribution: one bar per class in the "target" column.
sns.countplot(x="target", data=df)
plt.show()

# Numerical distributions: one 30-bin histogram per column that hist() plots.
df.hist(bins=30, figsize=(15, 10))
plt.show()

# Correlation heatmap.
# Only numeric/boolean columns are correlated: on pandas >= 2.0, calling
# DataFrame.corr() on a frame that still contains non-numeric columns raises
# instead of silently dropping them, so they are filtered out explicitly.
numeric_df = df.select_dtypes(include=["number", "bool"])
sns.heatmap(numeric_df.corr(), annot=False, cmap="coolwarm")
plt.show()

# 5. Data Preparation

### Split

In [None]:
# Separate the label column from the remaining (feature) columns.
y = df["target"]
X = df.drop(columns="target")

# Hold out 20% of the rows for evaluation. The split is stratified on y and
# uses a fixed random_state so it is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# 6. Baseline Model

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier fitted on the training split.
# max_iter is raised above scikit-learn's default of 100: the features are fed
# in unscaled, which commonly keeps the solver from converging that quickly,
# and warnings are silenced earlier in this notebook, so a non-converged fit
# would otherwise pass unnoticed.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predictions on the held-out split.
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# confusion_matrix() puts the true classes on the rows and the predicted
# classes on the columns; label the axes so the heatmap is unambiguous.
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.show()

# 7. Next Steps

In [None]:
# - Try different models
# - Hyperparameter tuning
# - Feature selection / engineering
# - Cross-validation