# 1. Exploratory Data Analysis and Reusable Functions

In [None]:
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# Import libraries
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, mean_squared_error, r2_score, confusion_matrix)

# Load CSV
def load_csv(path):
    """Read the CSV at ``path`` with pandas defaults and return the DataFrame."""
    frame = pd.read_csv(path)
    return frame

# Basic preprocessing
def basic_clean(df, drop_cols=None):
    """Return a cleaned copy of ``df``; the input frame is never modified.

    Args:
        df: Input DataFrame.
        drop_cols: Optional list of column names to remove. Names that are
            not present are ignored.

    Returns:
        A new DataFrame in which missing values of numeric columns are
        replaced by the column median and missing values of all other
        columns by the column's most frequent value. Columns that contain
        no observed value at all are left untouched.
    """
    df = df.copy()
    if drop_cols:
        df = df.drop(columns=drop_cols, errors='ignore')
    for c in df.select_dtypes(include=[np.number]).columns:
        # Assign back instead of `df[c].fillna(..., inplace=True)`: the chained
        # in-place form operates on a temporary under pandas Copy-on-Write.
        df[c] = df[c].fillna(df[c].median())
    for c in df.select_dtypes(exclude=[np.number]).columns:
        modes = df[c].mode()
        # mode() is empty when every value is missing; nothing to impute with.
        if not modes.empty:
            df[c] = df[c].fillna(modes.iloc[0])
    return df

# Build X and y according to the problem type
def define_xy(df, task_type, target=None):
    """Split ``df`` into a feature frame and a target for the given task.

    Args:
        df: Input DataFrame.
        task_type: 'regression', 'classification' or 'unsupervised'.
        target: Name of the target column. Required for the two supervised
            task types; optional for 'unsupervised'.

    Returns:
        ``(X, y)`` where
        * regression: ``X`` is ``df`` without the target column and ``y`` is
          the target column as-is;
        * classification: same ``X``; ``y`` is the target encoded to integer
          codes by ``LabelEncoder`` (the encoder itself is not returned);
        * unsupervised: ``y`` is None; ``X`` is ``df`` with the target column
          removed when ``target`` names an existing column, otherwise ``df``
          itself.

    Raises:
        ValueError: If ``task_type`` is not one of the supported values.
        KeyError: If a supervised task is requested and ``target`` is None or
            not a column of ``df``.
    """
    if task_type == 'unsupervised':
        # A known label column must not leak into the features used for
        # clustering / dimensionality reduction.
        if target is not None and target in df.columns:
            return df.drop(columns=[target]), None
        return df, None

    if task_type not in ('regression', 'classification'):
        raise ValueError("task_type phải là 'regression', 'classification' hoặc 'unsupervised'")

    # Fail with an explicit message instead of pandas' generic lookup error.
    if target is None or target not in df.columns:
        raise KeyError(f"target column {target!r} not found in DataFrame")

    X = df.drop(columns=[target])
    y = df[target]
    if task_type == 'classification':
        y = LabelEncoder().fit_transform(y)
    return X, y

# Split the data into train / validation / test sets
def split_xy(X, y=None, test_size=0.2, val_size=0.1, random_state=42):
    """Split features (and target, if given) into subsets.

    Args:
        X: Feature data.
        y: Optional target. When None, only a train/test split of ``X`` is done.
        test_size: Share of the full data held out for testing.
        val_size: Share of the full data held out for validation.
        random_state: Seed passed to both ``train_test_split`` calls.

    Returns:
        ``(X_train, X_test)`` when ``y`` is None, otherwise
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    if y is not None:
        X_rest, X_test, y_rest, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )
        # val_size refers to the full data set; rescale it to the share of
        # what remains after the test rows were removed.
        val_share = val_size / (1 - test_size)
        X_train, X_val, y_train, y_val = train_test_split(
            X_rest, y_rest, test_size=val_share, random_state=random_state
        )
        return X_train, X_val, X_test, y_train, y_val, y_test

    return train_test_split(X, test_size=test_size, random_state=random_state)

# Scaling
def scale_data(X_train, X_val=None, X_test=None, method='standard'):
    """Fit a scaler on the training data and apply it to the other splits.

    Args:
        X_train: Training features; the scaler is fitted on these only.
        X_val: Optional validation features.
        X_test: Optional test features.
        method: 'standard' selects ``StandardScaler``; any other value selects
            ``MinMaxScaler``.

    Returns:
        ``(scaler, X_train_s, X_val_s, X_test_s)``; the entry for a split that
        was not supplied is None.
    """
    if method == 'standard':
        scaler = StandardScaler()
    else:
        scaler = MinMaxScaler()

    X_train_s = scaler.fit_transform(X_train)
    X_val_s = None if X_val is None else scaler.transform(X_val)
    X_test_s = None if X_test is None else scaler.transform(X_test)
    return scaler, X_train_s, X_val_s, X_test_s

# Save / load model
def save_model(model, path):
    """Serialize ``model`` to ``path`` with joblib.

    Args:
        model: Any object joblib can dump (estimator, dict of artifacts, ...).
        path: Destination. When it is a string or path-like object, missing
            parent directories are created first; any other value (e.g. an
            open file object) is handed to ``joblib.dump`` unchanged.
    """
    import os  # local import keeps this helper self-contained

    if isinstance(path, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(path))
        # An empty parent means "current directory": nothing to create.
        if parent:
            os.makedirs(parent, exist_ok=True)
    joblib.dump(model, path)

def load_model(path):
    """Load and return the object stored at ``path`` by joblib.

    NOTE(review): joblib files are pickle-based, so loading one can execute
    arbitrary code. Only load artifacts from a trusted source.
    """
    model = joblib.load(path)
    return model

# Shared evaluation helpers
def regression_metrics(y_true, y_pred):
    """Return MSE, RMSE and R² of ``y_pred`` against ``y_true``.

    Returns:
        Dict with keys 'mse', 'rmse' and 'r2'.
    """
    # Compute the MSE once and derive the RMSE from it.
    mse = mean_squared_error(y_true, y_pred)
    return {
        'mse': mse,
        'rmse': np.sqrt(mse),
        'r2': r2_score(y_true, y_pred),
    }

def classification_metrics(y_true, y_pred, y_prob=None):
    """Compute accuracy, precision, recall, F1 and (optionally) ROC AUC.

    Precision, recall and F1 use ``average='binary'`` when ``y_true`` holds
    exactly two distinct values and ``average='macro'`` otherwise; undefined
    scores are reported as 0 (``zero_division=0``).

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        y_prob: Optional scores/probabilities. Either a 1-D array of
            positive-class scores, or the 2-D output of ``predict_proba``
            (for two classes the second column is used; for more classes a
            one-vs-rest AUC is computed).

    Returns:
        Dict with 'accuracy', 'precision', 'recall', 'f1' and, when ``y_prob``
        is given, 'roc_auc' (None if the AUC cannot be computed).
    """
    is_binary = len(np.unique(y_true)) == 2
    average = 'binary' if is_binary else 'macro'
    out = {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, average=average, zero_division=0),
        'recall': recall_score(y_true, y_pred, average=average, zero_division=0),
        'f1': f1_score(y_true, y_pred, average=average, zero_division=0),
    }
    if y_prob is not None:
        # ROC AUC is best-effort: an uncomputable score is reported as None
        # rather than aborting the whole evaluation.
        try:
            scores = np.asarray(y_prob)
            if scores.ndim == 2 and is_binary and scores.shape[1] == 2:
                out['roc_auc'] = roc_auc_score(y_true, scores[:, 1])
            elif scores.ndim == 2 and not is_binary:
                out['roc_auc'] = roc_auc_score(y_true, scores, multi_class='ovr')
            else:
                out['roc_auc'] = roc_auc_score(y_true, scores)
        except (ValueError, TypeError, IndexError):
            out['roc_auc'] = None
    return out


# 2. Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Load the data, impute missing values and drop the identifier column
df = basic_clean(load_csv('../data/dataset.csv'), drop_cols=['id'])
X, y = define_xy(df, task_type='regression', target='target')

# Train/validation/test split, then scaling fitted on the training split only
X_train, X_val, X_test, y_train, y_val, y_test = split_xy(X, y)
scaler, X_train_s, X_val_s, X_test_s = scale_data(X_train, X_val, X_test)

# Train (fit returns the estimator itself)
model = LinearRegression().fit(X_train_s, y_train)

# Evaluate on the held-out test split
pred = model.predict(X_test_s)
test_scores = regression_metrics(y_test, pred)
print(test_scores)

# Persist the model together with its scaler so new data can be transformed
# the same way before predicting
artifact = {'model': model, 'scaler': scaler}
save_model(artifact, '../models/linear_regression.joblib')

# 3. Polynomial Regression

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression

# Load & preprocess
df = basic_clean(load_csv('../data/dataset.csv'), drop_cols=['id'])
X, y = define_xy(df, task_type='regression', target='target')
X_train, X_val, X_test, y_train, y_val, y_test = split_xy(X, y)
# NOTE(review): the pre-scaled arrays are not used below; the pipeline applies
# its own StandardScaler after the polynomial expansion.
scaler, X_train_s, X_val_s, X_test_s = scale_data(X_train, X_val, X_test)

# Polynomial expansion -> scaling -> linear regression, tuned by grid search
poly_steps = [
    ('poly', PolynomialFeatures()),
    ('scaler', StandardScaler()),
    ('lr', LinearRegression()),
]
pipe = Pipeline(poly_steps)
param_grid = {'poly__degree': [2,3], 'poly__include_bias':[False]}

# 5-fold CV on the raw training features, ranked by R^2
search = GridSearchCV(pipe, param_grid, cv=5, scoring='r2').fit(X_train, y_train)
print(search.best_params_, search.best_score_)

# The saved pipeline carries its own preprocessing steps
save_model(search.best_estimator_, '../models/poly_regression.joblib')

# 4. Regularized Regression

In [None]:
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV

# Load & preprocess
df = basic_clean(load_csv('../data/dataset.csv'), drop_cols=['id'])
X, y = define_xy(df, task_type='regression', target='target')
X_train, X_val, X_test, y_train, y_val, y_test = split_xy(X, y)
scaler, X_train_s, X_val_s, X_test_s = scale_data(X_train, X_val, X_test)

# Each entry maps a model name to (estimator, hyper-parameter grid)
models = {
    'ridge': (Ridge(), {'alpha':[0.1,1,10]}),
    'lasso': (Lasso(max_iter=5000), {'alpha':[0.001,0.01,0.1,1]}),
    'elastic': (ElasticNet(max_iter=5000), {'alpha':[0.01,0.1,1], 'l1_ratio':[0.2,0.5,0.8]})
}

# Tune every model with 5-fold CV on the scaled training split
for name, (est, params) in models.items():
    grid = GridSearchCV(est, params, cv=5, scoring='neg_mean_squared_error').fit(X_train_s, y_train)
    # The scorer is negated MSE; flip the sign to report a plain MSE
    cv_mse = -grid.best_score_
    print(name, grid.best_params_, cv_mse)
    # NOTE(review): only the estimator is saved; the fitted scaler is not
    save_model(grid.best_estimator_, f'../models/{name}.joblib')

# 5. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
# Prepare data: load, clean, encode the target, split and scale
df = basic_clean(load_csv('../data/dataset.csv'), drop_cols=['id'])
X, y = define_xy(df, task_type='classification', target='target')
X_train, X_val, X_test, y_train, y_val, y_test = split_xy(X, y)
scaler, X_train_s, X_val_s, X_test_s = scale_data(X_train, X_val, X_test)

# Tune C with 5-fold CV, ranked by ROC AUC
clf = LogisticRegression(max_iter=5000)
param = {'C':[0.01,0.1,1,10], 'penalty':['l2'], 'solver':['lbfgs']}
gs = GridSearchCV(clf, param, cv=5, scoring='roc_auc').fit(X_train_s, y_train)
print(gs.best_params_, gs.best_score_)

# Evaluate on the test split; column 1 of predict_proba is passed as the score
preds = gs.predict(X_test_s)
probs = gs.predict_proba(X_test_s)[:,1]
test_scores = classification_metrics(y_test, preds, probs)
print(test_scores)

# NOTE(review): only the estimator is saved; the fitted scaler is not
save_model(gs.best_estimator_, '../models/logistic_regression.joblib')

# 6. K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Prepare data: load, clean, encode the target, split and scale
df = basic_clean(load_csv('../data/dataset.csv'), drop_cols=['id'])
X, y = define_xy(df, task_type='classification', target='target')
X_train, X_val, X_test, y_train, y_val, y_test = split_xy(X, y)
scaler, X_train_s, X_val_s, X_test_s = scale_data(X_train, X_val, X_test)

# Search neighbourhood size and vote weighting with 5-fold CV, ranked by F1
knn = KNeighborsClassifier()
param = {'n_neighbors':[3,5,7,9], 'weights':['uniform','distance']}
gs = GridSearchCV(knn, param, cv=5, scoring='f1').fit(X_train_s, y_train)
print(gs.best_params_)

# Evaluate on the test split
test_preds = gs.predict(X_test_s)
print(classification_metrics(y_test, test_preds))

# NOTE(review): only the estimator is saved; the fitted scaler is not
save_model(gs.best_estimator_, '../models/knn.joblib')

# 7. Support Vector Machine

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Prepare data: load, clean, encode the target, split and scale
df = basic_clean(load_csv('../data/dataset.csv'), drop_cols=['id'])
X, y = define_xy(df, task_type='classification', target='target')
X_train, X_val, X_test, y_train, y_val, y_test = split_xy(X, y)
scaler, X_train_s, X_val_s, X_test_s = scale_data(X_train, X_val, X_test)

# Two sub-grids: gamma is only searched for the RBF kernel
param = [
    {'kernel':['linear'], 'C':[0.1,1,10]},
    {'kernel':['rbf'], 'C':[0.1,1,10], 'gamma':['scale','auto']}
]
# probability=True is needed for predict_proba below
svc = SVC(probability=True)
gs = GridSearchCV(svc, param, cv=5, scoring='roc_auc').fit(X_train_s, y_train)
print(gs.best_params_)

# Evaluate on the test split; column 1 of predict_proba is passed as the score
test_preds = gs.predict(X_test_s)
test_probs = gs.predict_proba(X_test_s)[:,1]
print(classification_metrics(y_test, test_preds, test_probs))

# NOTE(review): only the estimator is saved; the fitted scaler is not
save_model(gs.best_estimator_, '../models/svm.joblib')

# 8. Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Prepare data: load, clean, encode the target, split and scale
df = basic_clean(load_csv('../data/dataset.csv'), drop_cols=['id'])
X, y = define_xy(df, task_type='classification', target='target')
X_train, X_val, X_test, y_train, y_val, y_test = split_xy(X, y)
scaler, X_train_s, X_val_s, X_test_s = scale_data(X_train, X_val, X_test)

# Search tree depth and leaf size with 5-fold CV, ranked by F1
param = {'max_depth':[3,5,10,None], 'min_samples_leaf':[1,2,5]}
clf = DecisionTreeClassifier()
gs = GridSearchCV(clf, param, cv=5, scoring='f1').fit(X_train_s, y_train)
print(gs.best_params_)

# Evaluate on the test split
test_preds = gs.predict(X_test_s)
print(classification_metrics(y_test, test_preds))

# Importances follow the column order of the training features
best_tree = gs.best_estimator_
print('Feature importance:', best_tree.feature_importances_)

# NOTE(review): only the estimator is saved; the fitted scaler is not
save_model(best_tree, '../models/decision_tree.joblib')

# 9. Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Prepare data: load, clean, encode the target, split and scale
df = basic_clean(load_csv('../data/dataset.csv'), drop_cols=['id'])
X, y = define_xy(df, task_type='classification', target='target')
X_train, X_val, X_test, y_train, y_val, y_test = split_xy(X, y)
scaler, X_train_s, X_val_s, X_test_s = scale_data(X_train, X_val, X_test)

# Randomized search: 6 sampled settings, 3-fold CV, ranked by F1
param = {'n_estimators':[100,200], 'max_depth':[None,10,20], 'min_samples_leaf':[1,2]}
rf = RandomForestClassifier(n_jobs=-1)
gs = RandomizedSearchCV(rf, param, n_iter=6, cv=3, scoring='f1').fit(X_train_s, y_train)
print(gs.best_params_)

# Evaluate on the test split
test_preds = gs.predict(X_test_s)
print(classification_metrics(y_test, test_preds))

# NOTE(review): only the estimator is saved; the fitted scaler is not
save_model(gs.best_estimator_, '../models/random_forest.joblib')

# 10. Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
try:
    import xgboost as xgb
    has_xgb = True
except Exception:
    has_xgb = False

# Prepare data: load, clean, encode the target and split
df = basic_clean(load_csv('../data/dataset.csv'), drop_cols=['id'])
X, y = define_xy(df, task_type='classification', target='target')
X_train, X_val, X_test, y_train, y_val, y_test = split_xy(X, y)
# NOTE(review): the scaled arrays are not used below; both boosters are
# fitted on the unscaled features.
scaler, X_train_s, X_val_s, X_test_s = scale_data(X_train, X_val, X_test)

# Gradient boosting with default hyper-parameters
gb = GradientBoostingClassifier().fit(X_train, y_train)
gb_preds = gb.predict(X_test)
print(classification_metrics(y_test, gb_preds))

# XGBoost is optional: this branch runs only when the import succeeded
if has_xgb:
    xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    xgb_clf.fit(X_train, y_train)
    xgb_preds = xgb_clf.predict(X_test)
    print('XGB:', classification_metrics(y_test, xgb_preds))
    save_model(xgb_clf, '../models/xgboost.joblib')

save_model(gb, '../models/gradient_boosting.joblib')

# 11. Stacking

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Prepare data: load, clean, encode the target, split and scale
df = basic_clean(load_csv('../data/dataset.csv'), drop_cols=['id'])
X, y = define_xy(df, task_type='classification', target='target')
X_train, X_val, X_test, y_train, y_val, y_test = split_xy(X, y)
scaler, X_train_s, X_val_s, X_test_s = scale_data(X_train, X_val, X_test)

# Base learners; a logistic regression combines their outputs
estimators = [
    ('rf', RandomForestClassifier(n_estimators=50)),
    ('svc', SVC(probability=True))
]
meta_learner = LogisticRegression()
stack = StackingClassifier(estimators=estimators, final_estimator=meta_learner)
stack.fit(X_train_s, y_train)

# Evaluate on the test split; column 1 of predict_proba is passed as the score
test_preds = stack.predict(X_test_s)
test_probs = stack.predict_proba(X_test_s)[:,1]
print(classification_metrics(y_test, test_preds, test_probs))

# NOTE(review): only the estimator is saved; the fitted scaler is not
save_model(stack, '../models/stacking.joblib')

# 12. Imbalance Handling

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier

# Prepare data: load, clean, encode the target and split
df = load_csv('../data/dataset.csv')
df = basic_clean(df, drop_cols=['id'])
X, y = define_xy(df, task_type='classification', target='target')
X_train, X_val, X_test, y_train, y_val, y_test = split_xy(X, y)
# NOTE(review): the scaled arrays are not used below; all three variants are
# fitted on the unscaled features.
scaler, X_train_s, X_val_s, X_test_s = scale_data(X_train, X_val, X_test)

# Try oversampling: SMOTE runs inside the pipeline's fit, so only the
# training data is resampled
sm = SMOTE(random_state=42)
rus = RandomUnderSampler(random_state=42)
clf = RandomForestClassifier()
p = ImbPipeline([('smote', sm), ('clf', clf)])
p.fit(X_train, y_train)
print(classification_metrics(y_test, p.predict(X_test)))

# Try class-weight handling
clf2 = RandomForestClassifier(class_weight='balanced')
clf2.fit(X_train, y_train)
print(classification_metrics(y_test, clf2.predict(X_test)))

# Try undersampling. The pipeline gets its own classifier instance: reusing
# `clf` would refit the estimator that `p` still references, silently
# changing what `p.predict` returns afterwards.
p2 = ImbPipeline([('rus', rus), ('clf', RandomForestClassifier())])
p2.fit(X_train, y_train)
print(classification_metrics(y_test, p2.predict(X_test)))

# 13. K-Means Clustering

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

# Prepare data: load and clean; no target is used for clustering
df = load_csv('../data/dataset.csv')
df = basic_clean(df, drop_cols=['id'])
X, y = define_xy(df, task_type='unsupervised', target='target')

# With y=None, split_xy returns only (train, test); unpacking six values
# here would raise a ValueError.
X_train, X_test = split_xy(X, y)
# There is no validation split in the unsupervised case, so X_val_s is None
scaler, X_train_s, X_val_s, X_test_s = scale_data(X_train, None, X_test)

# Elbow method: inertia for k = 2..10
# NOTE(review): clustering runs on the full, unscaled X rather than on the
# scaled arrays prepared above — confirm this is intended.
k_values = range(2,11)
inertias = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X)
    inertias.append(kmeans.inertia_)

plt.plot(k_values, inertias)
plt.xlabel('k')
plt.ylabel('inertia')
plt.show()

# Final model; seeded like the elbow loop so the silhouette is reproducible
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(X)
print('Silhouette:', silhouette_score(X, kmeans.labels_))

# 14. Hierarchical Agglomerative Clustering

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
import matplotlib.pyplot as plt

# Prepare data: load and clean; no target is used for clustering
df = load_csv('../data/dataset.csv')
df = basic_clean(df, drop_cols=['id'])
X, y = define_xy(df, task_type='unsupervised', target='target')

# Ward linkage over all rows; show only the top 5 levels of the tree
Z = linkage(X, method='ward')
plt.figure(figsize=(10,5))
dendrogram(Z, truncate_mode='level', p=5)
plt.show()

# Cut the tree into at most 3 flat clusters
labels = fcluster(Z, t=3, criterion='maxclust')
# fcluster numbers clusters from 1, so bin 0 of bincount is always empty;
# drop it so the printed counts line up with clusters 1, 2, 3.
print('Cluster counts:', np.bincount(labels)[1:])

# 15. DBSCAN

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
import numpy as np
import matplotlib.pyplot as plt

# Prepare data: the 'classification' mode is used here to obtain X without
# the target column; the encoded y is not used below
df = basic_clean(load_csv('../data/dataset.csv'), drop_cols=['id'])
X, y = define_xy(df, task_type='classification', target='target')

# k-distance plot: sorted distance to the farthest of the 5 returned
# neighbours, used to eyeball a value for eps
nbrs = NearestNeighbors(n_neighbors=5)
nbrs.fit(X)
dists, _ = nbrs.kneighbors(X)
# With n_neighbors=5 the last column is column index 4
dists = np.sort(dists[:, -1])
plt.plot(dists)
plt.show()

# Label -1, if present, marks points DBSCAN treats as noise
db = DBSCAN(eps=0.5, min_samples=5)
labels = db.fit_predict(X)
found = np.unique(labels)
print('Unique labels:', found)

# 16. PCA

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Prepare data: load and clean; no target is used for PCA
df = load_csv('../data/dataset.csv')
df = basic_clean(df, drop_cols=['id'])
X, y = define_xy(df, task_type='unsupervised', target='target')

# Full PCA to inspect how variance accumulates over the components
pca = PCA()
X_p = pca.fit_transform(X)
explained = np.cumsum(pca.explained_variance_ratio_)
# Plot against 1..n so the x value is the number of components kept
# (the default index would start at 0 and be off by one).
plt.plot(range(1, len(explained) + 1), explained)
plt.xlabel('components')
plt.ylabel('cumulative explained variance')
plt.show()

# Preserve 95% variance: count the components whose cumulative share is
# still below 0.95, then add the one that crosses the threshold
n_comp = (explained < 0.95).sum() + 1
pca_red = PCA(n_components=n_comp)
X_reduced = pca_red.fit_transform(X)
print('n_components ->', n_comp)