## In this notebook i'll try clustering then applying SGD (linear for high n_samples)

In [1]:
import pandas as pd

df = pd.read_csv('data/train_pp.csv')
test_df = pd.read_csv('data/test_pp.csv')

X = df.drop(['accident_risk', 'id'], axis=1)
y = df['accident_risk']

def submission_generator(trained_model):
    test_df_preprocessed = test_df.drop('id', axis=1)
    return pd.concat([test_df['id'], pd.Series(trained_model.predict(test_df_preprocessed))], axis=1).rename({0: 'accident_risk'}, axis=1)

## Clustering

In [3]:
from sklearn.cluster import KMeans
from sklearn.model_selection import KFold
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import root_mean_squared_error
import numpy as np


In [None]:
range_n_clusters = [2, 4, 6, 8, 10]
range_n_clusters = [8]
kf = KFold(n_splits=5, shuffle=True)

for n_clusters in range_n_clusters:
    print(f"\n=== n_clusters = {n_clusters} ===")
    fold_scores = []

    for train_idx, test_idx in kf.split(X):
        X_train = X.iloc[train_idx]
        y_train = y.iloc[train_idx]
        X_test = X.iloc[test_idx]
        y_test = y.iloc[test_idx]

        kmeans = KMeans(n_clusters=n_clusters)
        cluster_labels_train = kmeans.fit_predict(X_train)
        cluster_labels_test = kmeans.predict(X_test)

        preds = np.zeros_like(y_test, dtype=float)

        for c in range(n_clusters):
            idx_train = np.where(cluster_labels_train == c)[0]
            idx_test = np.where(cluster_labels_test == c)[0]

            if len(idx_train) == 0 or len(idx_test) == 0:
                continue 

            sgd = SGDRegressor(
                penalty=None,
                alpha = 4.223601774273774,
                l1_ratio=0.8940821225850496,
                max_iter=5_000
            )
            sgd.fit(X_train.iloc[idx_train], y_train.iloc[idx_train])
            preds[idx_test] = sgd.predict(X_test.iloc[idx_test])

        fold_score = root_mean_squared_error(y_test, preds)
        print(fold_score)
        fold_scores.append(fold_score)

    print(f"Mean CV root mean squared for {n_clusters} clusters: {np.mean(fold_scores):.4f}")


=== n_clusters = 8 ===


In [30]:
fold_scores

[]

In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import (
    KMeans, MiniBatchKMeans, AgglomerativeClustering,
    SpectralClustering, DBSCAN
)
from sklearn.mixture import GaussianMixture
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import root_mean_squared_error

# Example ranges (you can modify)
range_n_clusters = [2, 4, 6, 8, 10]
range_n_clusters = [2]
kf = KFold(n_splits=5, shuffle=True)

# Define clustering methods
clustering_methods = {
    # "KMeans": KMeans,
    # "MiniBatchKMeans": MiniBatchKMeans,
    # "GaussianMixture": GaussianMixture,
    # "AgglomerativeClustering": AgglomerativeClustering,
    # "SpectralClustering": SpectralClustering,
    "DBSCAN": DBSCAN
}

for method_name, ClusteringModel in clustering_methods.items():
    print(f"\n{'='*20} {method_name} {'='*20}")

    # Handle DBSCAN separately (no n_clusters parameter)
    if method_name == "DBSCAN":
        eps_values = [0.3, 0.5, 0.7]
        min_samples_values = [3, 5]

        for eps in eps_values:
            for min_samples in min_samples_values:
                print(f"\n--- DBSCAN (eps={eps}, min_samples={min_samples}) ---")
                fold_scores = []

                for train_idx, test_idx in kf.split(X):
                    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
                    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

                    clusterer = ClusteringModel(eps=eps, min_samples=min_samples)
                    cluster_labels_train = clusterer.fit_predict(X_train)

                    # DBSCAN doesn’t support .predict(), so re-fit on combined data
                    clusterer_full = ClusteringModel(eps=eps, min_samples=min_samples)
                    cluster_labels = clusterer_full.fit_predict(pd.concat([X_train, X_test]))
                    cluster_labels_test = cluster_labels[len(X_train):]

                    preds = np.zeros_like(y_test, dtype=float)
                    unique_clusters = np.unique(cluster_labels_train[cluster_labels_train != -1])

                    for c in unique_clusters:
                        idx_train = np.where(cluster_labels_train == c)[0]
                        idx_test = np.where(cluster_labels_test == c)[0]

                        if len(idx_train) == 0 or len(idx_test) == 0:
                            continue

                        sgd = SGDRegressor(
                            penalty=None,
                            alpha=4.223601774273774,
                            l1_ratio=0.8940821225850496,
                            max_iter=5000
                        )
                        sgd.fit(X_train.iloc[idx_train], y_train.iloc[idx_train])
                        preds[idx_test] = sgd.predict(X_test.iloc[idx_test])

                    fold_score = root_mean_squared_error(y_test, preds)
                    print(f"Fold RMSE: {fold_score:.4f}")
                    fold_scores.append(fold_score)

                print(f"Mean CV RMSE (eps={eps}, min_samples={min_samples}): {np.mean(fold_scores):.4f}")

    # Handle normal clustering methods
    else:
        for n_clusters in range_n_clusters:
            print(f"\n--- n_clusters = {n_clusters} ---")
            fold_scores = []

            for train_idx, test_idx in kf.split(X):
                X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
                y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

                # Instantiate clustering model
                if method_name == "GaussianMixture":
                    clusterer = ClusteringModel(n_components=n_clusters, random_state=42)
                    cluster_labels_train = clusterer.fit_predict(X_train)
                    cluster_labels_test = clusterer.predict(X_test)
                elif method_name in ["AgglomerativeClustering", "SpectralClustering"]:
                    clusterer = ClusteringModel(n_clusters=n_clusters)
                    cluster_labels = clusterer.fit_predict(pd.concat([X_train, X_test]))
                    cluster_labels_train = cluster_labels[:len(X_train)]
                    cluster_labels_test = cluster_labels[len(X_train):]
                else:
                    clusterer = ClusteringModel(n_clusters=n_clusters, random_state=42)
                    cluster_labels_train = clusterer.fit_predict(X_train)
                    cluster_labels_test = clusterer.predict(X_test)

                preds = np.zeros_like(y_test, dtype=float)

                for c in range(n_clusters):
                    idx_train = np.where(cluster_labels_train == c)[0]
                    idx_test = np.where(cluster_labels_test == c)[0]

                    if len(idx_train) == 0 or len(idx_test) == 0:
                        continue

                    sgd = SGDRegressor(
                        penalty=None,
                        alpha=4.223601774273774,
                        l1_ratio=0.8940821225850496,
                        max_iter=5000
                    )
                    sgd.fit(X_train.iloc[idx_train], y_train.iloc[idx_train])
                    preds[idx_test] = sgd.predict(X_test.iloc[idx_test])

                fold_score = root_mean_squared_error(y_test, preds)
                print(f"Fold RMSE: {fold_score:.4f}")
                fold_scores.append(fold_score)

            print(f"Mean CV RMSE for {n_clusters} clusters: {np.mean(fold_scores):.4f}")



--- DBSCAN (eps=0.3, min_samples=3) ---




Fold RMSE: 4966493344438.1250




Fold RMSE: 4950390732226.8760


==================== KMeans ====================

--- n_clusters = 2 ---
Fold RMSE: 0.3478
Fold RMSE: 0.4287
Fold RMSE: 0.4337
Fold RMSE: 0.4720
Fold RMSE: 0.4335
Mean CV RMSE for 2 clusters: 0.4232

==================== MiniBatchKMeans ====================

--- n_clusters = 2 ---
Fold RMSE: 0.4418
Fold RMSE: 0.4515
Fold RMSE: 0.4592
Fold RMSE: 0.3536
Fold RMSE: 0.3318
Mean CV RMSE for 2 clusters: 0.4076

==================== GaussianMixture ====================

--- n_clusters = 2 ---
Fold RMSE: 0.1868
Fold RMSE: 0.2020
Fold RMSE: 0.1998
Fold RMSE: 0.1889
Fold RMSE: 0.2110
Mean CV RMSE for 2 clusters: 0.1977