In [13]:

import pandas as pd
import numpy as np

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer


In [14]:

models = {
    'SVC': (SVC(), {
        'classifier__C': [0.1, 1, 10],
        'classifier__kernel': ['linear', 'rbf']
    }),
    'RandomForest': (RandomForestClassifier(), {
        'classifier__n_estimators': [50, 100],
        'classifier__max_depth': [None, 10, 20]
    }),
    'LogisticRegression': (LogisticRegression(max_iter=500), {
        'classifier__C': [0.1, 1, 10],
        'classifier__solver': ['lbfgs']
    }),
    'Perceptron': (Perceptron(), {
        'classifier__penalty': [None, 'l2'],
        'classifier__max_iter': [500]
    }),
    'KNN': (KNeighborsClassifier(), {
        'classifier__n_neighbors': [3, 5, 7]
    })
}


 Iris Dataset

In [15]:

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

for fold in [3, 5, 7]:
    print(f"\n{'='*40}\nIris Dataset – GridSearchCV ({fold}-fold)\n{'='*40}")
    for name, (model, params) in models.items():
        print(f"\nModel: {name}")
        pipe = Pipeline([
            ('scaler', StandardScaler()),
            ('pca', PCA(n_components=2)),
            ('classifier', model)
        ])
        grid = GridSearchCV(pipe, param_grid=params, cv=fold)
        grid.fit(X_train, y_train)
        print("Best params:", grid.best_params_)
        print("Train Score:", grid.best_score_)
        print("Test Score:", grid.score(X_test, y_test))



Iris Dataset – GridSearchCV (3-fold)

Model: SVC
Best params: {'classifier__C': 10, 'classifier__kernel': 'rbf'}
Train Score: 0.9249999999999999
Test Score: 0.9

Model: RandomForest
Best params: {'classifier__max_depth': None, 'classifier__n_estimators': 50}
Train Score: 0.8916666666666666
Test Score: 0.9333333333333333

Model: LogisticRegression
Best params: {'classifier__C': 1, 'classifier__solver': 'lbfgs'}
Train Score: 0.9083333333333333
Test Score: 0.9

Model: Perceptron
Best params: {'classifier__max_iter': 500, 'classifier__penalty': None}
Train Score: 0.8416666666666667
Test Score: 0.9666666666666667

Model: KNN
Best params: {'classifier__n_neighbors': 7}
Train Score: 0.9333333333333332
Test Score: 0.8666666666666667

Iris Dataset – GridSearchCV (5-fold)

Model: SVC
Best params: {'classifier__C': 1, 'classifier__kernel': 'rbf'}
Train Score: 0.925
Test Score: 0.9

Model: RandomForest
Best params: {'classifier__max_depth': 20, 'classifier__n_estimators': 50}
Train Score: 0.90833

In [16]:

print("\nRandomizedSearchCV on Iris Dataset")
# RandomForest
pipe_rf = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('classifier', RandomForestClassifier())
])
params_rf = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 10, 20, 30]
}
rand_rf = RandomizedSearchCV(pipe_rf, param_distributions=params_rf, n_iter=4, cv=5)
rand_rf.fit(X_train, y_train)
print("RandomForest Best:", rand_rf.best_params_, rand_rf.score(X_test, y_test))

# LogisticRegression
pipe_lr = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('classifier', LogisticRegression(max_iter=1000))
])
params_lr = {
    'classifier__C': [0.01, 0.1, 1, 10],
    'classifier__solver': ['lbfgs']
}
rand_lr = RandomizedSearchCV(pipe_lr, param_distributions=params_lr, n_iter=4, cv=5)
rand_lr.fit(X_train, y_train)
print("LogisticRegression Best:", rand_lr.best_params_, rand_lr.score(X_test, y_test))



RandomizedSearchCV on Iris Dataset
RandomForest Best: {'classifier__n_estimators': 200, 'classifier__max_depth': None} 0.9
LogisticRegression Best: {'classifier__solver': 'lbfgs', 'classifier__C': 1} 0.9


 Dataset (CC GENERAL.csv)

In [17]:

# Load and process data
data = pd.read_csv("CC GENERAL.csv")
data = data.drop(columns=['CUST_ID'])

# Impute missing
imputer = SimpleImputer(strategy='mean')
data_imputed = imputer.fit_transform(data)

X = pd.DataFrame(data_imputed, columns=data.columns)
y = pd.qcut(data['PURCHASES'], q=3, labels=[0, 1, 2])  # Convert continuous PURCHASES to classification

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# GridSearch on folds
for fold in [3, 5, 7]:
    print(f"\n{'='*40}\nCC GENERAL – GridSearchCV ({fold}-fold)\n{'='*40}")
    for name, (model, params) in models.items():
        print(f"\nModel: {name}")
        pipe = Pipeline([
            ('scaler', StandardScaler()),
            ('pca', PCA(n_components=5)),
            ('classifier', model)
        ])
        grid = GridSearchCV(pipe, param_grid=params, cv=fold)
        grid.fit(X_train, y_train)
        print("Best params:", grid.best_params_)
        print("Train Score:", grid.best_score_)
        print("Test Score:", grid.score(X_test, y_test))



CC GENERAL – GridSearchCV (3-fold)

Model: SVC
Best params: {'classifier__C': 10, 'classifier__kernel': 'rbf'}
Train Score: 0.8470667405042658
Test Score: 0.8452513966480447

Model: RandomForest
Best params: {'classifier__max_depth': None, 'classifier__n_estimators': 100}
Train Score: 0.8400834804993472
Test Score: 0.8497206703910615

Model: LogisticRegression
Best params: {'classifier__C': 10, 'classifier__solver': 'lbfgs'}
Train Score: 0.8120105610709402
Test Score: 0.8039106145251397

Model: Perceptron
Best params: {'classifier__max_iter': 500, 'classifier__penalty': 'l2'}
Train Score: 0.7599212250673734
Test Score: 0.6977653631284916

Model: KNN
Best params: {'classifier__n_neighbors': 7}
Train Score: 0.8189939381297573
Test Score: 0.8296089385474861

CC GENERAL – GridSearchCV (5-fold)

Model: SVC
Best params: {'classifier__C': 10, 'classifier__kernel': 'rbf'}
Train Score: 0.8494413407821229
Test Score: 0.8452513966480447

Model: RandomForest
Best params: {'classifier__max_depth':

In [18]:

print("\nRandomizedSearchCV on CC GENERAL Dataset")
# RandomForest
pipe_rf = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=5)),
    ('classifier', RandomForestClassifier())
])
params_rf = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 10, 20, 30]
}
rand_rf = RandomizedSearchCV(pipe_rf, param_distributions=params_rf, n_iter=4, cv=5)
rand_rf.fit(X_train, y_train)
print("RandomForest Best:", rand_rf.best_params_, rand_rf.score(X_test, y_test))

# LogisticRegression
pipe_lr = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=5)),
    ('classifier', LogisticRegression(max_iter=1000))
])
params_lr = {
    'classifier__C': [0.01, 0.1, 1, 10],
    'classifier__solver': ['lbfgs']
}
rand_lr = RandomizedSearchCV(pipe_lr, param_distributions=params_lr, n_iter=4, cv=5)
rand_lr.fit(X_train, y_train)
print("LogisticRegression Best:", rand_lr.best_params_, rand_lr.score(X_test, y_test))



RandomizedSearchCV on CC GENERAL Dataset
RandomForest Best: {'classifier__n_estimators': 200, 'classifier__max_depth': 10} 0.8435754189944135
LogisticRegression Best: {'classifier__solver': 'lbfgs', 'classifier__C': 10} 0.8039106145251397
