# ICP 5 Assignment – Complete

This notebook completes all required tasks:

## ✅ PART 1: Iris Dataset
- GridSearchCV (3, 5, 7 folds)
- Multiple classifiers (SVC, RF, LR, Perceptron, KNN)
- RandomizedSearchCV on two models (RandomForest and LogisticRegression)

## ✅ PART 2: Real Dataset (`CC GENERAL.csv`)
- Same steps on real-world credit card customer data, using `PURCHASES` binned into three quantile classes as the classification target

In [None]:

import pandas as pd
import numpy as np

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer


In [None]:

# Candidate classifiers and their hyper-parameter grids.
#
# Each entry maps a display name to an ``(estimator, param_grid)`` pair.
# Grid keys carry the ``classifier__`` prefix because the pipelines in this
# notebook name their final step 'classifier'.
models = {
    'SVC': (SVC(), {
        'classifier__C': [0.1, 1, 10],
        'classifier__kernel': ['linear', 'rbf']
    }),
    # Seeded so that repeated runs of the notebook pick the same forest;
    # 42 matches the seed used for train_test_split.
    'RandomForest': (RandomForestClassifier(random_state=42), {
        'classifier__n_estimators': [50, 100],
        'classifier__max_depth': [None, 10, 20]
    }),
    'LogisticRegression': (LogisticRegression(max_iter=500), {
        'classifier__C': [0.1, 1, 10],
        'classifier__solver': ['lbfgs']
    }),
    'Perceptron': (Perceptron(), {
        'classifier__penalty': [None, 'l2'],
        'classifier__max_iter': [500]
    }),
    'KNN': (KNeighborsClassifier(), {
        'classifier__n_neighbors': [3, 5, 7]
    })
}


## 🌸 PART 1: Iris Dataset

In [None]:

# Load Iris and hold out 20% of the samples for the final test score.
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Exhaustive grid search for every candidate model at 3, 5 and 7 CV folds.
for fold in [3, 5, 7]:
    print(f"\n{'='*40}\nIris Dataset – GridSearchCV ({fold}-fold)\n{'='*40}")
    for name, (model, params) in models.items():
        print(f"\nModel: {name}")
        # Scale -> project onto 2 principal components -> classify.
        pipe = Pipeline([
            ('scaler', StandardScaler()),
            ('pca', PCA(n_components=2)),
            ('classifier', model)
        ])
        grid = GridSearchCV(pipe, param_grid=params, cv=fold)
        grid.fit(X_train, y_train)
        print("Best params:", grid.best_params_)
        # best_score_ is the mean cross-validated score of the best candidate,
        # not a score on the training set, so it is labelled accordingly.
        print("Best CV Score:", grid.best_score_)
        print("Test Score:", grid.score(X_test, y_test))


In [None]:

print("\nRandomizedSearchCV on Iris Dataset")

# Both the forest and the random sampling of candidates are seeded so that
# Restart & Run All reproduces the same best parameters and test scores.
# 42 matches the seed used for train_test_split.

# RandomForest: 4 candidates sampled out of the 12 combinations below.
pipe_rf = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('classifier', RandomForestClassifier(random_state=42))
])
params_rf = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 10, 20, 30]
}
rand_rf = RandomizedSearchCV(
    pipe_rf, param_distributions=params_rf, n_iter=4, cv=5, random_state=42
)
rand_rf.fit(X_train, y_train)
print("RandomForest Best:", rand_rf.best_params_, rand_rf.score(X_test, y_test))

# LogisticRegression: the grid has exactly 4 combinations, so n_iter=4
# evaluates every one of them.
pipe_lr = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('classifier', LogisticRegression(max_iter=1000))
])
params_lr = {
    'classifier__C': [0.01, 0.1, 1, 10],
    'classifier__solver': ['lbfgs']
}
rand_lr = RandomizedSearchCV(
    pipe_lr, param_distributions=params_lr, n_iter=4, cv=5, random_state=42
)
rand_lr.fit(X_train, y_train)
print("LogisticRegression Best:", rand_lr.best_params_, rand_lr.score(X_test, y_test))


## 💳 PART 2: Real Dataset (CC GENERAL.csv)

In [None]:

# Load the credit-card data; CUST_ID is dropped before modelling.
data = pd.read_csv("CC GENERAL.csv")
data = data.drop(columns=['CUST_ID'])

# Target: continuous PURCHASES binned into three quantile classes (0, 1, 2).
y = pd.qcut(data['PURCHASES'], q=3, labels=[0, 1, 2])  # Convert continuous PURCHASES to classification

# Features: everything except PURCHASES. Leaving the column the target was
# derived from inside X would let the classifiers read the label directly.
# NOTE(review): other purchase-related columns may still be strongly tied to
# the target - confirm which columns are legitimate predictors.
features = data.drop(columns=['PURCHASES'])

# Split BEFORE imputing so the imputer's column means come from training rows
# only and the held-out rows do not influence the fill values.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    features, y, test_size=0.2, random_state=42
)

# Impute missing
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train_raw)

X_train = pd.DataFrame(
    imputer.transform(X_train_raw), columns=features.columns, index=X_train_raw.index
)
X_test = pd.DataFrame(
    imputer.transform(X_test_raw), columns=features.columns, index=X_test_raw.index
)
# Full imputed feature frame (filled with the training-set means).
X = pd.DataFrame(
    imputer.transform(features), columns=features.columns, index=features.index
)

# GridSearch on folds
for fold in [3, 5, 7]:
    print(f"\n{'='*40}\nCC GENERAL – GridSearchCV ({fold}-fold)\n{'='*40}")
    for name, (model, params) in models.items():
        print(f"\nModel: {name}")
        # Scale -> project onto 5 principal components -> classify.
        pipe = Pipeline([
            ('scaler', StandardScaler()),
            ('pca', PCA(n_components=5)),
            ('classifier', model)
        ])
        grid = GridSearchCV(pipe, param_grid=params, cv=fold)
        grid.fit(X_train, y_train)
        print("Best params:", grid.best_params_)
        # best_score_ is the mean cross-validated score of the best candidate,
        # not a score on the training set, so it is labelled accordingly.
        print("Best CV Score:", grid.best_score_)
        print("Test Score:", grid.score(X_test, y_test))


In [None]:

print("\nRandomizedSearchCV on CC GENERAL Dataset")

# Both the forest and the random sampling of candidates are seeded so that
# Restart & Run All reproduces the same best parameters and test scores.
# 42 matches the seed used for train_test_split.

# RandomForest: 4 candidates sampled out of the 12 combinations below.
pipe_rf = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=5)),
    ('classifier', RandomForestClassifier(random_state=42))
])
params_rf = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 10, 20, 30]
}
rand_rf = RandomizedSearchCV(
    pipe_rf, param_distributions=params_rf, n_iter=4, cv=5, random_state=42
)
rand_rf.fit(X_train, y_train)
print("RandomForest Best:", rand_rf.best_params_, rand_rf.score(X_test, y_test))

# LogisticRegression: the grid has exactly 4 combinations, so n_iter=4
# evaluates every one of them.
pipe_lr = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=5)),
    ('classifier', LogisticRegression(max_iter=1000))
])
params_lr = {
    'classifier__C': [0.01, 0.1, 1, 10],
    'classifier__solver': ['lbfgs']
}
rand_lr = RandomizedSearchCV(
    pipe_lr, param_distributions=params_lr, n_iter=4, cv=5, random_state=42
)
rand_lr.fit(X_train, y_train)
print("LogisticRegression Best:", rand_lr.best_params_, rand_lr.score(X_test, y_test))
