In [None]:
# For ML models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

# For creating pipelines and performing grid search
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score

# For evaluating model performance
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score, roc_curve

# For feature selection methods
from sklearn.feature_selection import SelectKBest, chi2, RFE

# For Standardization and scaling
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the clustered dataset from its CSV file (the path points into a
# Colab-style /content/drive mount) and preview the first rows.
clustered_csv = '/content/drive/MyDrive/Case Studies/HSBC/datasets/clustered_data.csv'
data = pd.read_csv(clustered_csv)
data.head()

Unnamed: 0.1,Unnamed: 0,CustGender,CustLocation,CustAccountBalance,TransactionAmount,Age,BalTransRatio,TransactionMonth,Cluster
0,2,0,MUMBAI,17874.44,459.0,29,0.025679,2,2
1,6,0,MUMBAI,973.46,566.0,33,0.581431,2,1
2,8,0,GURGAON,14906.96,833.0,37,0.05588,2,2
3,9,1,MUMBAI,4279.22,289.11,41,0.067561,2,1
4,10,1,MOHALI,48429.49,259.0,43,0.005348,2,0


In [None]:
# Remove the leftover index columns from the loaded table.
# The preview shows two of them, 'Unnamed: 0' and 'Unnamed: 0.1'; both look
# like row indices written out by earlier CSV saves rather than customer
# attributes, so neither should reach the feature matrix.
# A non-in-place drop with errors='ignore' keeps this cell idempotent:
# re-running it is a no-op instead of raising KeyError.
data = data.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')

In [None]:
# Separate the predictors from the label.
# 'Cluster' is the target; 'CustLocation' is also left out of the features.
target_col = 'Cluster'
excluded_cols = [target_col, 'CustLocation']
X = data.drop(columns=excluded_cols)
y = data[target_col]

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Univariate chi-squared feature selection.
# The number of features kept is half the column count of `data`
# (integer division).
n_keep = data.shape[1] // 2
selector = SelectKBest(score_func=chi2, k=n_keep)
selector

In [None]:
# Fit the selector on the training split, then translate its boolean support
# mask back into column names to see which features were kept.
X_new = selector.fit(X_train, y_train).transform(X_train)
selected_features_mask = selector.get_support()
selected_columns = X.columns[selected_features_mask]
selected_columns

Index(['CustAccountBalance', 'TransactionAmount', 'Age', 'BalTransRatio'], dtype='object')

In [None]:
# Reduce the held-out features to the columns chosen by the already-fitted
# selector (transform only; the selector is not refit here).
X_test_selected = selector.transform(X=X_test)
X_test_selected

array([[6.54950000e+02, 4.63000000e+02, 3.40000000e+01, 7.06924193e-01],
       [3.76271500e+04, 2.77000000e+02, 4.70000000e+01, 7.36170558e-03],
       [1.27498700e+04, 1.50000000e+02, 3.90000000e+01, 1.17648258e-02],
       ...,
       [1.94617400e+04, 2.60000000e+02, 3.40000000e+01, 1.33595454e-02],
       [1.59742900e+04, 2.41000000e+02, 3.90000000e+01, 1.50867425e-02],
       [1.95587400e+04, 2.00000000e+02, 4.10000000e+01, 1.02256076e-02]])

In [None]:
# Seed for every estimator that has a random component. It matches the
# random_state used for the train/test split so a fresh run reproduces the
# same metrics.
seed = 42

# ROC curves are only drawn for a two-class target. The check does not depend
# on the model, so it is evaluated once instead of on every loop iteration.
is_binary = len(np.unique(y_test)) == 2

# Define models
models = [
    ("LogisticRegression", LogisticRegression()),
    ("DecisionTreeClassifier", DecisionTreeClassifier(random_state=seed)),
    ("RandomForestClassifier", RandomForestClassifier(random_state=seed)),
    ("GradientBoostingClassifier", GradientBoostingClassifier(random_state=seed)),
    ("AdaBoostClassifier", AdaBoostClassifier(random_state=seed)),
    ("KNeighborsClassifier", KNeighborsClassifier()),
    # Probability calibration adds an internal cross-validation pass to SVC
    # fitting; it is only needed when predict_proba is called, i.e. for the
    # ROC curve in the binary case.
    ("SVC", SVC(probability=is_binary, random_state=seed)),
    ("GaussianNB", GaussianNB())
]

# Evaluate models: each classifier is wrapped in a pipeline behind the same
# chi-squared feature selector, fitted on the training split and scored on the
# held-out split.
for name, model in models:
    print(f"Training {name}...")
    pipe = Pipeline([
        ('selector', SelectKBest(score_func=chi2, k=(data.shape[1]//2))),
        ('classifier', model)
    ])

    # Pipeline.fit returns the pipeline itself; `trained` is kept as a name
    # so the last fitted pipeline stays available after the loop.
    trained = pipe.fit(X_train, y_train)
    y_pred = trained.predict(X_test)

    # Evaluate the model
    print(f"\nResults for {name}:")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    print(f"\nAccuracy Score: {accuracy_score(y_test, y_pred)}")

    # Plot ROC curve if applicable (for binary classification)
    if is_binary:
        # Column 1 holds the probability of the second class in classes_ order
        y_proba = trained.predict_proba(X_test)[:, 1]
        fpr, tpr, thresholds = roc_curve(y_test, y_proba)
        fig, ax = plt.subplots()
        ax.plot(fpr, tpr, label=f'ROC curve for {name}')
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title(f'ROC Curve for {name}')
        ax.legend()
        plt.show()
        # Release the figure so one-per-model plots do not accumulate in memory
        plt.close(fig)

Training LogisticRegression...

Results for LogisticRegression:
Confusion Matrix:
[[ 8186     0   851]
 [    0 31432  1189]
 [  995  1003 15123]]

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.91      0.90      9037
           1       0.97      0.96      0.97     32621
           2       0.88      0.88      0.88     17121

    accuracy                           0.93     58779
   macro avg       0.91      0.92      0.92     58779
weighted avg       0.93      0.93      0.93     58779


Accuracy Score: 0.9313019956106773
Training DecisionTreeClassifier...

Results for DecisionTreeClassifier:
Confusion Matrix:
[[ 9037     0     0]
 [    0 32621     0]
 [    1     0 17120]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9037
           1       1.00      1.00      1.00     32621
           2       1.00      1.00      1.00     17121

    accuracy     