In [57]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

### Loading the dataset

In [3]:
df=pd.read_csv("UCI_Credit_Card.csv")

In [5]:
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


### Checking and removing missing values

In [7]:
df.isnull().sum()

ID                            0
LIMIT_BAL                     0
SEX                           0
EDUCATION                     0
MARRIAGE                      0
AGE                           0
PAY_0                         0
PAY_2                         0
PAY_3                         0
PAY_4                         0
PAY_5                         0
PAY_6                         0
BILL_AMT1                     0
BILL_AMT2                     0
BILL_AMT3                     0
BILL_AMT4                     0
BILL_AMT5                     0
BILL_AMT6                     0
PAY_AMT1                      0
PAY_AMT2                      0
PAY_AMT3                      0
PAY_AMT4                      0
PAY_AMT5                      0
PAY_AMT6                      0
default.payment.next.month    0
dtype: int64

In [9]:
df_cleaned=df.dropna()
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ID                          30000 non-null  int64  
 1   LIMIT_BAL                   30000 non-null  float64
 2   SEX                         30000 non-null  int64  
 3   EDUCATION                   30000 non-null  int64  
 4   MARRIAGE                    30000 non-null  int64  
 5   AGE                         30000 non-null  int64  
 6   PAY_0                       30000 non-null  int64  
 7   PAY_2                       30000 non-null  int64  
 8   PAY_3                       30000 non-null  int64  
 9   PAY_4                       30000 non-null  int64  
 10  PAY_5                       30000 non-null  int64  
 11  PAY_6                       30000 non-null  int64  
 12  BILL_AMT1                   30000 non-null  float64
 13  BILL_AMT2                   300

### Data scaling and Feature selection

In [20]:
X = df.drop(columns=['ID', 'default.payment.next.month'])
y = df['default.payment.next.month']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

selector = SelectKBest(score_func=f_classif, k=10)
X_new = selector.fit_transform(X_scaled, y)

### Train-Test data splitting

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=42)

### Applying models

In [26]:
models = {
    'Decision Tree': DecisionTreeClassifier(),
    'SVM': SVC(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'KNN': KNeighborsClassifier()
}

### Evaluating performance

In [29]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

results = {}
for model_name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    # Make predictions
    y_pred = model.predict(X_test)
    # Evaluate accuracy, precision, and recall
    results[model_name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred)
    }

### Printing results

In [30]:
for model_name, metrics in results.items():
    print(f"{model_name}:")
    for metric_name, value in metrics.items():
        print(f"  {metric_name}: {value:.4f}")
    print()

Decision Tree:
  Accuracy: 0.7310
  Precision: 0.3834
  Recall: 0.3770

SVM:
  Accuracy: 0.8212
  Precision: 0.6739
  Recall: 0.3542

Logistic Regression:
  Accuracy: 0.8097
  Precision: 0.7070
  Recall: 0.2224

KNN:
  Accuracy: 0.7937
  Precision: 0.5423
  Recall: 0.3663



### Applying GridSearch CV and printing the results

In [None]:
# Data scaler and feature selector
scaler = StandardScaler()
select_kbest = SelectKBest(score_func=f_classif, k=10)

# Define models
models = {
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'SVM': SVC(),
    'Logistic Regression': LogisticRegression(max_iter=1000)
}

# Hyperparameter grids
param_grids = {
    'knn': {
        'knn__n_neighbors': [3, 5, 7, 9],
        'knn__weights': ['uniform', 'distance'],
        'knn__metric': ['euclidean', 'manhattan']
    },
    'decisiontree': {
        'decisiontree__max_depth': [5, 10, 15, 20],
        'decisiontree__min_samples_split': [2, 5, 10]
    },
    'svm': {
        'svm__C': [0.1, 1, 10],
        'svm__kernel': ['linear', 'rbf']
    },
    'logisticregression': {
        'lr__C': [0.1, 1, 10],
        'lr__penalty': ['l2']
    }
}

# Loop through models and apply GridSearchCV
for model_name, model in models.items():
    print(f"\nRunning GridSearchCV for {model_name}")

    # Create a pipeline with scaling, feature selection, and model
    pipeline = Pipeline([
        ('scaler', scaler),
        ('select_kbest', select_kbest),
        (model_name.lower().replace(' ', ''), model)  # Ensure lowercase and no space
    ])

    # Define parameter grid for the current model
    param_grid = param_grids[model_name.lower().replace(' ', '')]

    # Perform grid search
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

    # Fit the model
    grid_search.fit(X_train, y_train)

    # Output best parameters and cross-validation score
    print(f"Best hyperparameters for {model_name}: {grid_search.best_params_}")
    print(f"Best cross-validation score for {model_name}: {grid_search.best_score_:.4f}")

    # Predict on the test set
    y_pred = grid_search.predict(X_test)

    # Output classification report
    print(f"Classification Report for {model_name}:\n{classification_report(y_test, y_pred)}")


Running GridSearchCV for KNN
Best hyperparameters for KNN: {'knn__metric': 'euclidean', 'knn__n_neighbors': 9, 'knn__weights': 'uniform'}
Best cross-validation score for KNN: 0.8085
Classification Report for KNN:
              precision    recall  f1-score   support

           0       0.84      0.93      0.88      4687
           1       0.59      0.36      0.45      1313

    accuracy                           0.81      6000
   macro avg       0.72      0.65      0.67      6000
weighted avg       0.78      0.81      0.79      6000


Running GridSearchCV for Decision Tree
Best hyperparameters for Decision Tree: {'decisiontree__max_depth': 5, 'decisiontree__min_samples_split': 2}
Best cross-validation score for Decision Tree: 0.8211
Classification Report for Decision Tree:
              precision    recall  f1-score   support

           0       0.84      0.95      0.89      4687
           1       0.67      0.36      0.47      1313

    accuracy                           0.82      60