In [119]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score,f1_score

In [98]:
pip install xgboost catboost

Note: you may need to restart the kernel to use updated packages.


In [99]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [100]:
df=pd.read_csv('churn_data.csv')

In [101]:
df.head()

Unnamed: 0,CustomerID,Age,Gender,MaritalStatus,IncomeLevel,LastLoginDate,LoginFrequency,ServiceUsage,ChurnStatus
0,1,62,M,Single,Low,2023-10-21,34,Mobile App,0
1,2,65,M,Married,Low,2023-12-05,5,Website,1
2,3,18,M,Single,Low,2023-11-15,3,Website,0
3,4,21,M,Widowed,Low,2023-08-25,2,Website,0
4,5,21,M,Divorced,Medium,2023-10-27,41,Website,0


In [102]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   CustomerID      1000 non-null   int64 
 1   Age             1000 non-null   int64 
 2   Gender          1000 non-null   object
 3   MaritalStatus   1000 non-null   object
 4   IncomeLevel     1000 non-null   object
 5   LastLoginDate   1000 non-null   object
 6   LoginFrequency  1000 non-null   int64 
 7   ServiceUsage    1000 non-null   object
 8   ChurnStatus     1000 non-null   int64 
dtypes: int64(4), object(5)
memory usage: 70.4+ KB


In [103]:
df.drop('CustomerID', axis=1, inplace=True)

In [104]:
df.drop('LastLoginDate', axis=1, inplace=True)

In [105]:
df.head()

Unnamed: 0,Age,Gender,MaritalStatus,IncomeLevel,LoginFrequency,ServiceUsage,ChurnStatus
0,62,M,Single,Low,34,Mobile App,0
1,65,M,Married,Low,5,Website,1
2,18,M,Single,Low,3,Website,0
3,21,M,Widowed,Low,2,Website,0
4,21,M,Divorced,Medium,41,Website,0


In [106]:
X = df.drop(['ChurnStatus'],axis=1)
y = df['ChurnStatus']

In [107]:
X.head()

Unnamed: 0,Age,Gender,MaritalStatus,IncomeLevel,LoginFrequency,ServiceUsage
0,62,M,Single,Low,34,Mobile App
1,65,M,Married,Low,5,Website
2,18,M,Single,Low,3,Website
3,21,M,Widowed,Low,2,Website
4,21,M,Divorced,Medium,41,Website


In [108]:
y.head()

0    0
1    1
2    0
3    0
4    0
Name: ChurnStatus, dtype: int64

In [109]:
custom_mapping = {'Low': 0, 'Medium': 1, 'High': 2}
X['IncomeLevel'] = X['IncomeLevel'].map(custom_mapping)

In [110]:
X.head()

Unnamed: 0,Age,Gender,MaritalStatus,IncomeLevel,LoginFrequency,ServiceUsage
0,62,M,Single,0,34,Mobile App
1,65,M,Married,0,5,Website
2,18,M,Single,0,3,Website
3,21,M,Widowed,0,2,Website
4,21,M,Divorced,1,41,Website


In [111]:
X.isnull().sum()

Age               0
Gender            0
MaritalStatus     0
IncomeLevel       0
LoginFrequency    0
ServiceUsage      0
dtype: int64

In [112]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Define categorical and numerical columns
cat_features = ['Gender', 'MaritalStatus', 'ServiceUsage']
num_features = ['Age', 'LoginFrequency', 'IncomeLevel']

# Create the transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop=None, sparse_output=False), cat_features),
        ('num', StandardScaler(), num_features)
    ],
    remainder='passthrough'
)

# Fit and transform your data
X_processed = preprocessor.fit_transform(X)

# Get feature names (scikit-learn >=1.0)
feature_names = preprocessor.get_feature_names_out()

# Convert to DataFrame
X = pd.DataFrame(X_processed, columns=feature_names)

In [113]:
X.head()

Unnamed: 0,cat__Gender_F,cat__Gender_M,cat__MaritalStatus_Divorced,cat__MaritalStatus_Married,cat__MaritalStatus_Single,cat__MaritalStatus_Widowed,cat__ServiceUsage_Mobile App,cat__ServiceUsage_Online Banking,cat__ServiceUsage_Website,num__Age,num__LoginFrequency,num__IncomeLevel
0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.229628,0.575702,-1.247831
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.426547,-1.488513,-1.247831
2,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,-1.658518,-1.630872,-1.247831
3,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,-1.461599,-1.702052,-1.247831
4,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.461599,1.073961,-0.029246


In [114]:
from sklearn.model_selection import train_test_split

# Train-test split: 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Confirm shapes
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


X_train shape: (800, 12)
X_test shape: (200, 12)
y_train shape: (800,)
y_test shape: (200,)


In [121]:
# Dictionary of models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, class_weight='balanced'),
    'Decision Tree': DecisionTreeClassifier(random_state=42, class_weight='balanced'),
    'Random Forest': RandomForestClassifier(random_state=42, class_weight='balanced'),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Support Vector Machine': SVC(probability=True, class_weight='balanced'),
    'k-Nearest Neighbors': KNeighborsClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    'CatBoost': CatBoostClassifier(verbose=0)
}


In [127]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import cross_val_score

results = {}
for name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on test set
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None
    
    # Calculate metrics
    metrics = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1-Score': f1_score(y_test, y_pred),
        'ROC-AUC': roc_auc_score(y_test, y_pred_proba) if y_pred_proba is not None else None
    }
    
    # Perform cross-validation
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1')
    metrics['Cross-Validation F1 (Mean)'] = cv_scores.mean()
    
    results[name] = metrics

# Display results
results_df = pd.DataFrame(results).T
print(results_df)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


                        Accuracy  Precision  Recall  F1-Score   ROC-AUC  \
Logistic Regression        0.530   0.225000    0.36  0.276923  0.489600   
Decision Tree              0.660   0.295455    0.26  0.276596  0.526667   
Random Forest              0.725   0.142857    0.02  0.035088  0.583867   
Gradient Boosting          0.725   0.222222    0.04  0.067797  0.558333   
Support Vector Machine     0.610   0.310811    0.46  0.370968  0.575200   
k-Nearest Neighbors        0.730   0.300000    0.06  0.100000  0.541067   
XGBoost                    0.730   0.400000    0.16  0.228571  0.582933   
CatBoost                   0.745   0.333333    0.02  0.037736  0.569733   

                        Cross-Validation F1 (Mean)  
Logistic Regression                       0.295947  
Decision Tree                             0.249642  
Random Forest                             0.095699  
Gradient Boosting                         0.116695  
Support Vector Machine                    0.279473  
k-Near

In [134]:
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV

# Define the model
model = CatBoostClassifier(verbose=0)  # verbose=0 suppresses training logs

# Define the parameter grid to search
param_grid = {
    'iterations': [100, 200],
    'depth': [4, 6, 8],
    'learning_rate': [0.01, 0.1]
}

# Set up GridSearchCV
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,  # 5-fold cross-validation
    scoring='f1',  # Optimize for F1-Score (adjust based on your goal)
    n_jobs=-1  # Use all available CPU cores
)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best model and parameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
print("Best parameters:", best_params)

# Evaluate on test data (if you have X_test_scaled and y_test)
y_pred = best_model.predict(X_test)
from sklearn.metrics import f1_score
print("F1-Score on test data:", f1_score(y_test, y_pred))

Best parameters: {'depth': 6, 'iterations': 200, 'learning_rate': 0.1}
F1-Score on test data: 0.03508771929824561


In [135]:
model = CatBoostClassifier(depth=6, iterations=200, learning_rate=0.1, verbose=0, scale_pos_weight=10)
model.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x26841220ad0>

In [137]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Predict on test data
y_pred = model.predict(X_test)  # Ensure X_test_scaled is used, not X_test

# Calculate and print metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))

Accuracy: 0.63
Precision: 0.29310344827586204
Recall: 0.34


In [132]:
feature_importance = model.get_feature_importance()
print(pd.DataFrame({'Feature': X.columns, 'Importance': feature_importance}).sort_values(by='Importance', ascending=False))

                             Feature  Importance
10               num__LoginFrequency   28.174453
9                           num__Age   26.776819
11                  num__IncomeLevel   11.530606
0                      cat__Gender_F    4.274755
1                      cat__Gender_M    4.268706
6       cat__ServiceUsage_Mobile App    4.249796
2        cat__MaritalStatus_Divorced    4.097010
3         cat__MaritalStatus_Married    3.633469
7   cat__ServiceUsage_Online Banking    3.441703
5         cat__MaritalStatus_Widowed    3.366644
8          cat__ServiceUsage_Website    3.220388
4          cat__MaritalStatus_Single    2.965650
