# classification example

## problem statement
- predict whether a customer will leave the company (i.e. churn)

### reading material

https://scikit-learn.org/1.3/tutorial/machine_learning_map/

### import required packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn

### load the data

In [2]:
# read the churn dataset from the CSV file (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer='Churn.csv')

# preview the first five rows
df.head(n=5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
# list the column labels of the loaded frame
df.columns

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

In [4]:
# summarise the frame: row count, per-column non-null counts and dtypes, memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [5]:
# remove the columns that are not used as model features
# - reassign instead of inplace=True so the step reads as "new frame from old frame"
# - errors='ignore' keeps the cell re-runnable: on a second run the columns are
#   already gone and a plain drop would raise KeyError
df = df.drop(columns=['RowNumber', 'CustomerId', 'Surname'], errors='ignore')

In [6]:
from sklearn.preprocessing import LabelEncoder

# one encoder per text column; each fitted encoder stays available under its own name
label_encoder_gender = LabelEncoder()
label_encoder_geography = LabelEncoder()

# replace the text categories in each column with integer codes
# NOTE(review): the Geography codes are arbitrary integers, yet models such as
# logistic regression / SVM treat them as ordered - consider one-hot encoding
for column, encoder in (('Gender', label_encoder_gender),
                        ('Geography', label_encoder_geography)):
    df[column] = encoder.fit_transform(df[column])

In [7]:
# preview the first rows again to check the encoded Gender / Geography columns
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,0,0,42,2,0.0,1,1,1,101348.88,1
1,608,2,0,41,1,83807.86,1,0,1,112542.58,0
2,502,0,0,42,8,159660.8,3,1,0,113931.57,1
3,699,0,0,39,1,0.0,2,0,0,93826.63,0
4,850,2,0,43,2,125510.82,1,1,1,79084.1,0


### split the data

In [8]:
# target: the Exited flag
y = df['Exited']

# features: every remaining column
x = df.drop(columns=['Exited'])

In [9]:
from sklearn.model_selection import train_test_split

# keep 70% of the rows for training and hold out the rest for testing;
# the fixed random_state makes the split reproducible
# NOTE(review): the split is not stratified on y - confirm that is intended
split_parts = train_test_split(
    x,
    y,
    train_size=0.7,
    random_state=123456,
)
x_train, x_test, y_train, y_test = split_parts

### build decision tree model

In [10]:
from sklearn.tree import DecisionTreeClassifier

# create empty model
# random_state is fixed (same seed as the train/test split): the tree permutes
# features at every split and breaks ties between equally good splits at random,
# so without a seed each fresh run can give a different tree and different scores
model_dt = DecisionTreeClassifier(random_state=123456)

# train the model on the training split
model_dt.fit(x_train, y_train)

### build logistic regression model

In [11]:
from sklearn.linear_model import LogisticRegressionCV

# cross-validated logistic regression, allowed up to 1000 solver iterations
model_lr = LogisticRegressionCV(max_iter=1000)

# fit on the training split
model_lr.fit(X=x_train, y=y_train)

### build random forest model

In [12]:
from sklearn.ensemble import RandomForestClassifier

# create empty model with 100 trees
# random_state is fixed (same seed as the train/test split): the forest draws
# bootstrap samples and feature subsets at random, so without a seed the
# reported metrics change on every fresh run
model_rf = RandomForestClassifier(n_estimators=100, random_state=123456)

# train the model on the training split
model_rf.fit(x_train, y_train)

### build XGB model

In [13]:
# install xgboost with the %pip magic, which installs into the running kernel's environment
# NOTE(review): no version is pinned here - consider pinning one for reproducible re-runs
%pip install xgboost


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/opt/homebrew/opt/python@3.10/bin/python3.10 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [14]:
from xgboost import XGBClassifier

# gradient-boosted tree classifier with the library's default settings
model_xgb = XGBClassifier()

# fit on the training split
model_xgb.fit(X=x_train, y=y_train)

### build SVM model

In [15]:
from sklearn.svm import SVC

# support vector classifier using the sigmoid kernel
model_svc = SVC(kernel="sigmoid")

# fit on the training split
model_svc.fit(X=x_train, y=y_train)

### tuning the hyperparameters (to optimize the model)

In [None]:
from sklearn.model_selection import GridSearchCV

# create parameters for searching the best combinations
# (every kernel is tried with every value of C)
parameters = {'kernel':('linear', 'rbf', 'sigmoid'), 'C':[1, 2]}

# create object GridSearch
# (the search cross-validates clones of model_svc, so the already-fitted
# model_svc itself is left untouched)
grid_search = GridSearchCV(model_svc, parameters)

# fit the values
grid_search.fit(x_train, y_train)

# report the outcome of the search - without this the tuning result is never
# shown: the best parameter combination and its mean cross-validated score
grid_search.best_params_, grid_search.best_score_

### model evaluation

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(model_name, model, features=None, target=None):
    """Score a fitted classifier on held-out data.

    Parameters
    ----------
    model_name : str
        Label used for the model in the results table.
    model : fitted estimator
        Must provide ``predict(X)`` and ``score(X, y)``.
    features, target : optional
        Evaluation inputs and their true labels. When omitted, the
        notebook-level ``x_test`` / ``y_test`` are used, which is what the
        existing two-argument calls rely on.

    Returns
    -------
    tuple
        ``(model_name, score, accuracy, precision, recall, f1)``.
    """
    # fall back to the notebook's test split when no data is passed explicitly
    if features is None:
        features = x_test
    if target is None:
        target = y_test

    y_pred = model.predict(features)

    # model.score() =>
    # - accuracy for classification model
    # - r2 score for regression model
    score = model.score(features, target)
    accuracy = accuracy_score(target, y_pred)

    # zero_division=0: when a model never predicts (or the data never contains)
    # the positive class the metric is undefined; report 0 explicitly instead of
    # relying on the default, which returns 0 but also raises a warning
    precision = precision_score(target, y_pred, zero_division=0)
    recall = recall_score(target, y_pred, zero_division=0)
    f1 = f1_score(target, y_pred, zero_division=0)

    return (model_name, score, accuracy, precision, recall, f1)

In [None]:
# (display name, fitted model) pairs, in the order they should appear in the table
models_to_evaluate = [
    ("Logistic Regression", model_lr),
    ("Decision Tree", model_dt),
    ("Random Forest", model_rf),
    ("XG Boost", model_xgb),
    ("Support Vector Machine", model_svc),
]

# one metrics tuple per model
results = [evaluate_model(name, model) for name, model in models_to_evaluate]

In [None]:
# one row per evaluated model, one column per reported value
result_columns = ["Algorithm", "Score",  "Accuracy", "Precision", "Recall", "F1"]
result_df = pd.DataFrame(data=results, columns=result_columns)
result_df