In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# data manip
import pandas as pd
import numpy as np
# viz
import matplotlib.pyplot as plt
import seaborn as sns

# scikit-learn
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import make_pipeline

# Classification metrics
from sklearn.metrics import confusion_matrix, classification_report

# Function for creating model pipelines - sklearn
from sklearn.pipeline import make_pipeline

# Function for creating model pipelines - imblearn
from imblearn.pipeline import make_pipeline as imbl_pipe

# Over-sampling using SMOTE
from imblearn.over_sampling import SMOTE

In [2]:
# Load the customer dataset from disk and preview its first five rows
data = pd.read_csv(filepath_or_buffer='../Data/data.csv', delimiter=',')
data.head(5)

Unnamed: 0,customer_id,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,15634602,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,15647311,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,15619304,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,15701354,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,15737888,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
# Remove the customer identifier column before modelling
data = data.drop(columns=['customer_id'])

## Model Training

In [4]:
# Split the dataframe into a target vector and a feature matrix

# Target: the churn flag
y = data['churn']

# Features: every column except the target
X = data.drop(columns=['churn'])

# Show the shapes of X and y
print(X.shape, y.shape)

(10000, 10) (10000,)


In [5]:
# Names of all numeric feature columns
num_columns = list(X.select_dtypes(include='number').columns)
num_columns

['credit_score',
 'age',
 'tenure',
 'balance',
 'products_number',
 'credit_card',
 'active_member',
 'estimated_salary']

In [6]:
# Names of all categorical (object-typed) feature columns
cat_columns = list(X.select_dtypes(include='object').columns)
cat_columns

['country', 'gender']

## Create a Train Test Split

In [7]:
random_state = 10

# Hold out 30% of the rows for testing, stratified on the churn target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=random_state,
)

# Report the number of observations in X_train, X_test, y_train, and y_test
print(*(len(part) for part in (X_train, X_test, y_train, y_test)))

7000 3000 7000 3000


In [8]:
# Inspect column dtypes and non-null counts of the training features
X_train.info()


<class 'pandas.core.frame.DataFrame'>
Index: 7000 entries, 8061 to 4741
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   credit_score      7000 non-null   int64  
 1   country           7000 non-null   object 
 2   gender            7000 non-null   object 
 3   age               7000 non-null   int64  
 4   tenure            7000 non-null   int64  
 5   balance           7000 non-null   float64
 6   products_number   7000 non-null   int64  
 7   credit_card       7000 non-null   int64  
 8   active_member     7000 non-null   int64  
 9   estimated_salary  7000 non-null   float64
dtypes: float64(2), int64(6), object(2)
memory usage: 601.6+ KB


## Pre-processing Pipeline

### Scale numerical data and encode categorical data

In [9]:
# Positional indices of the numeric columns within X
num_features = [X.columns.get_loc(name) for name in num_columns]
print(num_features)

[0, 3, 4, 5, 6, 7, 8, 9]


In [10]:
# Positional indices of the categorical columns within X
cat_features = [X.columns.get_loc(name) for name in cat_columns]
print(cat_features)

[1, 2]


In [11]:

# Column transformer: min-max scale the numeric columns and one-hot encode
# the categorical ones (dense output)
numeric_step = (MinMaxScaler(), num_features)
categorical_step = (OneHotEncoder(sparse_output=False), cat_features)
preprocess = make_column_transformer(numeric_step, categorical_step)

print(preprocess)

ColumnTransformer(transformers=[('minmaxscaler', MinMaxScaler(),
                                 [0, 3, 4, 5, 6, 7, 8, 9]),
                                ('onehotencoder',
                                 OneHotEncoder(sparse_output=False), [1, 2])])


In [12]:
# Classifier used as the final pipeline step
from sklearn.linear_model import LogisticRegression

# Pipeline: preprocess -> oversample with SMOTE -> logistic regression.
# Step names are derived from the class names, so the grid-search keys
# prefixed with `logisticregression__` keep working.
oversampler = SMOTE(sampling_strategy='auto', random_state=random_state)
classifier = LogisticRegression(random_state=random_state)
model = imbl_pipe(preprocess, oversampler, classifier)

model

In [13]:
 # Create the GridSearchCV model
# Build the grid-search estimator together with the hyper-parameter values to try
from sklearn.model_selection import GridSearchCV

lr_param_grid = {
    'logisticregression__C': [0.01, 0.05, 0.1, 0.5, 1, 5],
    'logisticregression__solver': ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'],
}

# Five-fold cross-validation scored on accuracy; verbose=3 logs every fit
lr_grid = GridSearchCV(
    estimator=model,
    param_grid=lr_param_grid,
    scoring='accuracy',
    cv=5,
    verbose=3,
)

In [14]:
# Convert the feature frames to NumPy arrays (columns are addressed by
# position in the preprocessing step).
# np.asarray returns an existing ndarray unchanged, so re-running this cell is
# harmless; the previous `.values` attribute access raised AttributeError on a
# second run because X_train / X_test had already been rebound to arrays.
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)

In [15]:
# Show the training features after conversion to a NumPy array
X_train

array([[477, 'Spain', 'Male', ..., 0, 1, 184061.17],
       [606, 'Spain', 'Male', ..., 1, 1, 1914.41],
       [793, 'France', 'Male', ..., 0, 0, 83997.79],
       ...,
       [646, 'Germany', 'Male', ..., 1, 0, 45041.32],
       [700, 'France', 'Female', ..., 1, 1, 174971.64],
       [651, 'France', 'Male', ..., 1, 0, 23054.51]], dtype=object)

In [16]:
# Run the grid search on the training split
# (6 C values x 5 solvers = 30 candidates, each fitted on 5 folds)
lr_grid.fit(X_train, y_train)


Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 1/5] END logisticregression__C=0.01, logisticregression__solver=liblinear;, score=0.663 total time=   0.0s


[CV 2/5] END logisticregression__C=0.01, logisticregression__solver=liblinear;, score=0.685 total time=   0.0s
[CV 3/5] END logisticregression__C=0.01, logisticregression__solver=liblinear;, score=0.681 total time=   0.0s
[CV 4/5] END logisticregression__C=0.01, logisticregression__solver=liblinear;, score=0.688 total time=   0.0s
[CV 5/5] END logisticregression__C=0.01, logisticregression__solver=liblinear;, score=0.687 total time=   0.0s
[CV 1/5] END logisticregression__C=0.01, logisticregression__solver=newton-cg;, score=0.663 total time=   0.0s
[CV 2/5] END logisticregression__C=0.01, logisticregression__solver=newton-cg;, score=0.686 total time=   0.0s
[CV 3/5] END logisticregression__C=0.01, logisticregression__solver=newton-cg;, score=0.680 total time=   0.0s
[CV 4/5] END logisticregression__C=0.01, logisticregression__solver=newton-cg;, score=0.688 total time=   0.0s
[CV 5/5] END logisticregression__C=0.01, logisticregression__solver=newton-cg;, score=0.689 total time=   0.0s
[



[CV 1/5] END logisticregression__C=5, logisticregression__solver=saga;, score=0.718 total time=   0.2s




[CV 2/5] END logisticregression__C=5, logisticregression__solver=saga;, score=0.713 total time=   0.2s




[CV 3/5] END logisticregression__C=5, logisticregression__solver=saga;, score=0.719 total time=   0.3s




[CV 4/5] END logisticregression__C=5, logisticregression__solver=saga;, score=0.718 total time=   0.1s
[CV 5/5] END logisticregression__C=5, logisticregression__solver=saga;, score=0.731 total time=   0.1s




In [17]:
# Parameter combination selected by the grid search
print(lr_grid.best_params_)

{'logisticregression__C': 5, 'logisticregression__solver': 'liblinear'}


In [18]:
# Cross-validated score (scoring='accuracy') of the selected parameter combination
print(lr_grid.best_score_)

0.7197142857142858


In [19]:
# Score the tuned model on the training split and on the held-out split
train_score = lr_grid.score(X_train, y_train)
test_score = lr_grid.score(X_test, y_test)
print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

Training Data Score: 0.722
Testing Data Score: 0.707


* The training and testing scores are close, which suggests the model is not overfitting.

In [20]:
# Predict churn for the held-out rows and compare the first ten with the truth
predictions = lr_grid.predict(X_test)
first_predicted = predictions[:10]
first_actual = y_test.head(10).tolist()
print(f"First 10 Predictions:   {first_predicted}")
print(f"First 10 Actual labels: {first_actual}")

First 10 Predictions:   [0 0 0 0 1 1 0 0 0 0]
First 10 Actual labels: [1, 0, 0, 0, 0, 1, 0, 0, 0, 0]


In [21]:
# Side-by-side table of predicted and actual labels on a fresh 0..n-1 index
pd.DataFrame({"Prediction": predictions, "Actual": y_test.to_numpy()})

Unnamed: 0,Prediction,Actual
0,0,1
1,0,0
2,0,0
3,0,0
4,1,0
...,...,...
2995,0,0
2996,0,0
2997,0,0
2998,0,0


In [22]:
# Confusion matrix of true labels against predicted labels
cm = confusion_matrix(y_true=y_test, y_pred=predictions)
print(cm)

[[1708  681]
 [ 198  413]]


In [23]:
# Divide every row by its total so each row sums to 1, rounded to two decimals
row_totals = cm.sum(axis=1, keepdims=True)
cm = np.round(cm / row_totals, 2)
print(cm)

[[0.71 0.29]
 [0.32 0.68]]


In [24]:
# Per-class precision, recall and F1 on the test split
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.90      0.71      0.80      2389
           1       0.38      0.68      0.48       611

    accuracy                           0.71      3000
   macro avg       0.64      0.70      0.64      3000
weighted avg       0.79      0.71      0.73      3000



In [25]:
# Predict the class of the first test row ([:1] keeps the 2-D shape predict expects)
pred = lr_grid.predict(X_test[:1])

In [26]:
# Compare the single prediction with the true label.
# .iloc makes the positional slice explicit, and .tolist() converts the label
# to a plain Python int so it prints as [1] on every NumPy version
# (list() keeps NumPy scalars, which NumPy >= 2 renders as np.int64(1)).
print(f"Predicted classes: {pred}")
print(f"Actual Labels: {y_test.iloc[:1].tolist()}")

Predicted classes: [0]
Actual Labels: [1]


In [27]:
import joblib
from pathlib import Path

# Persist the fitted grid-search object to disk
filename = '../models/LR.sav'

# Make sure the target directory exists; joblib.dump does not create it and
# would fail on a fresh checkout without a ../models folder
Path(filename).parent.mkdir(parents=True, exist_ok=True)

joblib.dump(lr_grid, filename)

['../models/LR.sav']

In [28]:
# Reload the persisted object and score it on the test split.
# NOTE: joblib.load unpickles the file, so only load files you trust.
lr_model = joblib.load(filename)
reloaded_score = lr_model.score(X_test, y_test)
print(reloaded_score)

0.707


## Predict class for new data

In [29]:
# Use the first X_test record as stand-in "new" data
# (sliced with [:1] so it stays a 2-D, single-row array)
X_test[:1]

array([[638, 'France', 'Male', 36, 6, 188455.19, 1, 0, 0, 47031.4]],
      dtype=object)

In [30]:

# Predict the class of the single "new" record with the fitted grid search
pred_new = lr_grid.predict(X_test[:1])

In [31]:
# Compare the prediction for the new record with its true label.
# .iloc makes the positional slice explicit, and .tolist() converts the label
# to a plain Python int so it prints as [1] on every NumPy version
# (list() keeps NumPy scalars, which NumPy >= 2 renders as np.int64(1)).
print(f"Predicted classes: {pred_new}")
print(f"Actual Labels: {y_test.iloc[:1].tolist()}")

Predicted classes: [0]
Actual Labels: [1]
