**Handle imbalanced data in churn prediction. Logistic Regression**

In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
%matplotlib inline

In [2]:
import warnings
# Suppress every warning for the rest of the session to keep the cell outputs compact.
# NOTE(review): this also hides library deprecation warnings — consider a narrower filter.
warnings.filterwarnings('ignore')

**Load the data**


In [3]:
# Load the churn dataset from a CSV path relative to the working directory.
df = pd.read_csv("Churn_Modelling.csv")
# Seeded so the 5-row preview is identical on every Restart & Run All.
df.sample(5, random_state=15)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
7766,7767,15647259,Barnett,643,Spain,Male,35,2,0.0,2,0,0,67979.35,0
6467,6468,15641782,Humphries,540,France,Female,31,7,0.0,1,0,1,183051.6,1
3192,3193,15566689,Chimaoke,554,Spain,Male,66,8,0.0,2,1,1,116747.62,0
9213,9214,15672216,Uvarov,584,France,Female,40,4,82441.75,1,0,0,80852.11,0
2718,2719,15783444,Endrizzi,788,France,Female,39,3,135139.33,1,0,1,113086.08,0


In [4]:
# Row number, customer id and surname are removed before modelling.
col_to_drop = ['RowNumber', 'CustomerId', 'Surname']
df1 = df.drop(columns=col_to_drop)

In [5]:
# One-hot encode Geography; drop_first=True omits the first level,
# so k categories produce k-1 indicator columns.
col_for_dummy = ['Geography']
df2 = pd.get_dummies(
    df1,
    columns=col_for_dummy,
    drop_first=True,
)

In [6]:
# Encode Gender as 1 (Female) / 0 (Male).
# The result is assigned back instead of calling replace(..., inplace=True) on the
# column selection: in-place mutation of a selected column is deprecated and does
# nothing once pandas Copy-on-Write is active. The explicit astype keeps the column
# integer-typed regardless of how replace() infers the result dtype, and fails loudly
# if a label outside the mapping is present. Re-running the cell is a no-op.
df2['Gender'] = df2['Gender'].replace({'Female': 1, 'Male': 0}).astype('int64')

In [7]:
#df2['Exited'].replace({0:'Not_exited',1:'Exited'},inplace=True)

In [8]:
df2['Exited'].value_counts(normalize=True)

0    0.7963
1    0.2037
Name: Exited, dtype: float64

In [9]:
df2.dtypes

CreditScore            int64
Gender                 int64
Age                    int64
Tenure                 int64
Balance              float64
NumOfProducts          int64
HasCrCard              int64
IsActiveMember         int64
EstimatedSalary      float64
Exited                 int64
Geography_Germany      uint8
Geography_Spain        uint8
dtype: object

In [10]:
df2.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain
0,619,1,42,2,0.0,1,1,1,101348.88,1,0,0
1,608,1,41,1,83807.86,1,0,1,112542.58,0,0,1
2,502,1,42,8,159660.8,3,1,0,113931.57,1,0,0
3,699,1,39,1,0.0,2,0,0,93826.63,0,0,0
4,850,1,43,2,125510.82,1,1,1,79084.1,0,0,1


**Scaling your data**

In [11]:
from sklearn.preprocessing import MinMaxScaler
# Min-max scale the continuous columns (MinMaxScaler's default output range is [0, 1]).
# NOTE(review): the scaler is fitted on the whole table, before any train/test split,
# so test rows influence the learned min/max — consider fitting on the training split only.
scaler=MinMaxScaler()
col_to_scale=['CreditScore','Age','Balance','EstimatedSalary']
# Overwrites the four listed columns of df2 with their scaled values.
df2[col_to_scale]=scaler.fit_transform(df2[col_to_scale])

**Split data into train and test sets**

In [12]:
# Separate the churn label from the feature matrix.
y = df2['Exited']
X = df2.drop(columns=['Exited'])

In [13]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split; stratify=y keeps the class ratio the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=15,
    stratify=y,
)

**Logistic Regression**

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [15]:
def log_reg(X_train, y_train, X_test, y_test, weights):
    """Fit a logistic regression and print its accuracy, first predictions and report.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels the metrics are computed on.
    weights : ``-1`` (or ``None``) for an unweighted model; otherwise an indexable
        pair ``[w0, w1]`` used as the class weights for labels 0 and 1.

    Returns
    -------
    None. Everything is printed, so calling this as the last line of a cell adds no
    extra ``Out[...]`` display.
    """
    # Only compare against the sentinel when `weights` is a scalar: `array == -1`
    # is element-wise and would raise inside `if` for NumPy arrays or Series.
    use_default = weights is None or (np.isscalar(weights) and weights == -1)

    if use_default:
        model = LogisticRegression()
    else:
        model = LogisticRegression(class_weight={0: weights[0], 1: weights[1]})

    model.fit(X_train, y_train)
    acc = model.score(X_test, y_test)
    print("Accuracy", acc, "\n")

    y_pred = model.predict(X_test)
    print("preds", y_pred[:5], "\n")

    cl_rep = classification_report(y_test, y_pred)
    print(cl_rep)

In [16]:
weights = -1 # pass -1 to use Logistic Regression without class weights
log_reg(X_train, y_train, X_test, y_test, weights)

Accuracy 0.8125 

preds [0 0 0 0 0] 

              precision    recall  f1-score   support

           0       0.83      0.97      0.89      1593
           1       0.62      0.21      0.31       407

    accuracy                           0.81      2000
   macro avg       0.72      0.59      0.60      2000
weighted avg       0.78      0.81      0.77      2000



In [17]:
# weights = [1, 1.5] # class weights [class 0, class 1]; pass -1 to use Logistic Regression without weights
# log_reg(X_train, y_train, X_test, y_test, weights)

**Mitigating Skewness of Data**

**Method1: Undersampling**

In [19]:
# Divide by class
df_class_0 = df2[df2['Exited'] == 0]
df_class_1 = df2[df2['Exited'] == 1]

# Class count
# Taken from the partitioned frames themselves rather than by tuple-unpacking
# df1.Exited.value_counts(): that relied on value_counts() ordering by frequency
# (class 0 happening to be the larger class) and read a different frame (df1)
# than the one being partitioned (df2).
count_class_0, count_class_1 = len(df_class_0), len(df_class_1)

In [20]:
# Undersample 0-class and concat the DataFrames of both class
# random_state pins which majority rows are drawn, so the balanced table
# (and every metric computed from it) is reproducible across runs.
df_class_0_under = df_class_0.sample(n=count_class_1, random_state=15)
df_test_under = pd.concat([df_class_0_under, df_class_1], axis=0)

print('Random under-sampling:')
print(df_test_under.Exited.value_counts())

Random under-sampling:
0    2037
1    2037
Name: Exited, dtype: int64


In [21]:
# Features and labels of the undersampled (balanced) table.
X = df_test_under.drop('Exited',axis='columns')
y = df_test_under['Exited']

from sklearn.model_selection import train_test_split
# NOTE(review): the split is taken from the already-undersampled table, so the test part
# is balanced as well; its scores are not directly comparable with the baseline test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15, stratify=y)

**Applying Logistic Regression**

In [23]:
weights = -1 # pass -1 to use Logistic Regression without class weights
log_reg(X_train, y_train, X_test, y_test, weights)

Accuracy 0.694478527607362 

preds [1 1 1 1 1] 

              precision    recall  f1-score   support

           0       0.69      0.71      0.70       408
           1       0.70      0.68      0.69       407

    accuracy                           0.69       815
   macro avg       0.69      0.69      0.69       815
weighted avg       0.69      0.69      0.69       815



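**With undersampling: the f1 score for minority class 1 improved to 0.69 from 0.31.** Note that this score is measured on a test split drawn from the undersampled, balanced data (support 408/407), whereas the baseline 0.31 was measured on the original imbalanced test set (support 1593/407), so the two numbers are not directly comparable.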

**Method2: Oversampling**

In [25]:
# Oversample 1-class and concat the DataFrames of both classes

In [26]:
# Draw minority rows with replacement until they match the majority-class count.
# random_state pins the draw so the oversampled table is reproducible across runs.
df_class_1_over = df_class_1.sample(n=count_class_0, replace=True, random_state=15)
df_test_over = pd.concat([df_class_0, df_class_1_over], axis=0)

print('Random over-sampling:')
print(df_test_over.Exited.value_counts())

Random over-sampling:
0    7963
1    7963
Name: Exited, dtype: int64


In [27]:
# Features and labels of the oversampled (balanced) table.
X = df_test_over.drop('Exited',axis='columns')
y = df_test_over['Exited']

from sklearn.model_selection import train_test_split
# NOTE(review): the minority rows were duplicated before this split, so copies of the same
# row can land in both the train and the test part — the test scores may be optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15, stratify=y)

**Logistic Regression**

In [28]:
weights = -1 # pass -1 to use Logistic Regression without class weights
log_reg(X_train, y_train, X_test, y_test, weights)

Accuracy 0.7134337727558067 

preds [0 1 0 1 0] 

              precision    recall  f1-score   support

           0       0.71      0.72      0.71      1593
           1       0.71      0.71      0.71      1593

    accuracy                           0.71      3186
   macro avg       0.71      0.71      0.71      3186
weighted avg       0.71      0.71      0.71      3186



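**With oversampling: the f1 score for minority class 1 improved to 0.71 from 0.31.** Note that this score is measured on a test split drawn from the oversampled, balanced data (support 1593/1593), which can contain duplicates of training rows, so it is not directly comparable with the baseline measured on the original imbalanced test set (support 1593/407).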

**Method3: SMOTE**

In [29]:
# Go back to the full, un-resampled table as the input for SMOTE.
y = df2['Exited']
X = df2.drop(columns=['Exited'])

In [33]:
from imblearn.over_sampling import SMOTE

# Resample only the minority class with synthetic rows (sampling_strategy='minority').
# random_state pins the synthetic samples so the resampled set and the metrics
# computed from it are reproducible across runs.
smote = SMOTE(sampling_strategy='minority', random_state=15)
X_sm, y_sm = smote.fit_resample(X, y)

y_sm.value_counts()

1    7963
0    7963
Name: Exited, dtype: int64

In [34]:
from sklearn.model_selection import train_test_split
# NOTE(review): SMOTE was applied to the full table before this split, so the test part
# contains synthetic rows; its scores are not directly comparable with the baseline test set.
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.2, random_state=15, stratify=y_sm)

**Logistic Regression**

In [35]:
weights = -1 # pass -1 to use Logistic Regression without class weights
log_reg(X_train, y_train, X_test, y_test, weights)

Accuracy 0.7156308851224106 

preds [0 0 1 1 0] 

              precision    recall  f1-score   support

           0       0.71      0.73      0.72      1593
           1       0.72      0.71      0.71      1593

    accuracy                           0.72      3186
   macro avg       0.72      0.72      0.72      3186
weighted avg       0.72      0.72      0.72      3186



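**With SMOTE: the f1 score for minority class 1 improved to 0.71 from 0.31.** Note that this score is measured on a test split drawn from the SMOTE-resampled, balanced data (support 1593/1593), so it is not directly comparable with the baseline measured on the original imbalanced test set (support 1593/407).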

**Method4: Use of Ensemble with undersampling**

In [37]:
df2.Exited.value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

In [38]:
# Restore the original (un-resampled) labels and features
y = df2['Exited']
X = df2.drop(columns=['Exited'])

In [39]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split of the original data; the test part keeps the natural class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=15,
    stratify=y,
)

In [40]:
y_train.value_counts()

0    6370
1    1630
Name: Exited, dtype: int64

**model1 --> class1(1630) + class0(0, 1630)**

**model2 --> class1(1630) + class0(1630, 3260)**

**model3 --> class1(1630) + class0(3260, 6370)**


In [41]:
model = LogisticRegression()

# Training features with the label attached as a last column, built as a new frame
# so X_train itself is left untouched.
df3 = X_train.assign(Exited=y_train)

In [42]:
df3.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_Germany,Geography_Spain,Exited
5710,0.856,0,0.216216,5,0.554265,2,0,0,0.339721,0,0,0
3745,0.852,1,0.256757,1,0.371163,2,1,1,0.980432,1,0,0
5429,0.664,1,0.405405,7,0.0,2,1,0,0.325318,0,0,0
551,0.648,0,0.391892,6,0.426077,1,1,1,0.010339,1,0,1
8967,0.97,0,0.094595,7,0.0,2,1,1,0.41723,0,0,0


In [43]:
# Partition the training frame by label: 0 = stayed, 1 = exited.
is_exited = df3['Exited'] == 1
df3_class0 = df3[df3['Exited'] == 0]
df3_class1 = df3[is_exited]

In [53]:
def get_train_batch(df_majority, df_minority, start, end):
    """Build one training batch: a slice of the majority class plus all minority rows.

    Parameters
    ----------
    df_majority : DataFrame of majority-class rows, including the 'Exited' column.
    df_minority : DataFrame of minority-class rows, including the 'Exited' column.
    start, end : positional row bounds of the majority slice (``end`` exclusive).

    Returns
    -------
    (X_train, y_train) : the batch features without 'Exited', and the 'Exited' labels.
    """
    # .iloc makes the slice explicitly positional; a bare df[start:end] changes
    # meaning depending on the index type.
    majority_slice = df_majority.iloc[start:end]
    df_train = pd.concat([majority_slice, df_minority], axis=0)
    X_train = df_train.drop('Exited', axis='columns')
    y_train = df_train['Exited']
    return X_train, y_train

In [54]:
# Ensemble member 1: every minority row plus majority rows at positions 0..1629.
# Batch-specific names are used so the global X_train / y_train split is not
# overwritten (re-running the cell that builds df3 would otherwise pick up a batch).
X_batch1, y_batch1 = get_train_batch(df3_class0, df3_class1, 0, 1630)

model1 = LogisticRegression()
model1.fit(X_batch1, y_batch1)
y_pred1 = model1.predict(X_test)

In [55]:
# Ensemble member 2: every minority row plus majority rows at positions 1630..3259.
# Batch-specific names are used so the global X_train / y_train split is not
# overwritten (re-running the cell that builds df3 would otherwise pick up a batch).
X_batch2, y_batch2 = get_train_batch(df3_class0, df3_class1, 1630, 3260)

model2 = LogisticRegression()
model2.fit(X_batch2, y_batch2)
y_pred2 = model2.predict(X_test)

In [56]:
# Ensemble member 3: every minority row plus the remaining majority rows (positions 3260..6369).
# Batch-specific names are used so the global X_train / y_train split is not
# overwritten (re-running the cell that builds df3 would otherwise pick up a batch).
X_batch3, y_batch3 = get_train_batch(df3_class0, df3_class1, 3260, 6370)

model3 = LogisticRegression()
model3.fit(X_batch3, y_batch3)
y_pred3 = model3.predict(X_test)

In [57]:
len(y_pred1)

2000

In [58]:
# Majority vote over the three models: predict 1 when more than one of them predicts 1.
votes = y_pred1 + y_pred2 + y_pred3
y_pred_final = np.where(votes > 1, 1, 0).astype(y_pred1.dtype)

In [59]:
# Score the majority-vote predictions against the test labels from the ensemble split.
cl_rep = classification_report(y_test, y_pred_final)
print(cl_rep)

              precision    recall  f1-score   support

           0       0.89      0.74      0.80      1593
           1       0.38      0.63      0.47       407

    accuracy                           0.71      2000
   macro avg       0.63      0.68      0.64      2000
weighted avg       0.78      0.71      0.74      2000

