# Lab | Handling Data Imbalance in Classification Models

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score, confusion_matrix
from sklearn.utils import resample
pd.set_option('display.max_columns', None)

import warnings
warnings.filterwarnings('ignore')

In [2]:
churnData = pd.read_csv('Customer-Churn.csv')

In [3]:
print(churnData.shape)
churnData.head()

(7043, 16)


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


In [4]:
churnData.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

#### TotalCharges column

In [5]:
churnData['TotalCharges'].value_counts()

          11
20.2      11
19.75      9
20.05      8
19.9       8
          ..
6849.4     1
692.35     1
130.15     1
3211.9     1
6844.5     1
Name: TotalCharges, Length: 6531, dtype: int64

As I can observe, I have some rows without any value, so as TotalCharges is mainly the same as MonthlyCharges x Tenure, I will do the calculation

In [6]:
for i in range(len(churnData['TotalCharges'])):
    if churnData['TotalCharges'][i] == " ": 
        churnData['TotalCharges'][i] = churnData['MonthlyCharges'][i] * churnData['tenure'][i]
        

Now, I'll convert it in a numerical column

In [7]:
churnData['TotalCharges'] = pd.to_numeric(churnData['TotalCharges'])

#### Checking for nulls

In [8]:
churnData.isna().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

#### Numerical Features

In [9]:
features = churnData.select_dtypes(np.number)
features.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
0,0,1,29.85,29.85
1,0,34,56.95,1889.5
2,0,2,53.85,108.15
3,0,45,42.3,1840.75
4,0,2,70.7,151.65


Scaling the features either by using standard scaler

In [10]:
transformer = MinMaxScaler().fit(features)
features_scaled = pd.DataFrame(transformer.transform(features), columns=features.columns)
features_scaled.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
0,0.0,0.013889,0.115423,0.003437
1,0.0,0.472222,0.385075,0.217564
2,0.0,0.027778,0.354229,0.012453
3,0.0,0.625,0.239303,0.211951
4,0.0,0.027778,0.521891,0.017462


Converting the target into a numerical 

In [11]:
churnData['Churn'] = churnData['Churn'].map({'No': 0, 'Yes': 1}).astype(int)

In [12]:
labels = pd.DataFrame(churnData['Churn'].to_numpy(), columns = ['labels'])
labels.head()

Unnamed: 0,labels
0,0
1,0
2,1
3,0
4,1


#### Splitting data y LR model

In [13]:
def LRTest(features, labels):
    transformer = MinMaxScaler().fit(features)
    features_scaled = pd.DataFrame(transformer.transform(features), columns=features.columns)
    X_train, X_test, y_train, y_test = train_test_split(features_scaled, labels, random_state=0)
    LR = LogisticRegression()
    LR.fit(X_train, y_train)
    pred = LR.predict(X_test)
    print("LR score:", LR.score(X_test, y_test))
    print("recall: ",recall_score(y_test,pred))
    print("\n Confusion matrix: \n",confusion_matrix(y_test,pred))

In [14]:
LRTest(features, labels)

LR score: 0.7830777967064169
recall:  0.4535637149028078

 Confusion matrix: 
 [[1169  129]
 [ 253  210]]


With LR I'm detecting the 45% of the possitive churns (recall)

In [15]:
labels.value_counts()

labels
0         5174
1         1869
dtype: int64

As the data is unbalanced, I will use resampling strategies

In [16]:
category_0 = churnData[churnData['Churn'] == 0]
category_1 = churnData[churnData['Churn'] == 1]

### Upsampling

In [17]:
category_1_oversampled = resample(category_1, 
                                   replace=True,
                                   n_samples = len(category_0))

Checking it is correct

In [18]:
print(category_0.shape)
print(category_1_oversampled.shape)

(5174, 16)
(5174, 16)


In [19]:
data_upsampled = pd.concat([category_0, category_1_oversampled], axis=0)

##### Trying LRModel

In [20]:
features = data_upsampled[['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']]
labels = pd.DataFrame(data_upsampled['Churn'].to_numpy(), columns = ['labels'])

In [21]:
print("LR model with Upsampling \n")
LRTest(features, labels)

LR model with Upsampling 

LR score: 0.7386934673366834
recall:  0.7518910741301059

 Confusion matrix: 
 [[917 348]
 [328 994]]


Using Downsampling I'm detecting almost a 30% more of the possitive churns (recall)

### Downsampling

In [22]:
category_0_undersampled = resample(category_0, 
                                   replace=False, #Downsampling
                                   n_samples = len(category_1))

Checking it is correct

In [23]:
print(category_0_undersampled.shape)
print(category_1.shape)

(1869, 16)
(1869, 16)


In [24]:
data_downsampled = pd.concat([category_0_undersampled, category_1], axis=0)

##### Trying LRModel

In [25]:
features = data_downsampled[['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']]
labels = pd.DataFrame(data_downsampled['Churn'].to_numpy(), columns = ['labels'])

In [26]:
print("LR model with Downsampling \n")
LRTest(features, labels)

LR model with Downsampling 

LR score: 0.7272727272727273
recall:  0.7288135593220338

 Confusion matrix: 
 [[336 127]
 [128 344]]


Using Downsampling I'm detecting almost a 30% more of the possitive churns (recall)