In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [30]:
#load dataset
df = pd.read_csv('handlingImbalance.csv', index_col = 'id')

In [31]:
df.head()

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
167647,Male,22,1,7.0,1,< 1 Year,No,2630.0,152.0,16,0
17163,Male,42,1,28.0,0,1-2 Year,Yes,43327.0,26.0,135,0
32023,Female,66,1,33.0,0,1-2 Year,Yes,35841.0,124.0,253,0
87447,Female,22,1,33.0,0,< 1 Year,No,27645.0,152.0,69,0
501933,Male,28,1,46.0,1,< 1 Year,No,29023.0,152.0,211,0


In [32]:
df.shape

(347154, 11)

In [33]:
df['Response'].value_counts()

0    319553
1     27601
Name: Response, dtype: int64

In [34]:
df.describe()

Unnamed: 0,Age,Driving_License,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage,Response
count,347154.0,347154.0,347154.0,347154.0,347154.0,347154.0,347154.0,347154.0
mean,38.083303,0.997969,26.374168,0.538248,30592.931647,114.224978,154.234959,0.079507
std,15.533697,0.045019,13.341472,0.498536,16991.98006,53.606112,83.75364,0.270528
min,20.0,0.0,0.0,0.0,2630.0,1.0,10.0,0.0
25%,24.0,1.0,15.0,0.0,24485.25,55.0,81.0,0.0
50%,34.0,1.0,28.0,1.0,31545.0,152.0,154.0,0.0
75%,49.0,1.0,36.0,1.0,39232.0,152.0,227.0,0.0
max,85.0,1.0,52.0,1.0,540165.0,163.0,299.0,1.0


In [35]:
#Checking null values
df.isnull().sum()

Gender                  0
Age                     0
Driving_License         0
Region_Code             0
Previously_Insured      0
Vehicle_Age             0
Vehicle_Damage          0
Annual_Premium          0
Policy_Sales_Channel    0
Vintage                 0
Response                0
dtype: int64

In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 347154 entries, 167647 to 401019
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Gender                347154 non-null  object 
 1   Age                   347154 non-null  int64  
 2   Driving_License       347154 non-null  int64  
 3   Region_Code           347154 non-null  float64
 4   Previously_Insured    347154 non-null  int64  
 5   Vehicle_Age           347154 non-null  object 
 6   Vehicle_Damage        347154 non-null  object 
 7   Annual_Premium        347154 non-null  float64
 8   Policy_Sales_Channel  347154 non-null  float64
 9   Vintage               347154 non-null  int64  
 10  Response              347154 non-null  int64  
dtypes: float64(3), int64(5), object(3)
memory usage: 31.8+ MB


In [37]:
#Independent and dependent variables
X = df.drop('Response', axis = 1)
y = df.Response

In [38]:
#Splitting train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [39]:
print(X_train.shape)
print(X_test.shape)

(277723, 10)
(69431, 10)


In [40]:
print(y_train.value_counts())
print(y_test.value_counts())

0    255674
1     22049
Name: Response, dtype: int64
0    63879
1     5552
Name: Response, dtype: int64


In [41]:
#Percentage of minority class 1
22049/(22049+255674)

0.07939205611346557

In [42]:
le = LabelEncoder()

In [43]:
X_train['Gender'] = le.fit_transform(X_train['Gender'])
X_train['Vehicle_Age'] = le.fit_transform(X_train['Vehicle_Age'])
X_train['Vehicle_Damage'] = le.fit_transform(X_train['Vehicle_Damage'])

X_test['Gender'] = le.fit_transform(X_test['Gender'])
X_test['Vehicle_Age'] = le.fit_transform(X_test['Vehicle_Age'])
X_test['Vehicle_Damage'] = le.fit_transform(X_test['Vehicle_Damage'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['Gender'] = le.fit_transform(X_train['Gender'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['Vehicle_Age'] = le.fit_transform(X_train['Vehicle_Age'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['Vehicle_Damage'] = le.fit_transform(X_train['Vehicle_Damage'])
A value is 

In [44]:
X_train.head(2)

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
325550,1,24,1,28.0,0,1,1,33224.0,157.0,198
85395,1,55,1,48.0,0,0,1,2630.0,124.0,271


In [45]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
y_train = y_train.values
y_test = y_test.values

# Without any sampling using logistic regression

In [49]:
lg = LogisticRegression()

In [50]:
lg.fit(X_train, y_train)

LogisticRegression()

In [51]:
y_pred=lg.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[63878     1]
 [ 5552     0]]
0.9200213161268022
              precision    recall  f1-score   support

           0       0.92      1.00      0.96     63879
           1       0.00      0.00      0.00      5552

    accuracy                           0.92     69431
   macro avg       0.46      0.50      0.48     69431
weighted avg       0.85      0.92      0.88     69431



# Without any sampling using RandomForest

In [54]:
classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)

RandomForestClassifier()

In [55]:
y_pred=classifier.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[62933   946]
 [ 5078   474]]
0.9132376027999021
              precision    recall  f1-score   support

           0       0.93      0.99      0.95     63879
           1       0.33      0.09      0.14      5552

    accuracy                           0.91     69431
   macro avg       0.63      0.54      0.55     69431
weighted avg       0.88      0.91      0.89     69431



# Under Sampling 

In [56]:
from collections import Counter
Counter(y_train)

Counter({0: 255674, 1: 22049})

In [57]:
#Under sampling the data using nearmiss, it does sampling such that at final 80% of majority class
#will equal to minority class
from imblearn.under_sampling import NearMiss
nm = NearMiss(0.8)
X_train_nm, y_train_nm = nm.fit_sample(X_train, y_train)



In [58]:
#After under sampling count of majority and minority classes
Counter(y_train_nm)

Counter({0: 27561, 1: 22049})

In [59]:
classifier_nm = RandomForestClassifier(bootstrap = True, criterion= 'entropy', max_depth = 6, max_features = None)
classifier_nm.fit(X_train_nm, y_train_nm)

RandomForestClassifier(criterion='entropy', max_depth=6, max_features=None)

In [60]:
y_pred=classifier_nm.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[18023 45856]
 [ 2656  2896]]
0.3012919301176708
              precision    recall  f1-score   support

           0       0.87      0.28      0.43     63879
           1       0.06      0.52      0.11      5552

    accuracy                           0.30     69431
   macro avg       0.47      0.40      0.27     69431
weighted avg       0.81      0.30      0.40     69431



# Over Sampling

In [61]:
#Over sampling using random over sampling, this will increase the count of minority class to 80% of majority class
from imblearn.over_sampling import RandomOverSampler

In [62]:
ro = RandomOverSampler(0.8)
X_train_ro, y_train_ro = ro.fit_sample(X_train, y_train)



In [63]:
Counter(y_train_ro)

Counter({0: 255674, 1: 204539})

In [65]:
# rf = RandomForestClassifier(random_state = 1)

In [66]:
# param_dist = {'max_depth':[5,8,10,12],
#              'bootstrap':[True,False],
#              'max_features':['auto','sqrt','log2',None],
#              'criterion':['gini','entropy']}

# classifier_ro = GridSearchCV(rf,cv=10,param_grid=param_dist,n_jobs=-1)

# classifier_ro.fit(X_train_ro, y_train_ro)

In [67]:
# classifier_ro.best_params_

In [68]:
# rf.set_params(bootstrap = True, criterion= 'entropy', max_depth = 6, max_features = None)

In [64]:
classifier_ro = RandomForestClassifier(bootstrap = True, criterion = 'entropy', max_depth = 6, max_features = None)
classifier_ro.fit(X_train_ro, y_train_ro)

RandomForestClassifier(criterion='entropy', max_depth=6, max_features=None)

In [69]:
y_pred=classifier_ro.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[49176 14703]
 [  566  4986]]
0.7800838242283706
              precision    recall  f1-score   support

           0       0.99      0.77      0.87     63879
           1       0.25      0.90      0.40      5552

    accuracy                           0.78     69431
   macro avg       0.62      0.83      0.63     69431
weighted avg       0.93      0.78      0.83     69431



# Using SMOTETomek

In [70]:
from imblearn.combine import SMOTETomek

In [72]:
st = SMOTETomek(0.8)
X_train_st, y_train_st = st.fit_sample(X_train, y_train)

In [74]:
Counter(y_train)

Counter({0: 255674, 1: 22049})

In [73]:
Counter(y_train_st)

Counter({0: 251833, 1: 200698})

In [75]:
classifier_st = RandomForestClassifier(bootstrap = True, criterion = 'entropy', max_depth = 6, max_features = None)
classifier_st.fit(X_train_st, y_train_st)

RandomForestClassifier(criterion='entropy', max_depth=6, max_features=None)

In [76]:
y_pred=classifier_st.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[49008 14871]
 [  645  4907]]
0.7765263354985525
              precision    recall  f1-score   support

           0       0.99      0.77      0.86     63879
           1       0.25      0.88      0.39      5552

    accuracy                           0.78     69431
   macro avg       0.62      0.83      0.63     69431
weighted avg       0.93      0.78      0.83     69431



# Conclusion

In [None]:
Without any sampling model with logisticregression or randomeforestclassifier tends to give the accuracy which is
almost equal to percentage of majority class, it is unable to learn data about the minority class and as we know 
in case of imbalance data we should not go with accuracy and have to go with other evaluation techniques,
also if we observe precison, recall and other in case of minority class is performing very bad(results are 0)

In [None]:
With under sampling, as we loose data, if we observe we got very less accuracy, precison, recall score in this case

In [None]:
With over sampling it's much better, we got some good accuracy, precision and recall scores