## Lab | Random Forests

**Instructions**

 - Apply the Random Forests algorithm but this time only by upscaling the data using SMOTE.
 - Note that since SMOTE works on numerical data only, we will first encode the categorical variables in this case.

In [55]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
churnData = pd.read_csv("Customer-Churn.csv")
churnData.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


In [3]:
churnData.shape

(7043, 16)

In [4]:
churnData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   OnlineSecurity    7043 non-null   object 
 7   OnlineBackup      7043 non-null   object 
 8   DeviceProtection  7043 non-null   object 
 9   TechSupport       7043 non-null   object 
 10  StreamingTV       7043 non-null   object 
 11  StreamingMovies   7043 non-null   object 
 12  Contract          7043 non-null   object 
 13  MonthlyCharges    7043 non-null   float64
 14  TotalCharges      7043 non-null   object 
 15  Churn             7043 non-null   object 
dtypes: float64(1), int64(2), object(13)
memory

In [5]:
churnData.isna().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [6]:
churnData["TotalCharges"] = churnData["TotalCharges"].apply(lambda x: np.nan if x == " " else x)

In [7]:
churnData.isna().sum()

gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [8]:
# due to small numbers of null values, dropping a null values from Total Charges column
churnData.dropna(subset=['TotalCharges'], inplace=True)

In [9]:
churnData.shape

(7032, 16)

In [10]:
churnData.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [11]:
churnData.columns = [churnData.columns[i].lower() for i in range(len(churnData.columns))]

In [12]:
churnData.columns

Index(['gender', 'seniorcitizen', 'partner', 'dependents', 'tenure',
       'phoneservice', 'onlinesecurity', 'onlinebackup', 'deviceprotection',
       'techsupport', 'streamingtv', 'streamingmovies', 'contract',
       'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [13]:
# Converting total charges column to numeric
churnData['totalcharges']= pd.to_numeric(churnData['totalcharges'], errors = 'coerce' )

In [14]:
churnData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7032 non-null   object 
 1   seniorcitizen     7032 non-null   int64  
 2   partner           7032 non-null   object 
 3   dependents        7032 non-null   object 
 4   tenure            7032 non-null   int64  
 5   phoneservice      7032 non-null   object 
 6   onlinesecurity    7032 non-null   object 
 7   onlinebackup      7032 non-null   object 
 8   deviceprotection  7032 non-null   object 
 9   techsupport       7032 non-null   object 
 10  streamingtv       7032 non-null   object 
 11  streamingmovies   7032 non-null   object 
 12  contract          7032 non-null   object 
 13  monthlycharges    7032 non-null   float64
 14  totalcharges      7032 non-null   float64
 15  churn             7032 non-null   object 
dtypes: float64(2), int64(2), object(12)
memory

 - **Encoding categorical**

In [15]:
cat = churnData[churnData.select_dtypes("object").columns]
cat.head()

Unnamed: 0,gender,partner,dependents,phoneservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,churn
0,Female,Yes,No,No,No,Yes,No,No,No,No,Month-to-month,No
1,Male,No,No,Yes,Yes,No,Yes,No,No,No,One year,No
2,Male,No,No,Yes,Yes,Yes,No,No,No,No,Month-to-month,Yes
3,Male,No,No,No,Yes,No,Yes,Yes,No,No,One year,No
4,Female,No,No,Yes,No,No,No,No,No,No,Month-to-month,Yes


In [16]:
cat.columns

Index(['gender', 'partner', 'dependents', 'phoneservice', 'onlinesecurity',
       'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv',
       'streamingmovies', 'contract', 'churn'],
      dtype='object')

In [17]:
cat['gender'].value_counts()

Male      3549
Female    3483
Name: gender, dtype: int64

In [18]:
cat['partner'].value_counts()

No     3639
Yes    3393
Name: partner, dtype: int64

In [19]:
cat['dependents'].value_counts()

No     4933
Yes    2099
Name: dependents, dtype: int64

In [20]:
cat['phoneservice'].value_counts()

Yes    6352
No      680
Name: phoneservice, dtype: int64

In [21]:
cat['onlinesecurity'].value_counts()

No                     3497
Yes                    2015
No internet service    1520
Name: onlinesecurity, dtype: int64

In [22]:
cat['onlinebackup'].value_counts()

No                     3087
Yes                    2425
No internet service    1520
Name: onlinebackup, dtype: int64

In [23]:
cat['deviceprotection'].value_counts()

No                     3094
Yes                    2418
No internet service    1520
Name: deviceprotection, dtype: int64

In [24]:
cat['techsupport'].value_counts()

No                     3472
Yes                    2040
No internet service    1520
Name: techsupport, dtype: int64

In [25]:
cat['streamingtv'].value_counts()

No                     2809
Yes                    2703
No internet service    1520
Name: streamingtv, dtype: int64

In [26]:
cat['streamingmovies'].value_counts()

No                     2781
Yes                    2731
No internet service    1520
Name: streamingmovies, dtype: int64

In [27]:
cat['contract'].value_counts()

Month-to-month    3875
Two year          1685
One year          1472
Name: contract, dtype: int64

In [28]:
cat['churn'].value_counts()

No     5163
Yes    1869
Name: churn, dtype: int64

**Below columns are type of binary values yes and no, so converting those columns into 0 and 1**
 - partner
 - dependents
 - phoneservice

In [29]:
churnData['partner'] = churnData['partner'].apply(lambda x: 1 if x == "Yes" else 0)

In [30]:
churnData['partner'].value_counts()

0    3639
1    3393
Name: partner, dtype: int64

In [31]:
churnData['dependents'] = churnData['dependents'].apply(lambda x: 1 if x == "Yes" else 0)

In [32]:
churnData['dependents'].value_counts()

0    4933
1    2099
Name: dependents, dtype: int64

In [33]:
churnData['phoneservice'] = churnData['phoneservice'].apply(lambda x: 1 if x == "Yes" else 0)

In [34]:
churnData['phoneservice'].value_counts()

1    6352
0     680
Name: phoneservice, dtype: int64

**Converting other categorical columns into numerical using get dummies**

In [35]:
cat_dummies = cat[['onlinesecurity',
       'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv',
       'streamingmovies', 'contract']]

In [36]:
cat_dummies.head(20)

Unnamed: 0,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract
0,No,Yes,No,No,No,No,Month-to-month
1,Yes,No,Yes,No,No,No,One year
2,Yes,Yes,No,No,No,No,Month-to-month
3,Yes,No,Yes,Yes,No,No,One year
4,No,No,No,No,No,No,Month-to-month
5,No,No,Yes,No,Yes,Yes,Month-to-month
6,No,Yes,No,No,Yes,No,Month-to-month
7,Yes,No,No,No,No,No,Month-to-month
8,No,No,Yes,Yes,Yes,Yes,Month-to-month
9,Yes,Yes,No,No,No,No,One year


In [37]:
cat_dummies = pd.get_dummies(cat_dummies)

In [38]:
cat_dummies.head(20)

Unnamed: 0,onlinesecurity_No,onlinesecurity_No internet service,onlinesecurity_Yes,onlinebackup_No,onlinebackup_No internet service,onlinebackup_Yes,deviceprotection_No,deviceprotection_No internet service,deviceprotection_Yes,techsupport_No,...,techsupport_Yes,streamingtv_No,streamingtv_No internet service,streamingtv_Yes,streamingmovies_No,streamingmovies_No internet service,streamingmovies_Yes,contract_Month-to-month,contract_One year,contract_Two year
0,1,0,0,0,0,1,1,0,0,1,...,0,1,0,0,1,0,0,1,0,0
1,0,0,1,1,0,0,0,0,1,1,...,0,1,0,0,1,0,0,0,1,0
2,0,0,1,0,0,1,1,0,0,1,...,0,1,0,0,1,0,0,1,0,0
3,0,0,1,1,0,0,0,0,1,0,...,1,1,0,0,1,0,0,0,1,0
4,1,0,0,1,0,0,1,0,0,1,...,0,1,0,0,1,0,0,1,0,0
5,1,0,0,1,0,0,0,0,1,1,...,0,0,0,1,0,0,1,1,0,0
6,1,0,0,0,0,1,1,0,0,1,...,0,0,0,1,1,0,0,1,0,0
7,0,0,1,1,0,0,1,0,0,1,...,0,1,0,0,1,0,0,1,0,0
8,1,0,0,1,0,0,0,0,1,0,...,1,0,0,1,0,0,1,1,0,0
9,0,0,1,0,0,1,1,0,0,1,...,0,1,0,0,1,0,0,0,1,0


In [39]:
churnData['gender'].value_counts()

Male      3549
Female    3483
Name: gender, dtype: int64

In [40]:
# Converting male and female into 0 and 1
churnData['gender'] = churnData['gender'].apply(lambda x: 1 if x == "Male" else 0)

In [41]:
churnData['gender'].value_counts()

1    3549
0    3483
Name: gender, dtype: int64

In [42]:
# removing original calumns from which were transferred into gat dummies
churnData = churnData.drop(['onlinesecurity',
       'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv',
       'streamingmovies', 'contract'], axis =1)

In [43]:
churnData.head()

Unnamed: 0,gender,seniorcitizen,partner,dependents,tenure,phoneservice,monthlycharges,totalcharges,churn
0,0,0,1,0,1,0,29.85,29.85,No
1,1,0,0,0,34,1,56.95,1889.5,No
2,1,0,0,0,2,1,53.85,108.15,Yes
3,1,0,0,0,45,0,42.3,1840.75,No
4,0,0,0,0,2,1,70.7,151.65,Yes


In [44]:
# concatinating all data into one
churnData_final = pd.concat([churnData, cat_dummies], axis=1)

In [45]:
churnData_final.head()

Unnamed: 0,gender,seniorcitizen,partner,dependents,tenure,phoneservice,monthlycharges,totalcharges,churn,onlinesecurity_No,...,techsupport_Yes,streamingtv_No,streamingtv_No internet service,streamingtv_Yes,streamingmovies_No,streamingmovies_No internet service,streamingmovies_Yes,contract_Month-to-month,contract_One year,contract_Two year
0,0,0,1,0,1,0,29.85,29.85,No,1,...,0,1,0,0,1,0,0,1,0,0
1,1,0,0,0,34,1,56.95,1889.5,No,0,...,0,1,0,0,1,0,0,0,1,0
2,1,0,0,0,2,1,53.85,108.15,Yes,0,...,0,1,0,0,1,0,0,1,0,0
3,1,0,0,0,45,0,42.3,1840.75,No,0,...,1,1,0,0,1,0,0,0,1,0
4,0,0,0,0,2,1,70.7,151.65,Yes,1,...,0,1,0,0,1,0,0,1,0,0


In [46]:
# X-y split

X = churnData_final.drop("churn", axis=1)
y = churnData_final[["churn"]]

In [47]:
# Checking target columns values
y.value_counts()

churn
No       5163
Yes      1869
dtype: int64

### 1. Apply SMOTE for upsampling the data

In [48]:
smote = SMOTE()
X_sm, y_sm = smote.fit_resample(X, y)
y_sm.value_counts()

churn
No       5163
Yes      5163
dtype: int64

In [50]:
# Train-test split

X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.20, random_state=21)

### Build Random Forest Model with hyperparameters

In [51]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 80, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [2,4]
# Minimum number of samples required to split a node
min_samples_split = [2, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# criterion
criteria = ['gini','entropy']

In [65]:
# Create the param grid - creating a pipeline for grid search cv
param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(param_grid)

{'n_estimators': [10, 17, 25, 33, 41, 48, 56, 64, 72, 80], 'max_features': ['auto', 'sqrt'], 'max_depth': [2, 4], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2], 'bootstrap': [True, False]}


In [54]:
# Building a random forest model
rf_Model = RandomForestClassifier()

In [66]:
# Creating Grid Search CV
rf_Grid = GridSearchCV(estimator = rf_Model, param_grid = param_grid, cv = 3, verbose=2, n_jobs = 4)
rf_Grid.fit(X_train, y_train)
rf_Grid.best_params_ 

Fitting 3 folds for each of 320 candidates, totalling 960 fits


{'bootstrap': False,
 'max_depth': 4,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 10}

In [70]:
rf_Grid.best_score_

0.8012097100032068

In [74]:
from sklearn.model_selection import cross_val_score

clf = RandomForestClassifier(random_state=0, max_features='auto', min_samples_leaf=1, min_samples_split=2, n_estimators=10)
cross_val_scores = cross_val_score(clf, X_train, y_train, cv=10)
print(np.mean(cross_val_scores))

0.8384987893462471
