# <font color='blue'>Telecom Churn Case Study</font>
* Institution: IIIT, Bangalore and UpGrad
* Course: PG Diploma in Machine Lerning and AI March 2018
* Date: 14-Aug-2018
* Submitted by:
    1. Pandinath Siddineni (ID- APFE187000194)
    2. AKNR Chandra Sekhar (ID- APFE187000315)
    3. Brajesh Kumar       (ID- APFE187000149)
    4. Shweta Tiwari
-----------------------------------

# <font color='blue'>PART 3: LASSO & RANDOM FOREST</font>

In [1]:
import os.path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

pd.options.display.float_format = '{:.2f}'.format

In [2]:
# Load clean telecom data file
master_df = pd.read_csv('telecom_churn_data_clean.csv', low_memory=False)
master_df.head()

Unnamed: 0,mobile_number,arpu_6,arpu_7,arpu_8,onnet_mou_6,onnet_mou_7,onnet_mou_8,offnet_mou_6,offnet_mou_7,offnet_mou_8,...,fb7_1.0,fb8_0.0,fb8_1.0,total_rech_data_amt_6,total_rech_data_amt_7,total_rech_data_amt_8,churn,rech_days_left_6,rech_days_left_7,rech_days_left_8
0,7000701601,1069.18,1349.85,3171.48,57.84,54.68,52.29,453.43,567.16,325.91,...,0,0,0,0.0,0.0,0.0,1,3.0,6.0,5.0
1,7001524846,378.72,492.22,137.36,413.69,351.03,35.08,94.66,80.63,136.48,...,1,0,1,0.0,354.0,207.0,0,5.0,0.0,1.0
2,7002124215,514.45,597.75,637.76,102.41,132.11,85.14,757.93,896.68,983.39,...,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0
3,7000887461,74.35,193.9,366.97,48.96,50.66,33.58,85.41,89.36,205.89,...,1,0,1,0.0,712.0,540.0,0,12.0,24.0,7.0
4,7000149764,977.02,2362.83,409.23,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,1,5285.0,20424.0,455.0,0,0.0,1.0,5.0


In [3]:
print('Dataframe Shape: ', master_df.shape)
print("Dataframe Info: \n"); master_df.info()

Dataframe Shape:  (28504, 144)
Dataframe Info: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28504 entries, 0 to 28503
Columns: 144 entries, mobile_number to rech_days_left_8
dtypes: float64(105), int64(39)
memory usage: 31.3 MB


In [4]:
# Drop MemberID/Phone-number
telecom = master_df.drop(['mobile_number'], axis=1)

# Create X (independent variable) & y (dependent variable) 
df_telecom = telecom.drop(['churn'], axis=1)
X = telecom.drop(['churn'], axis=1)
y = telecom['churn']

In [5]:
X.shape

(28504, 142)

### Data Standardization/Normalization

In [6]:
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler


scaler = preprocessing.StandardScaler().fit(X)
X = scaler.transform(X)

### Split Data into Train & Test

In [7]:
# Split in train & Test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, train_size=0.7, random_state=1)

In [8]:
print("X_train Dataframe Shape {}".format(X_train.shape))
print("X_test Dataframe Shape {}".format(X_test.shape))

y_train_imb = (y_train != 0).sum()/(y_train == 0).sum()
y_test_imb = (y_test != 0).sum()/(y_test == 0).sum()
print("Imbalance in Train Data: {}".format(y_train_imb))
print("Imbalance in Test Data: {}".format(y_test_imb))

X_train Dataframe Shape (19952, 142)
X_test Dataframe Shape (8552, 142)
Imbalance in Train Data: 0.05941698083151914
Imbalance in Test Data: 0.059071207430340555


### Balance data set by oversampling

In [9]:
# (Training) Balance Data-Set --- SMOT
from imblearn.over_sampling import SMOTE

sm = SMOTE(kind = "regular")
X_tr,y_tr = sm.fit_sample(X_train,y_train)

In [10]:
print("X_tr Dataframe Shape {}".format(X_tr.shape))
print("y_tr Dataframe Shape {}".format(y_tr.shape))

data_imbalance = (y_tr != 0).sum()/(y_tr == 0).sum()
print("Imbalance in Train Data: {}".format(data_imbalance))

X_tr Dataframe Shape (37666, 142)
y_tr Dataframe Shape (37666,)
Imbalance in Train Data: 1.0


### Feature reduction using LASSO

In [13]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
 
lsvc = LinearSVC(C=0.001, penalty="l1", dual=False).fit(X_tr, y_tr)
model = SelectFromModel(lsvc, prefit=True)
X_lasso = model.transform(X_tr)
pos = model.get_support(indices=True)
 ### Feature reduction using RFE
print(X_lasso.shape)
print(pos)

(37666, 43)
[  0   1   3  10  13  14  20  23  29  34  39  40  47  49  52  53  58  65
  78  79  80  83  91  92 101 102 104 107 108 109 110 111 113 116 118 120
 121 125 132 135 139 140 141]


  if np.issubdtype(mask.dtype, np.int):


In [14]:

#feature vector for decision tree#feature 
lasso_features = list(df_telecom.columns[pos])
print("Features identified by LASSO for model buidling: ", lasso_features)

Features identified by LASSO for model buidling:  ['arpu_6', 'arpu_7', 'onnet_mou_6', 'roam_ic_mou_7', 'roam_og_mou_7', 'roam_og_mou_8', 'loc_og_t2m_mou_8', 'loc_og_t2f_mou_8', 'loc_og_mou_8', 'std_og_t2m_mou_7', 'std_og_mou_6', 'std_og_mou_7', 'spl_og_mou_8', 'og_others_7', 'total_og_mou_7', 'total_og_mou_8', 'loc_ic_t2m_mou_7', 'loc_ic_mou_8', 'total_ic_mou_6', 'total_ic_mou_7', 'total_ic_mou_8', 'spl_ic_mou_8', 'total_rech_num_7', 'total_rech_num_8', 'last_day_rch_amt_8', 'vol_2g_mb_6', 'vol_2g_mb_8', 'vol_3g_mb_8', 'monthly_2g_6', 'monthly_2g_7', 'monthly_2g_8', 'sachet_2g_6', 'sachet_2g_8', 'monthly_3g_8', 'sachet_3g_7', 'aon', 'aug_vbc_3g', 'night6_1.0', 'fb7_0.0', 'fb8_1.0', 'rech_days_left_6', 'rech_days_left_7', 'rech_days_left_8']


In [15]:
X_train = X_lasso
y_train = y_tr

In [16]:
print ("Feature space holds %d observations and %d features" % X_train.shape)
print ("Unique target labels:", np.unique(y_train))

Feature space holds 37666 observations and 43 features
Unique target labels: [0 1]


### Random Forest with default hyperparameter

In [17]:
# Importing random forest classifier from sklearn library
from sklearn.ensemble import RandomForestClassifier

# Running the random forest with default parameters.
rfc = RandomForestClassifier()

  from numpy.core.umath_tests import inner1d


In [18]:
# fit
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [19]:
# Making predictions
X_test = pd.DataFrame(data=X_test).iloc[:, pos]

predictions = rfc.predict(X_test)

In [20]:
# Importing classification report and confusion matrix from sklearn metrics
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
# Let's check the report of our default model
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

          0       0.97      0.97      0.97      8075
          1       0.48      0.49      0.48       477

avg / total       0.94      0.94      0.94      8552



In [21]:
# Printing confusion matrix
print(confusion_matrix(y_test, predictions))

[[7825  250]
 [ 245  232]]


In [22]:
print(accuracy_score(y_test, predictions))

0.9421188026192704


### Hyperparameter Tuning

NOTE: Hyperparameter Tunning is commented as it takes heavy computing power and time.

### Tuning max_depth

In [25]:
# # GridSearchCV to find optimal n_estimators
# from sklearn.model_selection import KFold
# from sklearn.model_selection import GridSearchCV


# # specify number of folds for k-fold CV
# n_folds = 5

# # parameters to build the model on
# parameters = {'max_depth': range(2, 20, 5)}

# # instantiate the model
# rf = RandomForestClassifier()


# # fit tree on training data
# rf = GridSearchCV(rf, parameters, 
#                     cv=n_folds, 
#                    scoring="accuracy")
# rf.fit(X_train, y_train)

In [None]:
# # scores of GridSearch CV
# scores = rf.cv_results_
# pd.DataFrame(scores).head()

In [None]:
# # plotting accuracies with max_depth
# plt.figure()
# plt.plot(scores["param_max_depth"], 
#          scores["mean_train_score"], 
#          label="training accuracy")
# plt.plot(scores["param_max_depth"], 
#          scores["mean_test_score"], 
#          label="test accuracy")
# plt.xlabel("max_depth")
# plt.ylabel("Accuracy")
# plt.legend()
# plt.show()

### Tuning n_estimators

In [None]:
# # GridSearchCV to find optimal n_estimators
# from sklearn.model_selection import KFold
# from sklearn.model_selection import GridSearchCV


# # specify number of folds for k-fold CV
# n_folds = 5

# # parameters to build the model on
# parameters = {'n_estimators': range(100, 1500, 400)}

# # instantiate the model (note we are specifying a max_depth)
# rf = RandomForestClassifier(max_depth=4)


# # fit tree on training data
# rf = GridSearchCV(rf, parameters, 
#                     cv=n_folds, 
#                    scoring="accuracy")
# rf.fit(X_train, y_train)

In [None]:
# # scores of GridSearch CV
# scores = rf.cv_results_
# pd.DataFrame(scores).head()

In [None]:
# # plotting accuracies with n_estimators
# plt.figure()
# plt.plot(scores["param_n_estimators"], 
#          scores["mean_train_score"], 
#          label="training accuracy")
# plt.plot(scores["param_n_estimators"], 
#          scores["mean_test_score"], 
#          label="test accuracy")
# plt.xlabel("n_estimators")
# plt.ylabel("Accuracy")
# plt.legend()
# plt.show()


### Tuning max_features

In [None]:
# # GridSearchCV to find optimal max_features
# from sklearn.model_selection import KFold
# from sklearn.model_selection import GridSearchCV


# # specify number of folds for k-fold CV
# n_folds = 5

# # parameters to build the model on
# parameters = {'max_features': [4, 8, 14, 20, 24,28,32,36,40,44,48,52,56]}

# # instantiate the model
# rf = RandomForestClassifier(max_depth=4)


# # fit tree on training data
# rf = GridSearchCV(rf, parameters, 
#                     cv=n_folds, 
#                    scoring="accuracy")
# rf.fit(X_train, y_train)

In [None]:
# # scores of GridSearch CV
# scores = rf.cv_results_
# pd.DataFrame(scores).head()

In [None]:
# # plotting accuracies with max_features
# plt.figure()
# plt.plot(scores["param_max_features"], 
#          scores["mean_train_score"], 
#          label="training accuracy")
# plt.plot(scores["param_max_features"], 
#          scores["mean_test_score"], 
#          label="test accuracy")
# plt.xlabel("max_features")
# plt.ylabel("Accuracy")
# plt.legend()
# plt.show()

### Tuning min_samples_leaf

In [None]:
# # GridSearchCV to find optimal min_samples_leaf
# from sklearn.model_selection import KFold
# from sklearn.model_selection import GridSearchCV


# # specify number of folds for k-fold CV
# n_folds = 5

# # parameters to build the model on
# parameters = {'min_samples_leaf': range(100, 400, 50)}

# # instantiate the model
# rf = RandomForestClassifier()


# # fit tree on training data
# rf = GridSearchCV(rf, parameters, 
#                     cv=n_folds, 
#                    scoring="accuracy")
# rf.fit(X_train, y_train)

In [None]:
# # scores of GridSearch CV
# scores = rf.cv_results_
# pd.DataFrame(scores).head()

In [None]:
# # plotting accuracies with min_samples_leaf
# plt.figure()
# plt.plot(scores["param_min_samples_leaf"], 
#          scores["mean_train_score"], 
#          label="training accuracy")
# plt.plot(scores["param_min_samples_leaf"], 
#          scores["mean_test_score"], 
#          label="test accuracy")
# plt.xlabel("min_samples_leaf")
# plt.ylabel("Accuracy")
# plt.legend()
# plt.show()

### Tuning min_samples_split

In [None]:
# # GridSearchCV to find optimal min_samples_split
# from sklearn.model_selection import KFold
# from sklearn.model_selection import GridSearchCV


# # specify number of folds for k-fold CV
# n_folds = 5

# # parameters to build the model on
# parameters = {'min_samples_split': range(200, 500, 50)}

# # instantiate the model
# rf = RandomForestClassifier()


# # fit tree on training data
# rf = GridSearchCV(rf, parameters, 
#                     cv=n_folds, 
#                    scoring="accuracy")
# rf.fit(X_train, y_train)

In [None]:
# # scores of GridSearch CV
# scores = rf.cv_results_
# pd.DataFrame(scores).head()

In [None]:
# # plotting accuracies with min_samples_split
# plt.figure()
# plt.plot(scores["param_min_samples_split"], 
#          scores["mean_train_score"], 
#          label="training accuracy")
# plt.plot(scores["param_min_samples_split"], 
#          scores["mean_test_score"], 
#          label="test accuracy")
# plt.xlabel("min_samples_split")
# plt.ylabel("Accuracy")
# plt.legend()
# plt.show()

### Grid Search to Find Optimal Hyperparameters

In [None]:
# # Create the parameter grid based on the results of random search 
# param_grid = {
#     'max_depth': [4,8,10],
#     'min_samples_leaf': range(100, 400, 200),
#     'min_samples_split': range(200, 500, 200),
#     'n_estimators': [100,200, 300], 
#     'max_features': [5, 10,20,30,40,50]
# }
# # Create a based model
# rf = RandomForestClassifier()
# # Instantiate the grid search model
# grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
#                           cv = 3, n_jobs = -1,verbose = 1)

In [None]:
# # Fit the grid search to the data
# grid_search.fit(X_train, y_train)

In [None]:
# # printing the optimal accuracy score and hyperparameters
# print('We can get accuracy of',grid_search.best_score_,'using',grid_search.best_params_)

In [None]:
# # model with the best hyperparameters
# from sklearn.ensemble import RandomForestClassifier
# rfc = RandomForestClassifier(bootstrap=True,
#                              max_depth=10,
#                              min_samples_leaf=100, 
#                              min_samples_split=200,
#                              max_features=10,
#                              n_estimators=100)

In [None]:
# # fit
# rfc.fit(X_train,y_train)

In [None]:
# # predict
# predictions = rfc.predict(X_test)

In [None]:
# # evaluation metrics
# from sklearn.metrics import classification_report,confusion_matrix
# print(classification_report(y_test,predictions))
# print(confusion_matrix(y_test,predictions))

#### <font color='blue'>SUMMARY PART 3: LASSO & RANDOM FOREST</font>
OBSERVATIONS
1. Getting 94.0% accuracy 
2. Confusion matix clearly improved a lot, false positives still exist but reduced.
