# <font color='blue'>Telecom Churn Case Study</font>
* Institution: IIIT, Bangalore and UpGrad
* Course: PG Diploma in Machine Lerning and AI March 2018
* Date: 14-Aug-2018
* Submitted by:
    1. Pandinath Siddineni (ID- APFE187000194)
    2. AKNR Chandra Sekhar (ID- APFE187000315)
    3. Brajesh Kumar       (ID- APFE187000149)
    4. Shweta Tiwari
-----------------------------------

# <font color='blue'>PART 3: LASSO & DECISSION TREE</font>

In [1]:
import os.path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

pd.options.display.float_format = '{:.2f}'.format

In [2]:
# Load clean telecom data file
master_df = pd.read_csv('telecom_churn_data_clean.csv', low_memory=False)
master_df.head()

Unnamed: 0,mobile_number,arpu_6,arpu_7,arpu_8,onnet_mou_6,onnet_mou_7,onnet_mou_8,offnet_mou_6,offnet_mou_7,offnet_mou_8,...,fb7_1.0,fb8_0.0,fb8_1.0,total_rech_data_amt_6,total_rech_data_amt_7,total_rech_data_amt_8,churn,rech_days_left_6,rech_days_left_7,rech_days_left_8
0,7000701601,1069.18,1349.85,3171.48,57.84,54.68,52.29,453.43,567.16,325.91,...,0,0,0,0.0,0.0,0.0,1,3.0,6.0,5.0
1,7001524846,378.72,492.22,137.36,413.69,351.03,35.08,94.66,80.63,136.48,...,1,0,1,0.0,354.0,207.0,0,5.0,0.0,1.0
2,7002124215,514.45,597.75,637.76,102.41,132.11,85.14,757.93,896.68,983.39,...,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0
3,7000887461,74.35,193.9,366.97,48.96,50.66,33.58,85.41,89.36,205.89,...,1,0,1,0.0,712.0,540.0,0,12.0,24.0,7.0
4,7000149764,977.02,2362.83,409.23,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,1,5285.0,20424.0,455.0,0,0.0,1.0,5.0


In [3]:
print('Dataframe Shape: ', master_df.shape)
print("Dataframe Info: \n"); master_df.info()

Dataframe Shape:  (28504, 144)
Dataframe Info: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28504 entries, 0 to 28503
Columns: 144 entries, mobile_number to rech_days_left_8
dtypes: float64(105), int64(39)
memory usage: 31.3 MB


In [4]:
# Drop MemberID/Phone-number
telecom = master_df.drop(['mobile_number'], axis=1)

# Create X (independent variable) & y (dependent variable) 
df_telecom = telecom.drop(['churn'], axis=1)
X = telecom.drop(['churn'], axis=1)
y = telecom['churn']

In [5]:
#list(master_df)
master_df.shape

(28504, 144)

### Data Standardization/Normalization

In [6]:
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler


scaler = preprocessing.StandardScaler().fit(X)
X = scaler.transform(X)

### Split Data into Train & Test

In [7]:
# Split in train & Test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, train_size=0.7, random_state=1)

In [8]:
print("X_train Dataframe Shape {}".format(X_train.shape))
print("X_test Dataframe Shape {}".format(X_test.shape))

y_train_imb = (y_train != 0).sum()/(y_train == 0).sum()
y_test_imb = (y_test != 0).sum()/(y_test == 0).sum()
print("Imbalance in Train Data: {}".format(y_train_imb))
print("Imbalance in Test Data: {}".format(y_test_imb))

X_train Dataframe Shape (19952, 142)
X_test Dataframe Shape (8552, 142)
Imbalance in Train Data: 0.05941698083151914
Imbalance in Test Data: 0.059071207430340555


### Balance data set by oversampling

In [9]:
# (Training) Balance Data-Set --- SMOT
from imblearn.over_sampling import SMOTE

sm = SMOTE(kind = "regular")
X_tr,y_tr = sm.fit_sample(X_train,y_train)

In [10]:
print("X_tr Dataframe Shape {}".format(X_tr.shape))
print("y_tr Dataframe Shape {}".format(y_tr.shape))

data_imbalance = (y_tr != 0).sum()/(y_tr == 0).sum()
print("Imbalance in Train Data: {}".format(data_imbalance))

X_tr Dataframe Shape (37666, 142)
y_tr Dataframe Shape (37666,)
Imbalance in Train Data: 1.0


### Feature reduction using LASSO

In [11]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
 
lsvc = LinearSVC(C=0.001, penalty="l1", dual=False).fit(X_tr, y_tr)
model = SelectFromModel(lsvc, prefit=True)
X_lasso = model.transform(X_tr)
pos = model.get_support(indices=True)
 ### Feature reduction using RFE
print(X_lasso.shape)
print(pos)

(37666, 47)
[  0   1   3  10  13  14  20  23  29  34  39  40  47  49  52  53  58  65
  78  79  80  83  91  92  95  98 101 102 104 107 108 109 110 111 112 113
 116 118 120 121 125 128 132 135 139 140 141]


In [12]:

#feature vector for decision tree#feature 
lasso_features = list(df_telecom.columns[pos])
print("Features identified by LASSO for model buidling: ", lasso_features)

Features identified by LASSO for model buidling:  ['arpu_6', 'arpu_7', 'onnet_mou_6', 'roam_ic_mou_7', 'roam_og_mou_7', 'roam_og_mou_8', 'loc_og_t2m_mou_8', 'loc_og_t2f_mou_8', 'loc_og_mou_8', 'std_og_t2m_mou_7', 'std_og_mou_6', 'std_og_mou_7', 'spl_og_mou_8', 'og_others_7', 'total_og_mou_7', 'total_og_mou_8', 'loc_ic_t2m_mou_7', 'loc_ic_mou_8', 'total_ic_mou_6', 'total_ic_mou_7', 'total_ic_mou_8', 'spl_ic_mou_8', 'total_rech_num_7', 'total_rech_num_8', 'total_rech_amt_8', 'max_rech_amt_8', 'last_day_rch_amt_8', 'vol_2g_mb_6', 'vol_2g_mb_8', 'vol_3g_mb_8', 'monthly_2g_6', 'monthly_2g_7', 'monthly_2g_8', 'sachet_2g_6', 'sachet_2g_7', 'sachet_2g_8', 'monthly_3g_8', 'sachet_3g_7', 'aon', 'aug_vbc_3g', 'night6_1.0', 'night8_0.0', 'fb7_0.0', 'fb8_1.0', 'rech_days_left_6', 'rech_days_left_7', 'rech_days_left_8']


In [13]:
X_train = X_lasso
y_train = y_tr

In [14]:
print ("Feature space holds %d observations and %d features" % X_train.shape)
print ("Unique target labels:", np.unique(y_train))

Feature space holds 37666 observations and 47 features
Unique target labels: [0 1]


### Decision Tree with default hyperparameter

In [15]:
# Importing decision tree classifier from sklearn library
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

# Fitting the decision tree with default hyperparameters, apart from
# max_depth which is 5 so that we can plot and read the tree.
dt_default = DecisionTreeClassifier(max_depth=5)
dt_default.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [16]:
# Let's check the evaluation metrics of our default model
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Making predictions
X_test = pd.DataFrame(data=X_test).iloc[:, pos]
y_pred_default = dt_default.predict(X_test)

# Printing classification report
print(classification_report(y_test, y_pred_default))

             precision    recall  f1-score   support

          0       0.98      0.87      0.92      8075
          1       0.26      0.75      0.38       477

avg / total       0.94      0.87      0.89      8552



In [17]:
# Printing confusion matrix and accuracy
print(confusion_matrix(y_test,y_pred_default))
print(accuracy_score(y_test,y_pred_default))

[[7043 1032]
 [ 120  357]]
0.865294667914


### Hyperparameter Tuning

NOTE: 
1. Hyperparameter Tunning is commented as it takes heavy computing power and time. It can be run by uncommenting it.
2. Getting 86% accuracy that looks to be pretty good.

### Tuning max_depth

In [18]:
# # GridSearchCV to find optimal max_depth
# from sklearn.model_selection import KFold
# from sklearn.model_selection import GridSearchCV


# # specify number of folds for k-fold CV
# n_folds = 5

# # parameters to build the model on
# parameters = {'max_depth': range(1, 40)}

# # instantiate the model
# dtree = DecisionTreeClassifier(criterion = "gini", 
#                                random_state = 100)

# # fit tree on training data
# tree = GridSearchCV(dtree, parameters, 
#                     cv=n_folds, 
#                    scoring="accuracy")
# tree.fit(X_train, y_train)

In [19]:
# # scores of GridSearch CV
# scores = tree.cv_results_
# pd.DataFrame(scores).head()

In [20]:
# # plotting accuracies with max_depth
# plt.figure()
# plt.plot(scores["param_max_depth"], 
#          scores["mean_train_score"], 
#          label="training accuracy")
# plt.plot(scores["param_max_depth"], 
#          scores["mean_test_score"], 
#          label="test accuracy")
# plt.xlabel("max_depth")
# plt.ylabel("Accuracy")
# plt.legend()
# plt.show()

conclusion for max depth:
You can see that as we increase the value of max_depth, both training and test score increase till about max-depth = 10, after which the test score is constant. Note that the scores are average accuracies across the 5-folds.

we can consider max_depth=10

### Tuning min_samples_leaf

In [21]:
# # GridSearchCV to find optimal max_depth
# from sklearn.model_selection import KFold
# from sklearn.model_selection import GridSearchCV


# # specify number of folds for k-fold CV
# n_folds = 5

# # parameters to build the model on
# parameters = {'min_samples_leaf': range(5, 200, 20)}

# # instantiate the model
# dtree = DecisionTreeClassifier(criterion = "gini", 
#                                random_state = 100)

# # fit tree on training data
# tree = GridSearchCV(dtree, parameters, 
#                     cv=n_folds, 
#                    scoring="accuracy")
# tree.fit(X_train, y_train)

In [22]:
# # scores of GridSearch CV
# scores = tree.cv_results_
# pd.DataFrame(scores).head()

In [23]:
# # plotting accuracies with min_samples_leaf
# plt.figure()
# plt.plot(scores["param_min_samples_leaf"], 
#          scores["mean_train_score"], 
#          label="training accuracy")
# plt.plot(scores["param_min_samples_leaf"], 
#          scores["mean_test_score"], 
#          label="test accuracy")
# plt.xlabel("min_samples_leaf")
# plt.ylabel("Accuracy")
# plt.legend()
# plt.show()

conclusion for min_samples_leaf:
at low values of min_samples_leaf seems overfitted. At values 125,the model becomes more stable and the training and test accuracy start to converge.
min_samples_leaf=125

### Tuning min_samples_split

In [24]:
# # GridSearchCV to find optimal min_samples_split
# from sklearn.model_selection import KFold
# from sklearn.model_selection import GridSearchCV


# # specify number of folds for k-fold CV
# n_folds = 5

# # parameters to build the model on
# parameters = {'min_samples_split': range(5, 200, 20)}

# # instantiate the model
# dtree = DecisionTreeClassifier(criterion = "gini", 
#                                random_state = 100)

# # fit tree on training data
# tree = GridSearchCV(dtree, parameters, 
#                     cv=n_folds, 
#                    scoring="accuracy")
# tree.fit(X_train, y_train)

In [25]:
# # scores of GridSearch CV
# scores = tree.cv_results_
# pd.DataFrame(scores).head()

In [26]:
# # plotting accuracies with min_samples_leaf
# plt.figure()
# plt.plot(scores["param_min_samples_split"], 
#          scores["mean_train_score"], 
#          label="training accuracy")
# plt.plot(scores["param_min_samples_split"], 
#          scores["mean_test_score"], 
#          label="test accuracy")
# plt.xlabel("min_samples_split")
# plt.ylabel("Accuracy")
# plt.legend()
# plt.show()

as increase min_samples_split, the tree overfits lesser since the model is less complex

In [27]:
# # Create the parameter grid 
# param_grid = {
#     'max_depth': range(5, 15, 5),
#     'min_samples_leaf': range(50, 150, 50),
#     'min_samples_split': range(50, 150, 50),
#     'criterion': ["entropy", "gini"]
# }

# n_folds = 5

# # Instantiate the grid search model
# dtree = DecisionTreeClassifier()
# grid_search = GridSearchCV(estimator = dtree, param_grid = param_grid, 
#                           cv = n_folds, verbose = 1)

# # Fit the grid search to the data
# grid_search.fit(X_train,y_train)

In [28]:
# # cv results
# cv_results = pd.DataFrame(grid_search.cv_results_)
# cv_results

In [29]:
# # printing the optimal accuracy score and hyperparameters
# print("best accuracy", grid_search.best_score_)
# print(grid_search.best_estimator_)

In [30]:
# # model with optimal hyperparameters
# clf_gini = DecisionTreeClassifier(criterion = "gini", 
#                                   random_state = 100,
#                                   max_depth=10, 
#                                   min_samples_leaf=50,
#                                   min_samples_split=50)
# clf_gini.fit(X_train, y_train)

In [31]:
# # accuracy score
# clf_gini.score(X_test,y_test)

# <font color='blue'>SUMMARY PART 3: LASSO & DECISSION TREE</font>
OBSERVATIONS
1. 47 features identified by LASSO for model buiding.
2. Getting around 86% accuracy 
3. Confusion matix shows lot of false positives still exist.

NEXT STEPS:
1. Try Random Forrest.