# Machine Learning in Marketing by Python

## 1. Churn Predictions with Decision Tree

In [39]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler 
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import accuracy_score

In [2]:
telco = pd.read_csv('telco.csv')
print(telco.head(5))

   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies        Contract Pape

In [3]:
telco_raw = telco.copy()

# print the data types
print(telco_raw.dtypes)

# print the header
print(telco_raw.head(3))

# print unique values of each column
print(telco_raw.nunique())

# check the proportion of missing data
telco_raw.isnull().mean()

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object
   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0

customerID          0.0
gender              0.0
SeniorCitizen       0.0
Partner             0.0
Dependents          0.0
tenure              0.0
PhoneService        0.0
MultipleLines       0.0
InternetService     0.0
OnlineSecurity      0.0
OnlineBackup        0.0
DeviceProtection    0.0
TechSupport         0.0
StreamingTV         0.0
StreamingMovies     0.0
Contract            0.0
PaperlessBilling    0.0
PaymentMethod       0.0
MonthlyCharges      0.0
TotalCharges        0.0
Churn               0.0
dtype: float64

In [11]:
# Store numerical and categorical data

## change data type

## store customerId and Churn column names
custid = ['customerID']
target = ['Churn']

## store categorical column names
categorical = telco_raw.nunique()[telco_raw.nunique()<5].keys().tolist()
categorical.remove(target[0])
categorical.remove('SeniorCitizen')
print(f'Categorical columns: {categorical}')
print()

## store numerical column names
numerical = [col for col in telco_raw.columns
                 if col not in custid + target + categorical]
numerical.remove('SeniorCitizen')
print(f'Numerical columns: {numerical}')

Categorical columns: ['gender_Male', 'Partner_Yes', 'Dependents_Yes', 'PhoneService_Yes', 'MultipleLines_No phone service', 'MultipleLines_Yes', 'InternetService_Fiber optic', 'InternetService_No', 'OnlineSecurity_No internet service', 'OnlineSecurity_Yes', 'OnlineBackup_No internet service', 'OnlineBackup_Yes', 'DeviceProtection_No internet service', 'DeviceProtection_Yes', 'TechSupport_No internet service', 'TechSupport_Yes', 'StreamingTV_No internet service', 'StreamingTV_Yes', 'StreamingMovies_No internet service', 'StreamingMovies_Yes', 'Contract_One year', 'Contract_Two year', 'PaperlessBilling_Yes', 'PaymentMethod_Credit card (automatic)', 'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check']

Numerical columns: ['tenure', 'MonthlyCharges', 'TotalCharges']


In [12]:
# check numerical columns' property
print(telco_raw[numerical].describe())

# convert to numerical datatype
telco_raw['TotalCharges'] = pd.to_numeric(telco_raw['TotalCharges'], errors = 'coerce')
print(telco_raw['TotalCharges'].describe())

# drop na values
telco_raw = telco_raw.dropna(axis=0)

print(telco_raw.describe())

            tenure  MonthlyCharges  TotalCharges
count  7032.000000     7032.000000   7032.000000
mean     32.421786       64.798208   2283.300441
std      24.545260       30.085974   2266.771362
min       1.000000       18.250000     18.800000
25%       9.000000       35.587500    401.450000
50%      29.000000       70.350000   1397.475000
75%      55.000000       89.862500   3794.737500
max      72.000000      118.750000   8684.800000
count    7032.000000
mean     2283.300441
std      2266.771362
min        18.800000
25%       401.450000
50%      1397.475000
75%      3794.737500
max      8684.800000
Name: TotalCharges, dtype: float64
       SeniorCitizen       tenure  MonthlyCharges  TotalCharges  gender_Male  \
count    7032.000000  7032.000000     7032.000000   7032.000000  7032.000000   
mean        0.162400    32.421786       64.798208   2283.300441     0.504693   
std         0.368844    24.545260       30.085974   2266.771362     0.500014   
min         0.000000     1.000000   

While trying to examine properties of numerical columns, there is only information of two columns showing, and column `TotalCharges` is missing. Which means that TotalCharges needs to be changed its datatype.Therefore, I converted this column by its datatype, from string to float, and to ensure all the value can be changed to numerical values, I set the false one to coerce. After the cleaning process, the entire dataset has 7032 rows instead of original 7043 rows.

In [13]:
# perform one-hot encoding to categorical variables
telco_raw = pd.get_dummies(data=telco_raw, columns = categorical, drop_first = True)

# initialize StandardScaler
scaler = StandardScaler()

# fit and transform the scaler on numerical columns
scaled_numerical = scaler.fit_transform(telco_raw[numerical])
scaled_numerical = pd.DataFrame(scaled_numerical, columns = numerical)

        tenure  MonthlyCharges  TotalCharges
0    -1.280248       -1.161694     -0.994194
1     0.064303       -0.260878     -0.173740
2    -1.239504       -0.363923     -0.959649
3     0.512486       -0.747850     -0.195248
4    -1.239504        0.196178     -0.940457
...        ...             ...           ...
7027 -0.343137        0.664868     -0.129180
7028  1.612573        1.276493      2.241056
7029 -0.872808       -1.170004     -0.854514
7030 -1.158016        0.319168     -0.872095
7031  1.368109        1.357932      2.012344

[7032 rows x 3 columns]
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7042
Data columns (total 32 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   customerID                               7032 non-null   object 
 1   SeniorCitizen                            7032 non-null   int64  
 2   tenure                                   70

In [27]:
# obtain features X 
X = telco_raw.copy()
X = X.drop(columns = ['Churn', 'customerID'])
X = shuffle(X, random_state=2022)

# obtain Y
Y = telco_raw.copy()['Churn']
Y = pd.get_dummies(data=Y, drop_first=True)
Y = shuffle(Y, random_state=2022)

In [33]:
# split X and Y into training and testing datasets
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size = 0.25, random_state = 2022) 

# ensure the proportion of training data and testing data are 75% and 25%
print(train_X.shape[0] / X.shape[0])
print(test_X.shape[0] / X.shape[0])

0.75
0.25


In [38]:
# Fit a decision tree

## Initialize the model 
mytree = tree.DecisionTreeClassifier(max_depth = 5)

## Fit the model on the training data
treemodel = mytree.fit(train_X, train_Y)

## Predict values on the testing data
pred_Y = treemodel.predict(test_X)

## Measure model performance on testing data
print(f'Accuracy score is : {accuracy_score(test_Y, pred_Y)}')


Accuracy score is : 0.7679180887372014


In [44]:
# initialize the Decision Tree
clf = tree.DecisionTreeClassifier(max_depth = 7, criterion = 'gini', splitter = 'best')

# fit the model
clf = clf.fit(train_X, train_Y)

# predict the values on test dataset
pred_Y = clf.predict(test_X)

# print accuracy values
print("Training Accuracy is ", np.round(clf.score(train_X, train_Y),3))
print("Testing Accuracy is ", np.round(accuracy_score(test_Y, pred_Y),3))

Training Accuracy is  0.829
Testing Accuracy is  0.757
