In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import chisquare

In [3]:
import scipy.stats as stats
from scipy.stats import chi2_contingency

In [4]:
import xgboost as xgb

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score 
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_curve, auc, log_loss

In [7]:

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report

In [8]:
ds = pd.read_csv("cl_dataset.csv")

In [9]:
ds.head()

Unnamed: 0,Subscription.Id,Subscription.Mode,Subscription.Purchase.Date,Subscription.Cancellation.Date,Churn.Ind,Country.of.subscription.purchase,Purchase.Channel,Subscription.sold.by.Carrier,Store.id.of.subscription.sale,Device.Covered...Product.Line,Device.Covered...SKU,Device.Covered...Capacity,Device.Covered...Color,time_taken_cancel
0,34645276,SubscriptionProductCarrier,2018-09-21,,N,Country20,Channel1,Carrier7,12458799.0,ProductLine3,SKU70,Low,crimson,
1,34645277,SubscriptionProduct6,2018-11-23,2018-12-08,Y,Country20,Channel8,NonCarrier,12054878.0,ProductLine3,SKU72,High,crimson,15.0
2,34645278,SubscriptionProductCarrier,2018-11-03,,N,Country20,Channel1,Carrier8,10457799.0,ProductLine3,SKU4,Very Low,Bronze,
3,34645279,SubscriptionProduct6,2018-10-22,,N,Country20,Channel3,NonCarrier,14964174.0,ProductLine3,SKU72,High,Bronze,
4,34645280,SubscriptionProduct4,2018-10-20,,N,Country20,Channel4,NonCarrier,,ProductLine3,SKU72,High,crimson,


In [11]:
ds.shape

(110000, 14)

In [10]:
ds.describe()

Unnamed: 0,Subscription.Id,Store.id.of.subscription.sale,time_taken_cancel
count,110000.0,107801.0,10000.0
mean,34700280.0,12058760.0,20.2569
std,31754.41,1498346.0,20.055636
min,34645280.0,9946575.0,0.0
25%,34672780.0,10371680.0,4.0
50%,34700280.0,12458800.0,14.0
75%,34727780.0,13244430.0,30.0
max,34755280.0,14986600.0,118.0


## Dummy Variables
#### We mostly have categorical variables with  many levels in the dataset. To include in the model we create dummy variables


In [11]:
dummy_fields = ['Country.of.subscription.purchase', 'Purchase.Channel', 'Subscription.Mode', 'Device.Covered...Product.Line', 'Device.Covered...Capacity']

### Iterate through the varibles where dummy variables is to be created and later drop the unrequired fields

In [12]:
for each in dummy_fields:
    dummies = pd.get_dummies(ds[each], prefix=each, drop_first=False)
    ds = pd.concat([ds, dummies], axis=1)

In [13]:
fields_to_drop = ['Subscription.Mode', 'Subscription.Purchase.Date', 'Subscription.Cancellation.Date', 'Country.of.subscription.purchase', 
                  'Purchase.Channel', 'Subscription.sold.by.Carrier', 'Store.id.of.subscription.sale', 'Device.Covered...Product.Line', 
                  'Device.Covered...Capacity','Device.Covered...SKU','Device.Covered...Color','time_taken_cancel']

In [15]:
data = ds.drop(fields_to_drop, axis=1)
data.head()

Unnamed: 0,Subscription.Id,Churn.Ind,Country.of.subscription.purchase_Country1,Country.of.subscription.purchase_Country10,Country.of.subscription.purchase_Country11,Country.of.subscription.purchase_Country12,Country.of.subscription.purchase_Country13,Country.of.subscription.purchase_Country14,Country.of.subscription.purchase_Country15,Country.of.subscription.purchase_Country16,...,Device.Covered...Product.Line_ProductLine8,Device.Covered...Capacity_(null),Device.Covered...Capacity_High,Device.Covered...Capacity_Low,Device.Covered...Capacity_Medium,Device.Covered...Capacity_Very High,Device.Covered...Capacity_Very Low,Device.Covered...Capacity_Very Very High,Device.Covered...Capacity_Very Very Low,Device.Covered...Capacity_unspecified
0,34645276,N,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,34645277,Y,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,34645278,N,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,34645279,N,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,34645280,N,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


### Convert the dummy variables columns to category and drop the other unrequired variable

In [16]:
cols = data.columns.drop('Subscription.Id','Churn.Ind')

In [17]:
data[cols] = data[cols].astype('category')

In [18]:
data.dtypes

Subscription.Id                                    int64
Churn.Ind                                       category
Country.of.subscription.purchase_Country1       category
Country.of.subscription.purchase_Country10      category
Country.of.subscription.purchase_Country11      category
Country.of.subscription.purchase_Country12      category
Country.of.subscription.purchase_Country13      category
Country.of.subscription.purchase_Country14      category
Country.of.subscription.purchase_Country15      category
Country.of.subscription.purchase_Country16      category
Country.of.subscription.purchase_Country17      category
Country.of.subscription.purchase_Country18      category
Country.of.subscription.purchase_Country19      category
Country.of.subscription.purchase_Country2       category
Country.of.subscription.purchase_Country20      category
Country.of.subscription.purchase_Country3       category
Country.of.subscription.purchase_Country4       category
Country.of.subscription.purchas

In [19]:
data = data.drop('Subscription.Id',axis=1)

In [20]:
data.iloc[:, 0:2].head()

Unnamed: 0,Churn.Ind,Country.of.subscription.purchase_Country1
0,N,0
1,Y,0
2,N,0
3,N,0
4,N,0


In [22]:
data.head()

Unnamed: 0,Churn.Ind,Country.of.subscription.purchase_Country1,Country.of.subscription.purchase_Country10,Country.of.subscription.purchase_Country11,Country.of.subscription.purchase_Country12,Country.of.subscription.purchase_Country13,Country.of.subscription.purchase_Country14,Country.of.subscription.purchase_Country15,Country.of.subscription.purchase_Country16,Country.of.subscription.purchase_Country17,...,Device.Covered...Product.Line_ProductLine8,Device.Covered...Capacity_(null),Device.Covered...Capacity_High,Device.Covered...Capacity_Low,Device.Covered...Capacity_Medium,Device.Covered...Capacity_Very High,Device.Covered...Capacity_Very Low,Device.Covered...Capacity_Very Very High,Device.Covered...Capacity_Very Very Low,Device.Covered...Capacity_unspecified
0,N,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,Y,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,N,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,N,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,N,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [23]:
chi_cols = data.columns.drop('Churn.Ind')

## Chi Square Function
#### Create a function to calculate the chi square statistic and append p-value, X-statistic and corresponding feature in a dataframe and if p-value is less than 0.05 feature is important else not important

In [24]:
  def chisquare(colX,colY, alpha=0.05):
        global dfExpected,cols_chi,p,chi2,dof
        X = data[colX].astype(str)
        Y = data[colY].astype(str)
        
        
        dfObserved = pd.crosstab(Y,X) 
        chi2, p, dof, expected = stats.chi2_contingency(dfObserved.values)
        p = p
        chi2 = chi2
        dof = dof 
        dfExpected = pd.DataFrame([[colX,p,chi2]],columns = ['Feature','p','chi_square_value'])
        

In [25]:
df = pd.DataFrame(columns=['Feature','p','chi_square_value'])

In [26]:
for var in chi_cols:
    chisquare(var,"Churn.Ind" )
    df = df.append(dfExpected)

In [27]:
def importance_func(p_imp):
    if p_imp['p'] <= 0.05:
        val = 'Important'
    else:
        val = 'Not Important'
    return val

In [28]:
df['Importance'] = df.apply(importance_func, axis=1)

In [29]:
df

Unnamed: 0,Feature,p,chi_square_value,Importance
0,Country.of.subscription.purchase_Country1,9.401692e-07,24.046912,Important
0,Country.of.subscription.purchase_Country10,6.304584e-41,179.476803,Important
0,Country.of.subscription.purchase_Country11,5.046587e-01,0.445126,Not Important
0,Country.of.subscription.purchase_Country12,5.006252e-08,29.714363,Important
0,Country.of.subscription.purchase_Country13,8.834187e-01,0.021502,Not Important
0,Country.of.subscription.purchase_Country14,8.447095e-02,2.976707,Not Important
0,Country.of.subscription.purchase_Country15,5.376611e-03,7.748200,Important
0,Country.of.subscription.purchase_Country16,2.577929e-01,1.280571,Not Important
0,Country.of.subscription.purchase_Country17,9.834804e-03,6.664576,Important
0,Country.of.subscription.purchase_Country18,3.544022e-04,12.758462,Important


## Top Features by X-Statistic of the dataset
#### Based on the sorted output Purchase Channel 1, Subscription Mode - Product Carrier , Country 20, Product Line8 and Capacity Low are the top 5 features for Churn

In [30]:
dfsorted = df.sort_values('chi_square_value',ascending= False)
dfsorted

Unnamed: 0,Feature,p,chi_square_value,Importance
0,Purchase.Channel_Channel1,1.240920e-86,389.192167,Important
0,Subscription.Mode_SubscriptionProductCarrier,1.240920e-86,389.192167,Important
0,Country.of.subscription.purchase_Country20,3.229806e-61,272.504028,Important
0,Subscription.Mode_SubscriptionProduct8,5.694493e-54,239.262772,Important
0,Device.Covered...Capacity_Low,5.523545e-46,202.646990,Important
0,Country.of.subscription.purchase_Country10,6.304584e-41,179.476803,Important
0,Purchase.Channel_Channel2,1.318973e-33,145.968462,Important
0,Purchase.Channel_Channel9,7.684512e-30,128.751978,Important
0,Purchase.Channel_Channel15,1.662981e-28,122.650632,Important
0,Device.Covered...Capacity_Medium,1.522942e-26,113.690850,Important


In [31]:
#Get the top 15 features based on Chi Square test
new_features = dfsorted.nlargest(15,'chi_square_value')
features_values = new_features['Feature'].values.tolist()
features_values

['Purchase.Channel_Channel1',
 'Subscription.Mode_SubscriptionProductCarrier',
 'Country.of.subscription.purchase_Country20',
 'Subscription.Mode_SubscriptionProduct8',
 'Device.Covered...Capacity_Low',
 'Country.of.subscription.purchase_Country10',
 'Purchase.Channel_Channel2',
 'Purchase.Channel_Channel9',
 'Purchase.Channel_Channel15',
 'Device.Covered...Capacity_Medium',
 'Purchase.Channel_Channel13',
 'Country.of.subscription.purchase_Country4',
 'Device.Covered...Product.Line_ProductLine2',
 'Purchase.Channel_Channel12',
 'Purchase.Channel_Channel6']

## Modeling Section
#### Intial modeled using all 62 dummy variables and then processed with top 15 variables. Code run down is shown for top 15  variables. Algorithms performed are Logistic Regression, Support Vector Machines and Random Forest.
#### Performance Metrics Accuracy, precision, recall and f- measure are calculated
#### Intially the Features and response variables have been created and then dataset is split into test and train with 80-20%

In [32]:
# create X (features) and y (response)
X = data[features_values]
y = data['Churn.Ind']

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [35]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
y_pred_proba = logreg.predict_proba(X_test)[:, 1]



In [36]:
(classification_report(y_test, y_pred))

  'precision', 'predicted', average, warn_for)


'              precision    recall  f1-score   support\n\n           N       0.91      1.00      0.95     20030\n           Y       0.00      0.00      0.00      1970\n\n   micro avg       0.91      0.91      0.91     22000\n   macro avg       0.46      0.50      0.48     22000\nweighted avg       0.83      0.91      0.87     22000\n'

In [41]:
print(logreg.__class__.__name__+" accuracy is %2.3f" % accuracy_score(y_test, y_pred))
#print(logreg.__class__.__name__+" log_loss is %2.3f" % log_loss(y_test, y_pred))


LogisticRegression accuracy is 0.910


In [259]:
svclassifier = SVC(kernel='rbf')  
svclassifier.fit(X_train, y_train)  

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [260]:
y_pred = svclassifier.predict(X_test) 

In [36]:
def churn_func(chrun):
    if chrun['Churn.Ind'] == 'Y':
        val = 1
    else:
        val = 0
    return val

In [41]:
# create X (features) and y (response)
X1 = data_copy[features_values]
y1 = data_copy['Churn.Ind']

In [42]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.2, random_state=2)

In [43]:
from xgboost import XGBClassifier