### Comparing Methods
Predict customer churn with:

- Logistic Regression
- Decision Tree
- Support Vector Machine
- K-Nearest Neighbors
- Neural Network (MLP)

Models are evaluated with 5-fold cross-validation with shuffling. The ID column is dropped and rows that have missing data are removed.
All models use their default parameters.

### Import Libraries
---

In [1]:
#Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Preparing & Exploring Data
---

In [3]:
data = pd.read_csv("../data/WA_Fn-UseC_-Telco-Customer-Churn.csv", sep=",") #Reading data
print("Shape of data = ",data.shape)


#Drop the "customerID" column (an identifier, not a feature)
data = data.drop(['customerID'], axis=1) 
print("Shape of data after drop column = ",   data.shape, "\n\n")

#To remove rows which have empty values, first convert cells holding a single space to NaN
data = data.replace(' ', np.nan)
print('Null Values: \n', data.isnull().any()) #Show which columns contain NaN values
print("----------------------\n")

#np.where returns one (row, column) pair per NaN cell, so a row with several
#empty cells appears several times; count distinct row positions to report rows.
empty_cells = np.where(pd.isnull(data)) 
rows_with_empty_cells = len(np.unique(empty_cells[0]))
print("Number of rows with empty cells: {}".format(rows_with_empty_cells),"\n")
data = data.dropna() #Remove every row that has at least one NaN
print("Shape of data after remove rows= ",   data.shape)

Shape of data =  (7043, 21)
Shape of data after drop column =  (7043, 20) 


Null Values: 
 gender              False
SeniorCitizen       False
Partner             False
Dependents          False
tenure              False
PhoneService        False
MultipleLines       False
InternetService     False
OnlineSecurity      False
OnlineBackup        False
DeviceProtection    False
TechSupport         False
StreamingTV         False
StreamingMovies     False
Contract            False
PaperlessBilling    False
PaymentMethod       False
MonthlyCharges      False
TotalCharges         True
Churn               False
dtype: bool
----------------------

Number of rows with empty cells: 11 

Shape of data after remove rows=  (7032, 20)


In [4]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

#Text-valued columns (the target 'Churn' included) that are replaced by integer codes
categorilcals = [
    'gender', 'Partner', 'Dependents',
    'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
    'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
    'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
    'Churn',
]

#A single encoder is re-fitted for every column, so once the loop ends it only
#holds the mapping of the last column processed.
labelencoder = LabelEncoder()
for column in categorilcals:
    data[column] = labelencoder.fit_transform(data[column])

#Features (X) are the first 19 columns, the target (y) is the last column
feature_frame = data.iloc[:, 0:19]
target_column = data.iloc[:, -1]
X = feature_frame.values.astype(float)
y = target_column.values.astype(float)

#Standardise the features
#NOTE(review): the scaler is fitted on every row before the later train/test
#split and cross-validation, so held-out rows contribute to the scaling statistics.
sc = StandardScaler()
sc.fit(X)
X = sc.transform(X)


In [5]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier


def problem3(model_selected, features, target, cv, random_state=None):
    """
    Cross-validate one default-parameter classifier and return its mean score.

    Input:

        model_selected : Key selecting which model is evaluated.
            List of 5 Possible Models:

                'lr'  = Logistic Regression
                'mlp' = MLP Classifier (Neural Network Model)
                'dtc' = Decision Tree Classifier
                'svc' = Linear Support Vector Machine
                'knn' = k-Nearest Neighbors

        features : Features
        target   : Target.(Labels)
        cv : cross-validation generator, passed to cross_val_score as `cv`
        random_state : Optional seed forwarded to the estimators that accept
            one ('lr', 'mlp', 'dtc', 'svc'); ignored for 'knn'. The default
            None leaves every estimator at its library default.

    Returns:

        results : Mean of the per-fold scores returned by cross_val_score

    Raises:

        NameError : if model_selected is not one of the five keys above.
    """

    # Factories are lambdas so that only the requested estimator is built.
    model_factories = {
        'lr':  lambda: LogisticRegression(random_state=random_state),
        'mlp': lambda: MLPClassifier(random_state=random_state),
        'dtc': lambda: DecisionTreeClassifier(random_state=random_state),
        'svc': lambda: LinearSVC(random_state=random_state),
        'knn': lambda: KNeighborsClassifier(),
    }

    if model_selected not in model_factories:
        raise NameError('invalid selected model')

    model = model_factories[model_selected]()

    # Imported locally so the function does not rely on a later notebook cell
    # having already imported cross_val_score into the global namespace.
    from sklearn.model_selection import cross_val_score

    results = cross_val_score(model, features, target, cv=cv)

    return results.mean()

In [6]:
#Split data as Training and Test
from sklearn.model_selection import train_test_split, cross_val_score

# A fixed random_state makes the 70/30 split (and therefore the reported table)
# reproducible; stratify=y keeps the class ratio of y the same in both parts.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3, random_state=0, stratify=y)

# 5-Fold with shuffling
from sklearn.model_selection import KFold
kfold = KFold(n_splits=5, shuffle=True, random_state=0)


#Call function and keep results
# NOTE: problem3 runs cross-validation on whatever data it receives, so each
# "test" value is a 5-fold CV score computed inside the held-out 30%, not the
# score of a model fitted on the training part.
scores = {}
for model_key in ('lr', 'dtc', 'svc', 'knn', 'mlp'):
    train_score = problem3(model_key, X_train, y_train, kfold)
    test_score = problem3(model_key, X_test, y_test, kfold)
    scores[model_key] = (train_score, test_score)

# Individual names are kept because the results table cell reads them.
train_lr, test_lr = scores['lr']
train_dtc, test_dtc = scores['dtc']
train_svc, test_svc = scores['svc']
train_knn, test_knn = scores['knn']
train_mlp, test_mlp = scores['mlp']

    



In [7]:
# Print one fixed-width row per model: name padded to 21 characters, then the
# two mean scores rounded to three decimals.
print("model                train   test")
print("-------------------------------------")

table_rows = [
    ("LogisticRegression", train_lr, test_lr),
    ("DecisionTree", train_dtc, test_dtc),
    ("LinearSVC", train_svc, test_svc),
    ("KNN", train_knn, test_knn),
    ("MLPClassifier", train_mlp, test_mlp),
]
for model_label, train_value, test_value in table_rows:
    print("{:<21}{:.3f}   {:.3f}   ".format(model_label, train_value, test_value))




model                train   test
-------------------------------------
LogisticRegression   0.800   0.808   
DecisionTree         0.729   0.739   
LinearSVC            0.799   0.802   
KNN                  0.754   0.746   
MLPClassifier        0.780   0.785   
