# Putting everything together

Let's start by loading the dataset:

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Seed NumPy's global random number generator for this notebook.
np.random.seed(40)

# Read the churn table; the first CSV column becomes the row index.
data = pd.read_csv("churn.csv", index_col=0)

# Split the table into the feature columns and the 'Churn' target column.
X = data.drop(columns='Churn')
y = data['Churn']

# Show how many rows fall into each target class.
print(y.value_counts())

 False.    4293
 True.      707
Name: Churn, dtype: int64


## Pre-processing the data

In [2]:
def convert_and_remove_categorical_variables(X, to_convert, to_remove, max_categories=10):
    """Dummy-encode low-cardinality columns and drop unwanted columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. It is not modified; new frames are built instead.
    to_convert : collection of column labels
        Columns to replace by dummy (indicator) columns. A column is only
        converted when it has fewer than ``max_categories`` distinct values;
        otherwise it is left in the result unchanged.
    to_remove : collection of column labels
        Columns to drop. A label that also appears in ``to_convert`` is
        handled as a conversion candidate and is not dropped.
    max_categories : int, default 10
        Exclusive upper bound on the number of distinct values a column may
        have to be dummy-encoded.

    Returns
    -------
    pandas.DataFrame
        The remaining original columns in their original order, followed by
        the generated dummy columns (named ``<column>_<value>``, with the
        first level of each column dropped). When nothing is converted or
        removed, ``X`` itself is returned.
    """
    # Walk over a snapshot of the original labels: X is rebound inside the
    # loop, and the set of labels visited must not change along with it.
    for variable in list(X.columns):
        if variable in to_convert:
            if len(X[variable].unique()) < max_categories:
                dummies = pd.get_dummies(X[variable], prefix=variable, drop_first=True)
                # Append the indicators at the end, then discard the source column.
                X = pd.concat([X, dummies], axis=1).drop([variable], axis=1)
        elif variable in to_remove:
            X = X.drop([variable], axis=1)
    return X

In [3]:
# Columns to drop outright, and columns to turn into dummy indicators.
to_remove = ['Phone_Number']
to_convert = ['Area_Code', 'International_Plan', 'Voice_mail_Plan']

# Rebuild the feature matrix from the raw table (everything except the
# 'Churn' target) and apply the conversion/removal in a single step.
X = convert_and_remove_categorical_variables(
    data.drop(columns='Churn'),
    to_convert,
    to_remove,
)

In [4]:
from sklearn.preprocessing import LabelEncoder

# Encode the churn labels as integers. The encoder is fitted on the raw
# data['Churn'] column rather than on `y` itself, so re-running this cell
# always starts from the original labels instead of an already-encoded `y`,
# and `encoder.classes_` keeps the original label values.
encoder = LabelEncoder()
y = encoder.fit_transform(data['Churn'])

## Setting up the whole pipeline

Now, let's put everything together: write a function that again sets up cross-validation and that also applies feature selection and other helpful pre-processing or feature-generation techniques in its pipeline. More specifically, extend your code so that it handles not only different models but also the different selection techniques defined in the list below. As before, stratified shuffling should be an option, and an extra parameter (`select_best`) is used by the feature selection techniques that need it:

In [5]:
from sklearn.linear_model import LogisticRegression as LR
from sklearn.tree import DecisionTreeClassifier as DT
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.pipeline import make_pipeline

 
def evaluation_process(X_train, y_train, classifier, n_fold, selection_technique, select_best, oversampling):
    """Cross-validate ``classifier`` inside a normalise/resample/select pipeline.

    The pipeline steps are assembled in this fixed order:

    1. ``Normalizer()`` (always);
    2. ``SMOTE(random_state=42)`` when ``oversampling`` is true;
    3. an optional feature step chosen by ``selection_technique``;
    4. ``classifier``.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, handed unchanged to ``cross_validate``.
    classifier
        Estimator used as the final pipeline step.
    n_fold : int
        Number of splits of the (unshuffled) ``StratifiedKFold``.
    selection_technique : str
        ``'chi2'`` or ``'mutual_information'`` add a ``SelectKBest`` step with
        ``k=select_best``; ``'PCA'`` adds ``PCA()`` with default settings
        (``select_best`` is not used for it); any other value, e.g.
        ``'None'``, adds no feature step.
    select_best : int
        Number of features kept by the ``SelectKBest`` based techniques.
    oversampling : bool
        Whether to add the SMOTE step.

    Returns
    -------
    tuple
        ``(accuracy, precision, auroc)``: the averages over the folds of the
        ``'accuracy'``, ``'precision'`` and ``'roc_auc'`` test scores.

    Notes
    -----
    Each call installs a global "ignore" warnings filter and reseeds NumPy's
    global random number generator with 42.
    """
    import warnings

    from imblearn.over_sampling import SMOTE
    from imblearn.pipeline import Pipeline
    from sklearn.decomposition import PCA
    from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
    from sklearn.model_selection import StratifiedKFold, cross_validate
    from sklearn.preprocessing import Normalizer

    warnings.filterwarnings("ignore")
    # Reseed on every call so repeated evaluations start from the same state.
    np.random.seed(42)

    # Collect the steps first and build the pipeline once, instead of
    # appending to the `steps` list of an already constructed pipeline.
    steps = [('normalizer', Normalizer())]

    if oversampling:
        steps.append(('SMOTE', SMOTE(random_state=42)))

    if selection_technique == 'chi2':
        steps.append(('chi2', SelectKBest(chi2, k=select_best)))
    elif selection_technique == 'mutual_information':
        steps.append(('mutual_information', SelectKBest(mutual_info_classif, k=select_best)))
    elif selection_technique == 'PCA':
        # NOTE(review): PCA keeps its default number of components here;
        # `select_best` is not applied to it.
        steps.append(('PCA', PCA()))

    steps.append(('classifier', classifier))

    # imblearn's Pipeline (not scikit-learn's) is used so that the SMOTE
    # sampler can be one of the steps.
    pipeline = Pipeline(steps)

    outcomes = cross_validate(
        pipeline,
        X_train,
        y_train,
        scoring=['accuracy', 'precision', 'roc_auc'],
        cv=StratifiedKFold(n_splits=n_fold),
        return_train_score=False,
    )

    accuracy = np.average(outcomes['test_accuracy'])
    precision = np.average(outcomes['test_precision'])
    auroc = np.average(outcomes['test_roc_auc'])

    return accuracy, precision, auroc


## The evaluation

In [6]:
from sklearn.model_selection import train_test_split as tts
import warnings
# Silence all warnings for the remainder of the session.
warnings.filterwarnings("ignore")

# Seed NumPy's global RNG before the split below. No random_state is passed
# to tts, so the split presumably depends on this seed -- keep the order of
# the seed, the split and the evaluation call unchanged.
np.random.seed(42)
# Not used by the verification call at the bottom, which passes 10 explicitly.
select_best = 2

# Our training and test set:
# (30% of the rows are held out as the test set)
X_train, X_test, y_train, y_test = tts(X, y, test_size = 0.3)

# Our parameters:
# `names` lists one label per entry of `models`, in the same order.
models = [LR(), DT(), RF()]
names = ['LogReg','DecTree','RandomFor']
# 'None' is the string that evaluation_process treats as "no feature step".
selection_techniques = ['None','PCA','chi2','mutual_information']

# Result holders; nothing in this cell fills them.
best_mean = 0
acc = []
prec = []
auro = []

# Verify your result:
# 10-fold evaluation of a random forest, keeping the 10 best features by
# mutual information, without oversampling.
accuracy, precision, auroc = evaluation_process(X_train, y_train, RF(), 10, 'mutual_information', 10, False) 
print('Accuracy '+str(accuracy)+", precision: "+str(precision)+" AUC: "+str(auroc))

Accuracy 0.914, precision: 0.8413222662220161 AUC: 0.8660133333333334


In [7]:
###
### AUTOGRADER TEST - DO NOT REMOVE
###
