In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns;
sns.set_style('darkgrid')
from collections import Counter

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

import warnings
warnings.simplefilter('ignore')


#from IPython.core.display import display, HTML
#display(HTML("<style>.container { width:95% !important; }</style>"))

# Modeling
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import RFE

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_curve

from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [5]:
!pwd

/home/jhon/Documents/Personal_Plan/Project_3_ChurnRatePrediction/projectChurnRate


# 1. Dataset 
Dataset source: https://github.com/Paritoshyadav/Minimizing-Churn-Rate-Through-Analysis-of-Financial-Habits



## Dataset:

In [4]:
# Loading the data

dfChurnBank = pd.read_csv('../projectChurnRate/Data/churn_data.csv', index_col=0)

print("Size of the dataset:  %d" % dfChurnBank.shape[0])
print("Number of variables: %d" % dfChurnBank.shape[1])

if dfChurnBank.index.is_unique:
    print('Indexes are unique.')
else:
    print('There are duplicated indexes.')
    
dfChurnBank.head()

Size of the dataset:  27000
Number of variables: 30
There are duplicated indexes.


Unnamed: 0_level_0,churn,age,housing,credit_score,deposits,withdrawal,purchases_partners,purchases,cc_taken,cc_recommended,...,waiting_4_loan,cancelled_loan,received_loan,rejected_loan,zodiac_sign,left_for_two_month_plus,left_for_one_month,rewards_earned,reward_rate,is_referred
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
55409,0,37.0,na,,0,0,0,0,0,0,...,0,0,0,0,Leo,1,0,,0.0,0
23547,0,28.0,R,486.0,0,0,1,0,0,96,...,0,0,0,0,Leo,0,0,44.0,1.47,1
58313,0,35.0,R,561.0,47,2,86,47,0,285,...,0,0,0,0,Capricorn,1,0,65.0,2.17,0
8095,0,26.0,R,567.0,26,3,38,25,0,74,...,0,0,0,0,Capricorn,0,0,33.0,1.1,1
61353,1,27.0,na,,0,0,2,0,0,0,...,0,0,0,0,Aries,1,0,1.0,0.03,0


In [5]:
# Split dataset for modeling and predicting

trainLen = int(len(dfChurnBank)*0.8)
dfChurnBank = dfChurnBank.sample(frac=1, random_state=0)
df = dfChurnBank.iloc[:trainLen].copy()
df_pred = dfChurnBank.iloc[trainLen:].copy()
print('Size of dataset for modeling is {}'.format(len(df)))
print('Size of dataset for prediction is {}'.format(len(df_pred)))

Size of dataset for modeling is 21600
Size of dataset for prediction is 5400


# 3. Pre-processing

1. Remove Null and Nan values
2. Remove duplicated indexes
3. Remove app_web_user, deposits, ios_user, cc_recommended, cancelled_loan', 'received_loan', 'rejected_loan' columns
4. Update numerical and categorical values for later modeling
5. For Categical variables, Convert categorical variable into dummy/indicator variables for categorical values to remove 'na' values
6. For Numerical variables, Remove outliers
7. Create pre-process pipeline function for next preprocesing for test data.


In [11]:
print("Size of the dataset:  %d" % df.shape[0])
print("Number of variables: %d" % df.shape[1])
df.head()

Size of the dataset:  21600
Number of variables: 30


Unnamed: 0_level_0,churn,age,housing,credit_score,deposits,withdrawal,purchases_partners,purchases,cc_taken,cc_recommended,...,waiting_4_loan,cancelled_loan,received_loan,rejected_loan,zodiac_sign,left_for_two_month_plus,left_for_one_month,rewards_earned,reward_rate,is_referred
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
59394,1,21.0,na,,0,0,0,0,0,0,...,0,0,0,0,Aquarius,0,0,,0.0,0
50931,1,28.0,na,449.0,0,0,0,0,0,129,...,0,0,0,0,Aquarius,0,0,31.0,1.03,0
28738,0,27.0,R,513.0,58,8,65,58,0,294,...,0,0,0,0,Scorpio,0,1,66.0,2.2,1
55968,0,44.0,na,551.0,0,0,0,0,0,0,...,0,0,0,0,Aquarius,0,0,,0.0,0
50686,1,39.0,O,510.0,0,0,0,0,0,35,...,0,0,0,0,Capricorn,0,0,12.0,0.4,1


We create a pipeline_preprocess list in order to store all pre-process functions in order to apply later to the prediction dataset.

In [6]:
# Pipeline for pre-preocessing
pipeline_preprocess = []


## 1. Remove Null and Nan values

In [12]:
null_finder = df.isnull().sum()
print(" ***** Number of Null Values by row: ***** ")
null_finder.where(null_finder > 0).dropna()

 ***** Number of Null Values by row: ***** 


age                  4.0
credit_score      6446.0
rewards_earned    2588.0
dtype: float64

In [13]:
def dropnull(df):
    print("Removing columns credit_score and rewards_earned ...(1)")
    df = df.drop(columns=['credit_score','rewards_earned'])
    print("Drop null values from age column ...(2)")
    df = df[pd.notnull(df['age'])]
    return df

df = dropnull(df)

null_finder = df.isnull().sum()
print(" ***** Number of Null Values by row: ***** ")
null_finder.where(null_finder > 0).dropna()


Removing columns credit_score and rewards_earned ...(1)
Drop null values from age column ...(2)
 ***** Number of Null Values by row: ***** 


Series([], dtype: float64)

In [14]:
print("Size of the dataset:  %d" % df.shape[0])
print("Number of variables: %d" % df.shape[1])
df.head(10)

Size of the dataset:  21596
Number of variables: 28


Unnamed: 0_level_0,churn,age,housing,deposits,withdrawal,purchases_partners,purchases,cc_taken,cc_recommended,cc_disliked,...,payment_type,waiting_4_loan,cancelled_loan,received_loan,rejected_loan,zodiac_sign,left_for_two_month_plus,left_for_one_month,reward_rate,is_referred
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
59394,1,21.0,na,0,0,0,0,0,0,0,...,Bi-Weekly,0,0,0,0,Aquarius,0,0,0.0,0
50931,1,28.0,na,0,0,0,0,0,129,1,...,Weekly,0,0,0,0,Aquarius,0,0,1.03,0
28738,0,27.0,R,58,8,65,58,0,294,0,...,na,0,0,0,0,Scorpio,0,1,2.2,1
55968,0,44.0,na,0,0,0,0,0,0,0,...,Monthly,0,0,0,0,Aquarius,0,0,0.0,0
50686,1,39.0,O,0,0,0,0,0,35,0,...,Bi-Weekly,0,0,0,0,Capricorn,0,0,0.4,1
36115,1,27.0,na,0,0,0,0,0,0,0,...,Monthly,0,0,0,0,Pisces,1,0,0.0,0
19748,1,30.0,R,0,0,1,0,1,255,0,...,Bi-Weekly,0,0,0,0,Gemini,1,0,1.73,0
46479,0,41.0,O,1,1,19,1,0,49,0,...,Bi-Weekly,0,0,0,0,Scorpio,0,0,0.6,0
15946,0,32.0,R,15,4,213,15,0,283,0,...,Bi-Weekly,0,0,0,0,Cancer,0,0,2.23,0
14304,1,23.0,na,0,0,0,0,0,204,0,...,Semi-Monthly,0,0,0,0,Virgo,0,0,2.57,0


In [15]:
# Adding to pre-preocessing pipeline:

pipeline_preprocess.append(dropnull)

## 2. Remove duplicated indexes

In [12]:
if df.index.is_unique:
    print('Indexes are unique.')
else:
    print('There are duplicated indexes.')

There are duplicated indexes.


In [16]:
# This function will be agregated to the pre-precessing pipeline:
def dropduplicated(df):
    if df.index.is_unique:
        print('Indexes are unique.')
        return df
    else:
        print('There are duplicated indexes....So removing duplicated indexes ...(3)')
        return df[~df.index.duplicated(keep='first')]

df = dropduplicated(df)


if df.index.is_unique:
    print('Indexes are unique.')
else:
    print('There are duplicated indexes.')

print("Size of the dataset:  %d" % df.shape[0])
print("Number of variables: %d" % df.shape[1])
df.head(10)

There are duplicated indexes....So removing duplicated indexes ...(3)
Indexes are unique.
Size of the dataset:  20105
Number of variables: 28


Unnamed: 0_level_0,churn,age,housing,deposits,withdrawal,purchases_partners,purchases,cc_taken,cc_recommended,cc_disliked,...,payment_type,waiting_4_loan,cancelled_loan,received_loan,rejected_loan,zodiac_sign,left_for_two_month_plus,left_for_one_month,reward_rate,is_referred
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
59394,1,21.0,na,0,0,0,0,0,0,0,...,Bi-Weekly,0,0,0,0,Aquarius,0,0,0.0,0
50931,1,28.0,na,0,0,0,0,0,129,1,...,Weekly,0,0,0,0,Aquarius,0,0,1.03,0
28738,0,27.0,R,58,8,65,58,0,294,0,...,na,0,0,0,0,Scorpio,0,1,2.2,1
55968,0,44.0,na,0,0,0,0,0,0,0,...,Monthly,0,0,0,0,Aquarius,0,0,0.0,0
50686,1,39.0,O,0,0,0,0,0,35,0,...,Bi-Weekly,0,0,0,0,Capricorn,0,0,0.4,1
36115,1,27.0,na,0,0,0,0,0,0,0,...,Monthly,0,0,0,0,Pisces,1,0,0.0,0
19748,1,30.0,R,0,0,1,0,1,255,0,...,Bi-Weekly,0,0,0,0,Gemini,1,0,1.73,0
46479,0,41.0,O,1,1,19,1,0,49,0,...,Bi-Weekly,0,0,0,0,Scorpio,0,0,0.6,0
15946,0,32.0,R,15,4,213,15,0,283,0,...,Bi-Weekly,0,0,0,0,Cancer,0,0,2.23,0
14304,1,23.0,na,0,0,0,0,0,204,0,...,Semi-Monthly,0,0,0,0,Virgo,0,0,2.57,0


In [17]:
# Adding to pre-preocessing pipeline:

pipeline_preprocess.append(dropduplicated)

## 3. Remove app_web_user, deposit, ios_user, cc_recommended, cancelled_loan', 'received_loan', 'rejected_loan' columns

In [18]:

def dropcolumns(df):
    print("Drop app_web_user, deposit, ios_user, cc_recommended, cancelled_loan', 'received_loan', 'rejected_loan' columns ...(4)")
    df = df.drop(columns=['app_web_user','deposits', 'ios_user','cc_recommended', 'cancelled_loan', 'received_loan', 'rejected_loan'])
    return df
df = dropcolumns(df)

print("Size of the dataset:  %d" % df.shape[0])
print("Number of variables: %d" % df.shape[1])
df.head(10)


Drop app_web_user, deposit, ios_user, cc_recommended, cancelled_loan', 'received_loan', 'rejected_loan' columns ...(4)
Size of the dataset:  20105
Number of variables: 21


Unnamed: 0_level_0,churn,age,housing,withdrawal,purchases_partners,purchases,cc_taken,cc_disliked,cc_liked,cc_application_begin,...,web_user,android_user,registered_phones,payment_type,waiting_4_loan,zodiac_sign,left_for_two_month_plus,left_for_one_month,reward_rate,is_referred
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
59394,1,21.0,na,0,0,0,0,0,0,0,...,0,1,0,Bi-Weekly,0,Aquarius,0,0,0.0,0
50931,1,28.0,na,0,0,0,0,1,0,8,...,1,1,2,Weekly,0,Aquarius,0,0,1.03,0
28738,0,27.0,R,8,65,58,0,0,0,29,...,0,0,2,na,0,Scorpio,0,1,2.2,1
55968,0,44.0,na,0,0,0,0,0,0,0,...,1,1,0,Monthly,0,Aquarius,0,0,0.0,0
50686,1,39.0,O,0,0,0,0,0,0,6,...,1,0,0,Bi-Weekly,0,Capricorn,0,0,0.4,1
36115,1,27.0,na,0,0,0,0,0,0,0,...,1,0,0,Monthly,0,Pisces,1,0,0.0,0
19748,1,30.0,R,0,1,0,1,0,0,27,...,1,1,0,Bi-Weekly,0,Gemini,1,0,1.73,0
46479,0,41.0,O,1,19,1,0,0,0,1,...,0,1,2,Bi-Weekly,0,Scorpio,0,0,0.6,0
15946,0,32.0,R,4,213,15,0,0,0,24,...,1,0,0,Bi-Weekly,0,Cancer,0,0,2.23,0
14304,1,23.0,na,0,0,0,0,0,0,14,...,0,1,0,Semi-Monthly,0,Virgo,0,0,2.57,0


In [19]:
# Adding to pre-preocessing pipeline:

pipeline_preprocess.append(dropcolumns)

## 4. Update numerical and categorical values for later modeling 

In [20]:
# Update numerical and categorical values for later modeling

cat_features = df.select_dtypes(exclude = np.number).columns
num_features = df.select_dtypes(include = np.number).columns
print( "Quantity of Categorical features: ", len(cat_features),"\nCategorical features: ", cat_features)
print( "\nQuantity of Numerical features: ", len(num_features),"\nNumerical features: ", num_features)

Quantity of Categorical features:  3 
Categorical features:  Index(['housing', 'payment_type', 'zodiac_sign'], dtype='object')

Quantity of Numerical features:  18 
Numerical features:  Index(['churn', 'age', 'withdrawal', 'purchases_partners', 'purchases',
       'cc_taken', 'cc_disliked', 'cc_liked', 'cc_application_begin',
       'app_downloaded', 'web_user', 'android_user', 'registered_phones',
       'waiting_4_loan', 'left_for_two_month_plus', 'left_for_one_month',
       'reward_rate', 'is_referred'],
      dtype='object')


## 5. Convert categorical variable into dummy/indicator variables for categorical values to remove 'na' values

In [21]:
def removedummy(df):
    print("Convert categorical values into numbers ...(5)")
    df_new = pd.get_dummies(df)
    print("Remove columns housing_na, zodiac_sign_na, payment_type_na ...(6)")
    df_new = df_new.drop(columns = ['housing_na', 'zodiac_sign_na', 'payment_type_na'])
    return df_new

df = removedummy(df)

print("Size of the dataset:  %d" % df.shape[0])
print("Number of variables: %d" % df.shape[1])
df.head(10)

Convert categorical values into numbers ...(5)
Remove columns housing_na, zodiac_sign_na, payment_type_na ...(6)
Size of the dataset:  20105
Number of variables: 36


Unnamed: 0_level_0,churn,age,withdrawal,purchases_partners,purchases,cc_taken,cc_disliked,cc_liked,cc_application_begin,app_downloaded,...,zodiac_sign_Cancer,zodiac_sign_Capricorn,zodiac_sign_Gemini,zodiac_sign_Leo,zodiac_sign_Libra,zodiac_sign_Pisces,zodiac_sign_Sagittarius,zodiac_sign_Scorpio,zodiac_sign_Taurus,zodiac_sign_Virgo
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
59394,1,21.0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
50931,1,28.0,0,0,0,0,1,0,8,1,...,0,0,0,0,0,0,0,0,0,0
28738,0,27.0,8,65,58,0,0,0,29,1,...,0,0,0,0,0,0,0,1,0,0
55968,0,44.0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
50686,1,39.0,0,0,0,0,0,0,6,1,...,0,1,0,0,0,0,0,0,0,0
36115,1,27.0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
19748,1,30.0,0,1,0,1,0,0,27,1,...,0,0,1,0,0,0,0,0,0,0
46479,0,41.0,1,19,1,0,0,0,1,1,...,0,0,0,0,0,0,0,1,0,0
15946,0,32.0,4,213,15,0,0,0,24,1,...,1,0,0,0,0,0,0,0,0,0
14304,1,23.0,0,0,0,0,0,0,14,1,...,0,0,0,0,0,0,0,0,0,1


In [22]:
# Adding to pre-preocessing pipeline:

pipeline_preprocess.append(removedummy)


## 5. Remove outliers

In [23]:
# Removing outliers
def remove_outlier(X):
    """
    X: dataframe
    """
    #X = df_new.iloc[:, :-1]
    print("Removing outliers ...(7)")
    for i in range(len(X.columns)):
        first_q = np.percentile(X[X.columns[i]], 25)
        third_q = np.percentile(X[X.columns[i]], 75) 
        IQR = 1.5*(third_q - first_q)
        minimum = first_q - IQR 
        maximum = third_q + IQR
    
        median = X[X.columns[i]].median()
    
        X.loc[X[X.columns[i]] < minimum, X.columns[i]] = median 
        X.loc[X[X.columns[i]] > maximum, X.columns[i]] = median
    return X

In [23]:
detect_outlier(df)

age There is Outlier
withdrawal There is Outlier
purchases_partners There is Outlier
purchases There is Outlier


In [24]:
df = remove_outlier(df)

print("Size of the dataset:  %d" % df.shape[0])
print("Number of variables: %d" % df.shape[1])
df.head(10)

Removing outliers ...(7)
Size of the dataset:  20105
Number of variables: 36


Unnamed: 0_level_0,churn,age,withdrawal,purchases_partners,purchases,cc_taken,cc_disliked,cc_liked,cc_application_begin,app_downloaded,...,zodiac_sign_Cancer,zodiac_sign_Capricorn,zodiac_sign_Gemini,zodiac_sign_Leo,zodiac_sign_Libra,zodiac_sign_Pisces,zodiac_sign_Sagittarius,zodiac_sign_Scorpio,zodiac_sign_Taurus,zodiac_sign_Virgo
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
59394,1,21.0,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50931,1,28.0,0,0,0,0,0,0,8,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28738,0,27.0,0,65,0,0,0,0,4,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
55968,0,44.0,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50686,1,39.0,0,0,0,0,0,0,6,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
36115,1,27.0,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19748,1,30.0,0,1,0,0,0,0,27,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
46479,0,41.0,0,19,1,0,0,0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15946,0,32.0,0,8,0,0,0,0,24,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14304,1,23.0,0,0,0,0,0,0,14,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# Adding to pre-preocessing pipeline:

pipeline_preprocess.append(remove_outlier)

## 7. Create pre-processing pipeline

In [27]:
pipeline_preprocess.remove(dropnull)

In [28]:
print("\nSteps for pre-processing: ")
for step, function in enumerate(pipeline_preprocess):
    print("\t {:d}: {:s}".format(step, function.__name__))


Steps for pre-processing: 
	 0: dropnull
	 1: dropduplicated
	 2: dropcolumns
	 3: removedummy
	 4: remove_outlier


In [30]:
# Definition of preprocess_data for an specific dataset:

def preprocess_data_pipeline(df, pipeline_preprocess):
    for step, function in enumerate(pipeline_preprocess):
        df = function(df)
    print("Size of the dataset:  %d" % df.shape[0])
    print("Number of variables: %d" % df.shape[1])
    df.head(10)
    return df

Testing the preprocess_data_pipeline function for new data:

In [32]:
df_aux = pd.read_csv('../projectChurnRate/Data/churn_data.csv', index_col=0).sample(n=2100, random_state=0)
preprocess_data_pipeline(df_aux, pipeline_preprocess)

There are duplicated indexes....So removing duplicated indexes ...(3)
Drop app_web_user, deposit, ios_user, cc_recommended, cancelled_loan', 'received_loan', 'rejected_loan' columns ...(4)
Convert categorical values into numbers ...(5)
Remove columns housing_na, zodiac_sign_na, payment_type_na ...(6)
Removing outliers ...(7)
Size of the dataset:  2086
Number of variables: 36


Unnamed: 0_level_0,churn,age,withdrawal,purchases_partners,purchases,cc_taken,cc_disliked,cc_liked,cc_application_begin,app_downloaded,...,zodiac_sign_Cancer,zodiac_sign_Capricorn,zodiac_sign_Gemini,zodiac_sign_Leo,zodiac_sign_Libra,zodiac_sign_Pisces,zodiac_sign_Sagittarius,zodiac_sign_Scorpio,zodiac_sign_Taurus,zodiac_sign_Virgo
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
50488,0,20.0,0,29,0,0,0,0,8,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53603,0,38.0,0,28,0,0,0,0,5,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42289,1,40.0,0,9,0,0,0,0,4,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4185,0,34.0,0,0,0,0,0,0,4,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12436,1,24.0,0,38,0,0,0,0,7,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23350,1,22.0,0,31,0,0,0,0,7,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53737,0,43.0,0,23,1,0,0,0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
56724,1,40.0,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17737,1,23.0,0,8,0,0,0,0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 4. Modeling

1. Separate dataset for X and y classes. 
2. Split dataset for training and testing.
3. Pipeline for transformation
    - Num-Val: Normalize data for training
    - Cat-Val: One hot encoding
4. Find the best parameters of the model
    - Use linear model
    - Use a tool
5. Modeling for finding the best:
    - GridSearchCV for finding the best parameters of each model
    - Model Calibration using best parameters



In [33]:
print("Size of the dataset:  %d" % df.shape[0])
print("Number of variables: %d" % df.shape[1])
df.head(10)

Size of the dataset:  20105
Number of variables: 36


Unnamed: 0_level_0,churn,age,withdrawal,purchases_partners,purchases,cc_taken,cc_disliked,cc_liked,cc_application_begin,app_downloaded,...,zodiac_sign_Cancer,zodiac_sign_Capricorn,zodiac_sign_Gemini,zodiac_sign_Leo,zodiac_sign_Libra,zodiac_sign_Pisces,zodiac_sign_Sagittarius,zodiac_sign_Scorpio,zodiac_sign_Taurus,zodiac_sign_Virgo
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
59394,1,21.0,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50931,1,28.0,0,0,0,0,0,0,8,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28738,0,27.0,0,65,0,0,0,0,4,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
55968,0,44.0,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50686,1,39.0,0,0,0,0,0,0,6,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
36115,1,27.0,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19748,1,30.0,0,1,0,0,0,0,27,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
46479,0,41.0,0,19,1,0,0,0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15946,0,32.0,0,8,0,0,0,0,24,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14304,1,23.0,0,0,0,0,0,0,14,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
X = df.drop('churn', axis = 1)
y = df['churn']
print("X shape: {}".format(X.shape))
print("y shape: {}".format(y.shape))


X shape: (20105, 35)
y shape: (20105,)


In [35]:
x_train, x_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=10)

In [36]:
num_features = ['age', 'withdrawal', 'purchases_partners', 'purchases',
       'cc_taken', 'cc_disliked', 'cc_liked', 'cc_application_begin',
       'app_downloaded', 'web_user', 'android_user', 'registered_phones',
       'waiting_4_loan', 'left_for_two_month_plus', 'left_for_one_month',
       'reward_rate', 'is_referred']

In [37]:
#
num_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')), ('scaler', StandardScaler())])

#

churn_trans = ColumnTransformer( transformers=[  ('num', num_transformer, num_features) ])


In [38]:

logr_model = LogisticRegression();

#

churn_pipe_logr = Pipeline(steps = [('prep', churn_trans), ('clas',logr_model)])


In [39]:
parameters = {}
parameters['prep__num__imputer__strategy'] = ['mean', 'median']
parameters['clas__C'] = [10e-3, 10e-2, 10e-1, 1, 10, 100, 1000]
parameters['clas__class_weight'] = [None, 'balanced']

#
# COMPLETAR
#	

GS = GridSearchCV(churn_pipe_logr, parameters , scoring = ['accuracy', 'precision', 'recall'], refit='precision', cv = 5)
GS.fit(x_train, y_train)

    

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('prep',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer()),
                                                                                         ('scaler',
                                                                                          StandardScaler())]),
                                                                         ['age',
                                                                          'withdrawal',
                                                                          'purchases_partners',
                                                                          'purchases',
                                                                          'cc_tak

In [40]:

print("Mejor score: ", GS.best_score_)
print("Mejor configuración de parámetros: ", GS.best_params_)

churn_pipe_logr = GS.best_estimator_

Mejor score:  0.5946739167570726
Mejor configuración de parámetros:  {'clas__C': 0.01, 'clas__class_weight': None, 'prep__num__imputer__strategy': 'mean'}
