In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df_main=pd.read_csv('Churn_Modelling.csv')
df_main.set_index('CustomerId',inplace=True)
df_main.head()

Unnamed: 0_level_0,RowNumber,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
CustomerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
15634602,1,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
15647311,2,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
15619304,3,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
15701354,4,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
15737888,5,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
df_main=df_main.drop(['RowNumber','Surname'],axis=1)

### Train - Test Split

In [4]:
from sklearn.model_selection import train_test_split as tts
train, test = tts(df_main,test_size=0.2)

In [5]:
train['Exited'].value_counts(normalize=True)*100

0    79.7875
1    20.2125
Name: Exited, dtype: float64

In [6]:
test['Exited'].value_counts(normalize=True)*100

0    79.0
1    21.0
Name: Exited, dtype: float64

### Data Processing
    1) Assign Gender Null Values - '?'
    2) Label Encode - Gender, If Gender had null values: change 0 ('?') with np.nan
    3) Seperate the train into train1 - test1, train2 - test2, train3 - test3, train4 - test4 based on France, 
       Germany, Spain, NotKnown
    4) Null Values Impute - Itimp1, Itimp2, Itimp3, Itimp4
    5) Power Transform - Pt1, Pt2, Pt3, pt4
    

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.preprocessing import LabelEncoder, PowerTransformer

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, roc_auc_score, roc_curve

In [8]:
class data_prep_1(BaseEstimator, TransformerMixin):
    def __init__(self,fill_value='?'):
        self.fill_value = fill_value
    def fit(self,df):
        return self
    def transform(self,df):
        df_gender_null_counter = df['Gender'].isnull().sum()
        if df_gender_null_counter>0:
            df_isgender_null = True
            df['Gender'].fillna(value = self.fill_value)
        return (df,df_gender_null_counter)

In [9]:
class data_prep_2(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self,df,df_gender_null_counter):
        return self
    def transform(self,df,df_gender_null_counter=0):
        le = LabelEncoder()
        df.loc[:,'Gender'] = le.fit_transform(df['Gender'])
        if df_gender_null_counter>0:
            df.loc[:,'Gender'] = df['Gender'].replace({0:np.nan})
        return df

In [10]:
class data_prep_3(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        pass
    
    def fit(self,X=None):
        return self
    
    def transform(self,train,X=None):
        
        trainF = train[train['Geography']=='France']
        

        trainG = train[train['Geography']=='Germany']
        

        trainS = train[train['Geography']=='Spain']
        
        trainNK = train.copy()
        
        trainF.drop(columns=['Geography'], inplace=True)
        trainG.drop(columns=['Geography'], inplace=True)
        trainS.drop(columns=['Geography'], inplace=True)
        trainNK.drop(columns=['Geography'], inplace=True)

        
        return (trainF,trainG,trainS,trainNK)

In [11]:
class data_prep_4(BaseEstimator, TransformerMixin):
    
    def __init__(self,num_bool):
        self.num_bool=num_bool
        
    
    def fit(self,X,y=None):
        if self.num_bool==True:
            self.est = RandomForestRegressor()
            self.itimp = IterativeImputer(self.est)
            self.cols = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
                          'EstimatedSalary']
            
        else:
            self.est = RandomForestClassifier()
            self.itimp = IterativeImputer(self.est)
            self.cols =  ['Gender','HasCrCard','IsActiveMember']
            
        self.itimp.fit(X[self.cols])
        
        
        return self
        
    def transform(self,X,y=None):
        if X.shape[0] > 0:
            X.loc[:, self.cols] = self.itimp.transform(X[self.cols])
           
            return X
        else:
            return X

In [12]:
class data_prep_5(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        pass
    
    def fit(self,df):
        self.cols = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
                          'EstimatedSalary']  
        self.pt = PowerTransformer()
        self.pt.fit(df[self.cols])
        return self
        
    def transform(self,df):
        if df.shape[0] > 0:
            df.loc[:,self.cols] = self.pt.transform(df[self.cols])
            return df

In [13]:
class Combined_Data_Prep(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        self.test_bool = False
        self.country = str
        self.train = train.copy()
        self.test = test.copy()
        self.d1 = data_prep_1()
        self.d2 = data_prep_2()
        self.d3 = data_prep_3()
        
    
        self.d41F = data_prep_4(num_bool=True)
        self.d42F = data_prep_4(num_bool=False)
        
        self.d41G = data_prep_4(num_bool=True)
        self.d42G = data_prep_4(num_bool=False)
        
        self.d41S = data_prep_4(num_bool=True)
        self.d42S = data_prep_4(num_bool=False)
        
        self.d41NK = data_prep_4(num_bool=True)
        self.d42NK = data_prep_4(num_bool=False)
        
        self.d5F = data_prep_5()
        self.d5G = data_prep_5()
        self.d5S = data_prep_5()
        self.d5NK = data_prep_5()
        
    def fit(self,X,y=None):
        return self
        
        
    def transform(self,X,y=None):
        
        if self.test_bool==False:
            
            train,b = self.d1.fit_transform(X)
            
            train = self.d2.fit_transform(train,df_gender_null_counter=b)

            trainF,trainG,trainS,trainNK= self.d3.transform(train)

            trainF = self.d41F.fit_transform(trainF)        
            trainF = self.d42F.fit_transform(trainF)

            trainG = self.d41G.fit_transform(trainG)
            trainG = self.d42G.fit_transform(trainG)

            trainS = self.d41S.fit_transform(trainS)
            trainS = self.d42S.fit_transform(trainS)

            trainNK = self.d41NK.fit_transform(trainNK)
            trainNK = self.d42NK.fit_transform(trainNK)
            
            trainF = self.d5F.fit_transform(trainF)
            trainG = self.d5G.fit_transform(trainG)
            trainS = self.d5S.fit_transform(trainS)
            trainNK = self.d5NK.fit_transform(trainNK)

            return (trainF,trainG,trainS,trainNK)

        else:
            
            dptest,c = self.d1.fit_transform(X)
            
            dptest = self.d2.fit_transform(dptest,df_gender_null_counter=c)

            dptest.drop(columns=['Geography'],inplace=True)
            if self.country=='France':
                dptest = self.d41F.transform(dptest)
                dptest = self.d42F.transform(dptest)
                dptest = self.d5F.transform(dptest)
                return dptest
                
            elif self.country=='Germany':
                dptest = self.d41G.transform(dptest)
                dptest = self.d42G.transform(dptest)
                dptest = self.d5G.transform(dptest)
                return dptest

            elif self.country=='Spain':
                dptest = self.d41S.transform(dptest)
                dptest = self.d42S.transform(dptest)
                dptest = self.d5S.transform(dptest)
                return dptest

            elif self.country=='NK':
                dptest = self.d41NK.transform(dptest)
                dptest = self.d42NK.transform(dptest)
                dptest = self.d5NK.transform(dptest)
                return dptest

In [14]:
class model_build(BaseEstimator, TransformerMixin):
    
    def __init__(self,country):
        self.country = country
        if country=='France':
            self.params = {'num_leaves': 5,
                         'n_estimators': 138,
                         'min_child_samples': 60,
                         'max_depth': 3,
                         'learning_rate': 0.03}
        
        elif country=='Germany':
            self.params = {'num_leaves': 11,
                         'n_estimators': 185,
                         'min_child_samples': 55,
                         'max_depth': 2,
                         'learning_rate': 0.05}
        
        elif country=='Spain':
            self.params = {'num_leaves': 10,
                         'n_estimators': 121,
                         'min_child_samples': 30,
                         'max_depth': 14,
                         'learning_rate': 0.03}
        
        elif country=='NK':
            self.params = {'num_leaves': 38,
                         'n_estimators': 113,
                         'min_child_samples': 72,
                         'max_depth': 3,
                         'learning_rate': 0.09}
        else:
            print('Wrong Country!!')
        
        self.lgbc = LGBMClassifier(**self.params)
        
    def fit(self,X,y):
        self.lgbc.fit(X,y)
        return self
        
        
    def predict(self,X):
        if X.shape[0] > 0:
            ypred = self.lgbc.predict(X)
            
            return ypred
    
    def predict_proba(self,X):
        if X.shape[0]>0:
            yprob = self.lgbc.predict_proba(X)
            return yprob

In [15]:
data_prep = Combined_Data_Prep()
data_prep.test_bool = False
trainF,trainG,trainS,trainNK= data_prep.fit_transform(X=train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats

In [16]:
deployed_test = pd.DataFrame([[705,'France','Male',28,4,np.nan,np.nan,0,1,140000]], columns=train.columns.drop('Exited'))

In [17]:
deployed_test

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,705,France,Male,28,4,,,0,1,140000


In [18]:
data_prep.test_bool = True
data_prep.country='France'
deployed_test = data_prep.fit_transform(X=deployed_test)

In [19]:
deployed_test

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,0.55713,0.0,-1.131545,-0.275808,0.677315,-0.135467,0.0,1.0,0.728783


In [20]:
ytrainF = trainF['Exited']
xtrainF = trainF.drop(columns=['Exited'])


m_country = model_build(country='France')
m_country.fit(xtrainF,ytrainF)

model_build(country='France')

In [21]:
m_country.predict(deployed_test), m_country.predict_proba(deployed_test)[:,1]

(array([0], dtype=int64), array([0.11930657]))

In [22]:
df_main=pd.read_csv('Churn_Modelling.csv')
df_main.set_index('CustomerId',inplace=True)
df_main=df_main.drop(['RowNumber','Surname'],axis=1)
df_main.head()

Unnamed: 0_level_0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
CustomerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
15634602,619,France,Female,42,2,0.0,1,1,1,101348.88,1
15647311,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
15619304,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
15701354,699,France,Female,39,1,0.0,2,0,0,93826.63,0
15737888,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [23]:
data_prep = Combined_Data_Prep()
data_prep.test_bool = False
trainF,trainG,trainS,trainNK= data_prep.fit_transform(X=df_main)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See t

In [24]:
import pickle
file = open('data_processing_custom_transformer.pkl','wb')
pickle.dump(data_prep,file)
file.close()

In [25]:
train.columns

Index(['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
       'Exited'],
      dtype='object')

In [26]:
modelF = model_build('France')
modelG = model_build('Germany')
modelS = model_build('Spain')
modelNK = model_build('NK')

In [27]:
ytrainF = trainF['Exited']
xtrainF = trainF.drop(columns=['Exited'])

ytrainG = trainG['Exited']
xtrainG = trainG.drop(columns=['Exited'])

ytrainS = trainS['Exited']
xtrainS = trainS.drop(columns=['Exited'])

ytrainNK = trainNK['Exited']
xtrainNK = trainNK.drop(columns=['Exited'])

In [28]:
modelF.fit(xtrainF, ytrainF)
modelG.fit(xtrainG, ytrainG)
modelS.fit(xtrainS, ytrainS)
modelNK.fit(xtrainNK, ytrainNK)

model_build(country='NK')

In [29]:
import pickle
file = open('modelF.pkl','wb')
pickle.dump(modelF,file)
file.close()

In [30]:
import pickle
file = open('modelG.pkl','wb')
pickle.dump(modelG,file)
file.close()

In [31]:
import pickle
file = open('modelS.pkl','wb')
pickle.dump(modelS,file)
file.close()

In [32]:
import pickle
file = open('modelNK.pkl','wb')
pickle.dump(modelNK,file)
file.close()