In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


In [2]:
# Root folder of the project. Defaults to the original author's local path;
# set the CHURN_PROJECT_DIR environment variable to run the notebook on
# another machine without editing this cell.
PROJECT_DIR = os.environ.get(
    "CHURN_PROJECT_DIR",
    r"C:\Users\win10\OneDrive\Desktop\CHURN_PREDICTIONS",
)
# Sub-folder of PROJECT_DIR that get_data / export_data read from and write to.
DATA_DIR = "Data"

In [3]:
def get_data(name):
    """Load ``<PROJECT_DIR>/<DATA_DIR>/<name>.csv`` into a DataFrame.

    Parameters
    ----------
    name : str
        File name without the ``.csv`` extension.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV, read with pandas' default options.
    """
    csv_path = os.path.join(PROJECT_DIR, DATA_DIR, f"{name}.csv")
    frame = pd.read_csv(csv_path)
    return frame

In [4]:
# Load the raw dataset from <PROJECT_DIR>/<DATA_DIR>/Churn_Modelling.csv.
Data = get_data("Churn_Modelling")

In [5]:
# Display the raw frame as loaded.
Data

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [6]:
# Column dtypes and non-null counts of the raw frame.
Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [7]:
# Number of rows that are exact duplicates of an earlier row.
Data.duplicated().sum()

0

# Cleaning data

In [8]:
def cleaning_data(df):
    """Return a cleaned copy of the raw frame; ``df`` itself is not modified.

    Steps, in order:
      1. Drop the row labelled ``1`` when it exists.
      2. Strip surrounding whitespace from every object-dtype column.
      3. Upper-case all column names.
      4. Add a ``Data`` column holding ``GEOGRAPHY`` with ``'France'``
         replaced by ``'india'``.
      5. Drop the ``ROWNUMBER`` and ``CUSTOMERID`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have columns whose upper-cased names are ``GEOGRAPHY``,
        ``ROWNUMBER`` and ``CUSTOMERID``.

    Returns
    -------
    pandas.DataFrame
        New frame with upper-cased column names plus the extra ``Data`` column.

    Raises
    ------
    KeyError
        If ``ROWNUMBER`` or ``CUSTOMERID`` is missing after renaming.
    """
    text_columns = df.select_dtypes(include="object").columns
    return (
        df
        # NOTE(review): why label 1 is removed is not evident from the
        # notebook -- confirm it is intended. errors="ignore" keeps frames
        # that lack that label (e.g. a single-row frame) from raising KeyError.
        .drop(index=1, errors="ignore")
        # Strip on the frame flowing through the chain rather than on the
        # outer ``df``, so the step does not depend on index alignment.
        .assign(**{
            col: (lambda df_, col=col: df_[col].str.strip())
            for col in text_columns
        })
        .rename(columns=str.upper)
        .assign(
            Data=lambda df_: df_.GEOGRAPHY.str.replace('France', 'india')
        )
        .drop(columns=["ROWNUMBER", "CUSTOMERID"])
    )

In [9]:
# Preview the cleaned frame only: the result is not assigned, so `Data` stays
# raw and the train/val/test split further down is built from the raw frame.
# NOTE(review): confirm whether the cleaned frame was meant to be used instead.
cleaning_data(Data)

Unnamed: 0,SURNAME,CREDITSCORE,GEOGRAPHY,GENDER,AGE,TENURE,BALANCE,NUMOFPRODUCTS,HASCRCARD,ISACTIVEMEMBER,ESTIMATEDSALARY,EXITED,Data
0,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1,india
2,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1,india
3,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0,india
4,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0,Spain
5,Chu,645,Spain,Male,44,8,113755.78,2,1,0,149756.71,1,Spain
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0,india
9996,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0,india
9997,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1,india
9998,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1,Germany


In [10]:
# Distinct values of the Geography column in the raw frame.
Data.Geography.unique()

array(['France', 'Spain', 'Germany'], dtype=object)

In [11]:
# Same schema check as earlier; `Data` is unchanged by the cleaning_data call.
Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [12]:
# Preview of the France -> india substitution; nothing is assigned, so
# `Data` is not modified.
Data["Geography"].str.replace("France", "india")

0         india
1         Spain
2         india
3         india
4         Spain
         ...   
9995      india
9996      india
9997      india
9998    Germany
9999      india
Name: Geography, Length: 10000, dtype: object

In [13]:
# Display the frame again: still the raw, unmodified data.
Data

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [14]:
# NOTE(review): repeats the previous display cell; candidate for removal.
Data

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [15]:
# Boolean mask marking rows whose Surname is exactly 'Obijiaku'.
Data["Surname"].eq('Obijiaku')

0       False
1       False
2       False
3       False
4       False
        ...  
9995     True
9996    False
9997    False
9998    False
9999    False
Name: Surname, Length: 10000, dtype: bool

In [16]:
# Repeats the earlier France -> india preview; the result is not stored.
Data["Geography"].str.replace("France", "india")

0         india
1         Spain
2         india
3         india
4         Spain
         ...   
9995      india
9996      india
9997      india
9998    Germany
9999      india
Name: Geography, Length: 10000, dtype: object

In [17]:
# Feature frame: every column of the raw data except the `Exited` target.
X = Data.drop(columns=["Exited"])
X.head(1)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88


In [18]:
# Target series: the `Exited` column.
Y = Data.loc[:, "Exited"]
Y.head(1)

0    1
Name: Exited, dtype: int64

In [19]:
# Two-stage split with a fixed seed: first hold out 20% of all rows as the
# test set, then take 20% of the remainder as the validation set.
X_, X_test, y_, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_, y_, test_size=0.2, random_state=42
)

In [20]:
# Print the (features, target) shapes of the train, test and validation splits.
for features, target in ((X_train, y_train), (X_test, y_test), (X_val, y_val)):
    print(features.shape, target.shape)

(6400, 13) (6400,)
(2000, 13) (2000,)
(1600, 13) (1600,)


In [27]:
def export_data(X, y, name):
    """Save features and target together as one CSV and return the re-read file.

    ``y`` is joined onto ``X`` by index and the combined frame is written to
    ``<PROJECT_DIR>/<DATA_DIR>/<name>.csv`` without the index column.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns.
    y : pandas.Series
        Target column, joined onto ``X`` by index.
    name : str
        Output file name without the ``.csv`` extension.

    Returns
    -------
    pandas.DataFrame
        The file just written, loaded back with ``pd.read_csv``.
    """
    destination = os.path.join(PROJECT_DIR, DATA_DIR, f"{name}.csv")
    combined = X.join(y)
    combined.to_csv(destination, index=False)
    return pd.read_csv(destination)

In [28]:
# Write the training split to <DATA_DIR>/train.csv and display the re-read file.
export_data(X_train, y_train , "train")

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,2580,15597896,Ozoemena,365,Germany,Male,30,0,127760.07,1,1,0,81537.85,1
1,7129,15690372,Henry,553,Spain,Male,38,1,181110.13,2,1,0,184544.59,0
2,4083,15669301,Romani,778,Germany,Female,29,6,150358.97,1,1,0,62454.01,1
3,2551,15784301,Wang,850,France,Male,42,0,0.00,2,1,0,44165.84,0
4,4927,15762927,Sung,674,Germany,Female,36,6,100762.64,1,1,0,182156.86,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6395,8872,15699524,Howells,466,France,Female,30,3,0.00,1,1,0,193984.60,0
6396,9827,15686917,Tu,789,Spain,Female,40,4,0.00,2,1,0,137402.27,0
6397,5269,15687648,Nicholson,691,France,Male,28,1,0.00,2,0,0,92865.41,0
6398,9667,15781689,Macadam,758,Spain,Male,35,5,0.00,2,1,0,95009.60,0


In [29]:
# Write the test split to <DATA_DIR>/test.csv and display the re-read file.
export_data(X_test, y_test , "test")

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,6253,15687492,Anderson,596,Germany,Male,32,3,96709.07,2,0,0,41788.37,0
1,4685,15736963,Herring,623,France,Male,43,1,0.00,2,1,1,146379.30,0
2,1732,15721730,Amechi,601,Spain,Female,44,4,0.00,2,1,0,58561.31,0
3,4743,15762134,Liang,506,Germany,Male,59,8,119152.10,2,1,1,170679.74,0
4,4522,15648898,Chuang,560,Spain,Female,27,7,124995.98,1,1,1,114669.79,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,6413,15598982,Klein,602,Germany,Female,53,5,98268.84,1,0,1,45038.29,1
1996,8286,15572631,Ndubuisi,609,France,Male,25,10,0.00,1,0,1,109895.16,0
1997,7854,15746674,Miller,730,France,Female,47,7,0.00,1,1,0,33373.26,1
1998,1096,15689081,Wu,692,France,Male,29,4,0.00,1,1,0,76755.99,1


In [30]:
# Write the validation split to <DATA_DIR>/val.csv and display the re-read file.
export_data(X_val, y_val , "val")

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,3189,15586757,Anenechukwu,801,France,Female,32,4,75170.54,1,1,1,37898.50,0
1,8294,15770711,Lu,766,Germany,Female,28,4,90696.78,1,0,1,21597.20,0
2,1711,15567993,Palmer,828,Spain,Male,28,8,134766.85,1,1,0,79355.87,0
3,7511,15729377,Ku,798,France,Male,36,1,0.00,2,1,1,159044.10,0
4,1462,15640627,Wan,611,Spain,Male,34,4,0.00,2,1,0,170950.58,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1595,4129,15772148,Ferrari,639,Germany,Female,37,5,151242.48,1,0,1,49637.65,0
1596,6397,15602812,Holmes,684,Germany,Female,44,2,133776.86,2,0,1,49865.04,0
1597,9738,15741197,Calzada,710,Spain,Male,22,8,0.00,3,1,0,107292.91,0
1598,6722,15715638,Ch'ang,824,Germany,Male,77,3,27517.15,2,0,1,2746.41,0
