# Content
* Min Max
* Binary
* Binning (Discretization) 
* Standardize
* Square / cube root 
* Log and Exp
* One-hot encoding
* 0-1

In [1]:
import numpy as np
import pandas as pd

### Let's import a data file to work with

In [2]:
# Read the churn dataset from the working directory and preview the first rows.
csv_path = 'Churn_Modelling.csv'
data = pd.read_csv(csv_path)
data.head(5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
# Keep the per-column dtypes wrapped in a one-element list and display it.
column_dtypes = data.dtypes
data_str = [column_dtypes]
data_str

[RowNumber            int64
 CustomerId           int64
 Surname             object
 CreditScore          int64
 Geography           object
 Gender              object
 Age                  int64
 Tenure               int64
 Balance            float64
 NumOfProducts        int64
 HasCrCard            int64
 IsActiveMember       int64
 EstimatedSalary    float64
 Exited               int64
 dtype: object]

In [4]:
# Min-max scaling of Age: (value - min) / (max - min), written back into the column.
age = data['Age']
age_min, age_max = age.min(), age.max()
data['Age'] = (age - age_min) / (age_max - age_min)
data['Age'].head()

0    0.324324
1    0.310811
2    0.324324
3    0.283784
4    0.337838
Name: Age, dtype: float64

In [5]:
# Min-max scaling of Balance with the same (value - min) / (max - min) formula.
balance = data['Balance']
balance_min, balance_max = balance.min(), balance.max()
data['Balance'] = (balance - balance_min) / (balance_max - balance_min)

In [6]:
# Preview the rescaled Balance column.
data['Balance'].head(5)

0    0.000000
1    0.334031
2    0.636357
3    0.000000
4    0.500246
Name: Balance, dtype: float64

In [7]:
# Preview the frame again: Age and Balance now hold their min-max scaled values.
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,0.324324,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,0.310811,1,0.334031,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,0.324324,8,0.636357,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,0.283784,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,0.337838,2,0.500246,1,1,1,79084.1,0


In [8]:
# Binarising CreditScore: start from its summary statistics to pick a threshold.
credit_score = data.CreditScore
credit_score.describe()

count    10000.000000
mean       650.528800
std         96.653299
min        350.000000
25%        584.000000
50%        652.000000
75%        718.000000
max        850.000000
Name: CreditScore, dtype: float64

In [9]:
# Toy example of boolean-mask filtering: show the series, then only the values above 6.
y = pd.Series([5, 2, 3, 9, 8, 1])
print(y)
above_six = y > 6
print(y.loc[above_six])

0    5
1    2
2    3
3    9
4    8
5    1
dtype: int64
3    9
4    8
dtype: int64


In [10]:
# Number of rows in the dataset.
len(data)

10000

In [11]:
# Binary flag per row: 1 when the credit score is strictly above the median, else 0.
x = data['CreditScore']
median_score = x.median()
CS_cat = np.where(x > median_score, 1, 0)

In [12]:
# Spot-check the binarisation: first five scores, their 0/1 flags, and the
# median used as the threshold.
preview_rows = 5
print(x.head(preview_rows))
print(CS_cat[:preview_rows])
print(x.median())

0    619
1    608
2    502
3    699
4    850
Name: CreditScore, dtype: int64
[0 0 0 1 1]
652.0


In [13]:
# Binning Balance: look at its summary, its median, and the first ten values
# before choosing the cut points.
balance = data['Balance']
print(balance.describe())
print(np.quantile(balance, 0.5))
print(balance.head(10))

count    10000.000000
mean         0.304848
std          0.248696
min          0.000000
25%          0.000000
50%          0.387402
75%          0.508749
max          1.000000
Name: Balance, dtype: float64
0.3874024708597822
0    0.000000
1    0.334031
2    0.636357
3    0.000000
4    0.500246
5    0.453394
6    0.000000
7    0.458540
8    0.566170
9    0.536488
Name: Balance, dtype: float64


In [14]:
# Bin Balance into three labelled groups by quartile:
#   "Poor" -> at or below the 25th percentile
#   "Avg"  -> above the 25th and up to (and including) the 75th percentile
#   "Best" -> above the 75th percentile
X=data['Balance']
# Compute both cut points once instead of re-evaluating the quantile for every mask.
q25, q75 = np.quantile(X, [0.25, 0.75])
# np.select sizes the string dtype from the labels themselves, so a longer label
# can never be silently truncated (assigning into a pre-built '<U4' array would
# clip anything over four characters). The two conditions are mutually exclusive.
Bal_bins=np.select(
    [((X > q25) & (X <= q75)).to_numpy(), (X > q75).to_numpy()],
    ["Avg", "Best"],
    default="Poor",
)
# pd.Series(Bal_bins).value_counts() gives the number of rows in each bin.
Bal_bins[:10]

array(['Poor', 'Avg', 'Best', 'Poor', 'Avg', 'Avg', 'Poor', 'Avg', 'Best',
       'Best'], dtype='<U4')

In [None]:
#y[y>6]="A"
#y

In [15]:
# Standardising EstimatedSalary: its summary gives the mean and std used below.
data['EstimatedSalary'].describe()

count     10000.000000
mean     100090.239881
std       57510.492818
min          11.580000
25%       51002.110000
50%      100193.915000
75%      149388.247500
max      199992.480000
Name: EstimatedSalary, dtype: float64

In [16]:
# Manual z-score for the first row's EstimatedSalary, using the mean and standard
# deviation reported by describe().
first_salary = 101348.88
salary_mean = 100090.239881
salary_std = 57510.492818
(first_salary - salary_mean) / salary_std

0.021885399643211997

In [17]:
# Z-score standardisation of EstimatedSalary: subtract the mean, divide by the
# sample standard deviation. The result is kept separately; `data` is unchanged.
salary = data['EstimatedSalary']
salary_mean, salary_std = salary.mean(), salary.std()
std_est_sal = (salary - salary_mean) / salary_std
std_est_sal.head(10)

0    0.021885
1    0.216523
2    0.240675
3   -0.108912
4   -0.365258
5    0.863607
6   -1.565409
7    0.334837
8   -0.437307
9   -0.493206
Name: EstimatedSalary, dtype: float64

In [18]:
# Cube-root transform of CreditScore, written back into the same column.
# NOTE: this overwrites the column, so running the cell again takes the cube
# root of the already-transformed values.
cube_root = data['CreditScore'].pow(1 / 3)
data['CreditScore'] = cube_root
data['CreditScore'].head(6)

0    8.522432
1    8.471647
2    7.947574
3    8.874810
4    9.472682
5    8.640123
Name: CreditScore, dtype: float64

In [19]:
# Natural-log transform of the current CreditScore values, written back in place.
log_credit = np.log(data['CreditScore'])
data['CreditScore'] = log_credit
data['CreditScore'].head(10)

0    2.142702
1    2.136725
2    2.072867
3    2.183217
4    2.248412
5    2.156417
6    2.237247
7    1.976530
8    2.072202
9    2.175986
Name: CreditScore, dtype: float64

In [20]:
# Reference value: np.log is the natural logarithm (base e), shown here for 1000.
np.log(1000)

6.907755278982137

In [21]:
# Exponentiate CreditScore in place; applied after the log transform this
# restores the values the column held before the log was taken.
exp_credit = np.exp(data['CreditScore'])
data['CreditScore'] = exp_credit
data['CreditScore'].head(10)

0    8.522432
1    8.471647
2    7.947574
3    8.874810
4    9.472682
5    8.640123
6    9.367505
7    7.217652
8    7.942293
9    8.810868
Name: CreditScore, dtype: float64

In [22]:
# Show every column's dtype, then collect the names of the numeric columns.
print(data.dtypes)
# select_dtypes(include='number') picks integer and float columns by kind.
# Comparing dtypes against 'object' would treat text columns as numeric whenever
# pandas stores them with its dedicated string dtype instead of object.
num=data.select_dtypes(include='number').columns
print(num)

RowNumber            int64
CustomerId           int64
Surname             object
CreditScore        float64
Geography           object
Gender              object
Age                float64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object
Index(['RowNumber', 'CustomerId', 'CreditScore', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
       'Exited'],
      dtype='object')


In [23]:
# Numeric part of the dataset: every integer/float column of `data`.
# select_dtypes(include='number') selects by numeric kind, so text columns are
# excluded even when pandas stores them with a string dtype rather than object.
d_num=data.select_dtypes(include='number')
num=d_num.columns
print(num)
d_num.head()

Index(['RowNumber', 'CustomerId', 'CreditScore', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
       'Exited'],
      dtype='object')


Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,8.522432,0.324324,2,0.0,1,1,1,101348.88,1
1,2,15647311,8.471647,0.310811,1,0.334031,1,0,1,112542.58,0
2,3,15619304,7.947574,0.324324,8,0.636357,3,1,0,113931.57,1
3,4,15701354,8.87481,0.283784,1,0.0,2,0,0,93826.63,0
4,5,15737888,9.472682,0.337838,2,0.500246,1,1,1,79084.1,0


In [24]:
# Non-numeric part of the dataset: every column that is not integer/float.
# Selecting by "not numeric" rather than by `dtypes == 'object'` keeps the text
# columns even when pandas stores them with a string dtype instead of object.
cat_data=data.select_dtypes(exclude='number')
cat=cat_data.columns
print(cat)
cat_data.head(5)

Index(['Surname', 'Geography', 'Gender'], dtype='object')


Unnamed: 0,Surname,Geography,Gender
0,Hargrave,France,Female
1,Hill,Spain,Female
2,Onio,France,Female
3,Boni,France,Female
4,Mitchell,Spain,Female


In [25]:
# Surname is not required for the categorical encoding below.
# drop(..., errors='ignore') returns a new frame and does nothing when the column
# is already gone, so re-running this cell no longer raises KeyError.
cat_data=cat_data.drop(columns=['Surname'],errors='ignore')

In [26]:
# Preview the categorical columns that remain after removing Surname.
cat_data.head(10)

Unnamed: 0,Geography,Gender
0,France,Female
1,Spain,Female
2,France,Female
3,France,Female
4,Spain,Female
5,Spain,Male
6,France,Male
7,Germany,Female
8,France,Male
9,France,Male


In [28]:
# One-hot encoding of Geography and Gender.
# drop_first=True omits the first level of each column, leaving k-1 indicator
# columns per feature (the omitted level is implied when all of them are 0).
# dtype=int pins the indicators to 0/1 integers: pandas >= 2.0 otherwise returns
# boolean columns, which would show up as True/False here and in the exported CSV.
cat_data_ohe=pd.get_dummies(cat_data,columns=['Geography','Gender'],drop_first=True,dtype=int)
cat_data_ohe.head(5)

Unnamed: 0,Geography_Germany,Geography_Spain,Gender_Male
0,0,0,0
1,0,1,0
2,0,0,0
3,0,0,0
4,0,1,0


In [29]:
# Categorical to numeric coding: first check the current dtypes of the
# categorical columns before converting them.
cat_data.dtypes

Geography    object
Gender       object
dtype: object

In [30]:
# Convert every column of cat_data to the pandas 'category' dtype, then confirm.
category_spec = dict.fromkeys(cat_data.columns, 'category')
cat_data = cat_data.astype(category_spec)
cat_data.dtypes

Geography    category
Gender       category
dtype: object

In [31]:
# Integer category codes for the first ten Gender values.
first_ten_gender = cat_data['Gender'].iloc[:10]
first_ten_gender.cat.codes

0    0
1    0
2    0
3    0
4    0
5    1
6    1
7    0
8    1
9    1
dtype: int8

In [32]:
# Replace each categorical column with its integer category codes.
for i in cat_data.columns:
    print(i)
    # Only columns that still have a category dtype expose the `.cat` accessor;
    # skipping the others lets this cell be re-run after the conversion without
    # raising AttributeError.
    if isinstance(cat_data[i].dtype, pd.CategoricalDtype):
        cat_data[i] = cat_data[i].cat.codes
cat_data.head(10)

Geography
Gender


Unnamed: 0,Geography,Gender
0,0,0
1,2,0
2,0,0
3,0,0
4,2,0
5,2,1
6,0,1
7,1,0
8,0,1
9,0,1


In [34]:
# Combine the numeric columns with the one-hot encoded columns, aligning on the
# row index. Both frames are derived from `data`, so their indexes should match.
# Check that explicitly: with how='outer', a mismatch would not fail but would
# silently add rows padded with NaN.
assert d_num.index.equals(cat_data_ohe.index), "d_num and cat_data_ohe must share the same index"
df1=d_num.join(cat_data_ohe,how='outer')
df1.head()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,1,15634602,8.522432,0.324324,2,0.0,1,1,1,101348.88,1,0,0,0
1,2,15647311,8.471647,0.310811,1,0.334031,1,0,1,112542.58,0,0,1,0
2,3,15619304,7.947574,0.324324,8,0.636357,3,1,0,113931.57,1,0,0,0
3,4,15701354,8.87481,0.283784,1,0.0,2,0,0,93826.63,0,0,0,0
4,5,15737888,9.472682,0.337838,2,0.500246,1,1,1,79084.1,0,0,1,0


In [None]:
# Write the combined table to disk with a header row and without the row index.
output_path = "Cleaned_data_churn.csv"
df1.to_csv(output_path, header=True, index=False)