## Feature Engineering

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [3]:
# read raw dataset
df = pd.read_csv('Churn_Modelling.csv')

In [4]:
# methods selected only consider numerical features (not categorical)
# absence of male implies femal
gender_dummy = pd.get_dummies(df['Gender'],drop_first = True)
# absence of germany and spain does not imply france, so include dummy for all
geography_dummy = pd.get_dummies(df['Geography'])
df = pd.concat([df.drop('Gender',axis=1),gender_dummy],axis=1)
df = pd.concat([df.drop('Geography',axis=1),geography_dummy],axis=1)

In [5]:
# row not needed
df = df.drop('RowNumber', axis = 1)

In [6]:
# lowercase columns
df.columns = [x.lower() for x in df.columns]

In [7]:
# make columns easier to read
df = df.rename(columns={'customerid':'customer_id', 'creditscore':'credit_score', 
'numofproducts':'num_products', 'hascrcard':'has_credit_card', 
'isactivemember':'active_member', 'estimatedsalary':'estimated_salary'})

In [8]:
df.columns

Index(['customer_id', 'surname', 'credit_score', 'age', 'tenure', 'balance',
       'num_products', 'has_credit_card', 'active_member', 'estimated_salary',
       'exited', 'male', 'france', 'germany', 'spain'],
      dtype='object')

In [9]:
#Balance has a lot of zeros so lets create a column with Balance = 0
df['zero_balance'] = df['balance'].apply(lambda x:1 if x==0 else 0)

In [10]:
# Being under 25 or over 65 severely affects your Exit
df['age_bucket'] = df['age'].apply(lambda x:1 if ((x<25) or (x>65)) else 0)

In [11]:
## Derived Features

In [12]:
# Add buckets of salary
df['salary_buckets'] = pd.qcut(df['estimated_salary'], q = 4,labels={'salary_bucket_1','salary_bucket_2','salary_bucket_3','salary_bucket_4'})

# Add buckets of credit score
df['credit_score_buckets'] = pd.qcut(df['credit_score'], q = 4,labels={'cs_bucket_1','cs_bucket_2','cs_bucket_3','cs_bucket_4'})

In [13]:
salary_bucket_dummy = pd.get_dummies(df['salary_buckets'])
# absence of germany and spain does not imply france, so include dummy for all
cs_bucket_dummy = pd.get_dummies(df['credit_score_buckets'])
df = pd.concat([df.drop('salary_buckets',axis=1),salary_bucket_dummy],axis=1)
df = pd.concat([df.drop('credit_score_buckets',axis=1),cs_bucket_dummy],axis=1)

In [14]:
pd.set_option('display.max_columns', 500)
df.head()

Unnamed: 0,customer_id,surname,credit_score,age,tenure,balance,num_products,has_credit_card,active_member,estimated_salary,exited,male,france,germany,spain,zero_balance,age_bucket,salary_bucket_2,salary_bucket_1,salary_bucket_4,salary_bucket_3,cs_bucket_2,cs_bucket_4,cs_bucket_1,cs_bucket_3
0,15634602,Hargrave,619,42,2,0.0,1,1,1,101348.88,1,0,1,0,0,1,0,0,0,1,0,0,1,0,0
1,15647311,Hill,608,41,1,83807.86,1,0,1,112542.58,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0
2,15619304,Onio,502,42,8,159660.8,3,1,0,113931.57,1,0,1,0,0,0,0,0,0,1,0,1,0,0,0
3,15701354,Boni,699,39,1,0.0,2,0,0,93826.63,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0
4,15737888,Mitchell,850,43,2,125510.82,1,1,1,79084.1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1


In [15]:
df.to_csv('Churn_Modelling_Assortment.csv')