# Feature engineering

In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [45]:
# Load the input CSV into a DataFrame and preview its first five rows.
data_frame = pd.read_csv('test.csv')
data_frame.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [46]:
# Split the columns into numerical and categorical features.
# A column is numerical when pandas reports a numeric dtype for it; everything
# else is categorical. Testing for "numeric" rather than comparing against the
# object dtype keeps text columns categorical even when they are stored with a
# dedicated string or category dtype instead of `object`.
categorical_features = []
numerical_features = []
for cols, column_dtype in data_frame.dtypes.items():
    if pd.api.types.is_numeric_dtype(column_dtype):
        numerical_features.append(cols)
    else:
        categorical_features.append(cols)

In [47]:
# Preview the first rows of the columns listed in `categorical_features`.
data_frame.loc[:, categorical_features].head()

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,"Kelly, Mr. James",male,330911,,Q
1,"Wilkes, Mrs. James (Ellen Needs)",female,363272,,S
2,"Myles, Mr. Thomas Francis",male,240276,,Q
3,"Wirz, Mr. Albert",male,315154,,S
4,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,3101298,,S


In [48]:
# Preview the first rows of the columns listed in `numerical_features`.
data_frame.loc[:, numerical_features].head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
0,892,3,34.5,0,0,7.8292
1,893,3,47.0,1,0,7.0
2,894,2,62.0,0,0,9.6875
3,895,3,27.0,0,0,8.6625
4,896,3,22.0,1,1,12.2875


In [49]:
# Replace null values in the categorical features with the label 'missing'.
# The filled Series is assigned back to the column on purpose: calling
# fillna(inplace=True) on `data_frame[cols]` operates on an intermediate
# object, which pandas flags as chained assignment and which leaves
# `data_frame` unchanged when Copy-on-Write is active.
for cols in categorical_features:
    data_frame[cols] = data_frame[cols].fillna('missing')

In [50]:
# Replace null values in the numerical features with the column median.
# The filled Series is assigned back to the column on purpose: calling
# fillna(inplace=True) on `data_frame[cols]` operates on an intermediate
# object, which pandas flags as chained assignment and which leaves
# `data_frame` unchanged when Copy-on-Write is active.
# NOTE(review): the median is computed from this file's own values — confirm
# whether the median used for the training data should be applied instead.
for cols in numerical_features:
    median = data_frame[cols].median()
    data_frame[cols] = data_frame[cols].fillna(median)

In [51]:
# Count the remaining null values per column.
data_frame.isna().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [52]:
# Convert each categorical feature to an integer code based on frequency:
# categories are numbered from 0 in the order returned by value_counts()
# (most frequent first).
# NOTE(review): the codes are derived from this file alone — confirm they line
# up with the encoding applied to any other split fed to the same model.
for cols in categorical_features:
    rank = data_frame[cols].value_counts().index
    mapping = dict(zip(rank, range(len(rank))))
    data_frame[cols] = data_frame[cols].map(mapping)

In [53]:
# Preview the categorical columns as they currently stand in `data_frame`.
data_frame.loc[:, categorical_features].head()

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,108,0,95,0,2
1,172,1,260,0,0
2,170,0,122,0,2
3,57,0,150,0,0
4,310,1,159,0,0


In [54]:
# Log-transform the continuous variables among the numerical features.
# A column counts as continuous when it has more than 25 distinct values.
# 'Age' is deliberately left untransformed. Columns containing a zero or a
# negative value are skipped as well: np.log yields -inf for 0 and NaN for
# negatives, so guarding only against exact zeros would let NaNs slip in.
for cols in numerical_features:
    if cols == 'Age' or data_frame[cols].nunique() <= 25:
        continue
    if (data_frame[cols] <= 0).any():
        continue
    data_frame[cols] = np.log(data_frame[cols])

In [55]:
# Preview the numerical columns as they currently stand in `data_frame`.
data_frame.loc[:, numerical_features].head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
0,6.793466,3,34.5,0,0,7.8292
1,6.794587,3,47.0,1,0,7.0
2,6.795706,2,62.0,0,0,9.6875
3,6.796824,3,27.0,0,0,8.6625
4,6.79794,3,22.0,1,1,12.2875


In [56]:
# Remove the 'PassengerId' column from the frame (modifies `data_frame` in place).
data_frame.drop(columns=['PassengerId'], inplace=True)

In [57]:
#scaling the features with MinMaxScaler.
from sklearn.preprocessing import MinMaxScaler

In [58]:
# Create the scaler; (0, 1) is MinMaxScaler's default output range, spelled out here.
scaler = MinMaxScaler(feature_range=(0, 1))

In [59]:
# Split the frame by column position: every column except the last goes into
# `x`, and the last column is set aside as `y`.
# NOTE(review): the last column of this frame is 'Embarked', so it is the one
# left out of scaling — confirm that this is intended.
x, y = data_frame.iloc[:, :-1], data_frame.iloc[:, -1]

In [60]:
# Fit the scaler on `x`; nothing is transformed at this point.
# NOTE(review): the scaler is fitted on this file's own data — confirm whether
# a scaler fitted elsewhere (e.g. on training data) should be reused instead.
scaler.fit(x)

MinMaxScaler()

In [61]:
# Apply the fitted scaler to `x` and wrap the result in a DataFrame that
# reuses the column labels of `x`.
scaled_values = scaler.transform(x)
x_data = pd.DataFrame(data=scaled_values, columns=x.columns)

In [62]:
# Put the scaled features and the set-aside column side by side.
# The index of `y` is reset first so it is concatenated on a fresh 0..n-1 index.
y_aligned = y.reset_index(drop=True)
new_data = pd.concat([x_data, y_aligned], axis='columns')

In [63]:
# Preview the first five rows of the assembled frame.
new_data.head(n=5)

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1.0,0.258993,0.0,0.452723,0.0,0.0,0.262431,0.015282,0.0,2
1,1.0,0.41247,1.0,0.617566,0.125,0.0,0.718232,0.013663,0.0,0
2,0.5,0.407674,0.0,0.815377,0.0,0.0,0.337017,0.018909,0.0,2
3,1.0,0.136691,0.0,0.353818,0.0,0.0,0.414365,0.016908,0.0,0
4,1.0,0.743405,1.0,0.287881,0.125,0.111111,0.439227,0.023984,0.0,0


In [64]:
# Write the processed frame to disk without the index column.
new_data.to_csv(path_or_buf='new_test_data.csv', index=False)