# Scikit-learn - Feature Engineering

In this tutorial, we try to do extra feature engineering and figure out which features are important.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [None]:
# load data
# Read both Titanic splits and stack them (train rows first) so every
# feature-engineering step below is applied to the two splits together.
train_dat = pd.read_csv('titanic/train.csv')
test_dat = pd.read_csv('titanic/test.csv')

# ignore_index renumbers the stacked rows 0..n-1 in the same call
full_dat = pd.concat([train_dat, test_dat], sort = False, ignore_index = True)


## Feature engineering

In [None]:
# peek at the first five rows of the combined frame
full_dat.head(n = 5)

In [None]:
# column dtypes and non-null counts -- shows which columns have missing values
full_dat.info()

In [None]:
# Distribution of each numeric column, split by the Survived label.
for col in ['PassengerId', 'Age', 'Fare']:
    # x / y are passed by keyword: seaborn >= 0.12 accepts only `data`
    # positionally, so the old positional call raises a TypeError there.
    sns.violinplot(x = full_dat['Survived'], y = full_dat[col])
    plt.show()

In [None]:
# Mean of Survived for each level of the categorical / count columns.
for col in ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']:
    # x / y are passed by keyword: seaborn >= 0.12 accepts only `data`
    # positionally, so the old positional call raises a TypeError there.
    sns.barplot(x = full_dat[col], y = full_dat['Survived'])
    plt.show()

In [None]:
# most frequent Cabin values
full_dat['Cabin'].value_counts().head()

In [None]:
# most frequent Ticket values
full_dat['Ticket'].value_counts().head()

- categorical variable
    - **PassengerId** : delete
    - **Name** : delete
    - **Ticket** : delete

    - **Cabin** : use the first letter of the cabin code as a categorical feature (missing cabins get their own category)
    - **Pclass** : one-hot encoding
    - **Sex** : one-hot encoding
    - **Embarked** : one-hot encoding


- continuous variable

    - **Survived** : target variable (the label we want to predict)
    - **Age** : impute missing values with the mean age of each (Pclass, Sex, Embarked) group
    - **Fare** : impute missing values with the overall median Fare, then cut into 5 quantile bins as a new feature called Fare_bin
    - **SibSp** : combined with Parch into a new feature, Family_size = SibSp + Parch + 1
    - **Parch** : combined with SibSp into a new feature, Family_size = SibSp + Parch + 1


In [None]:
# missing imputation---#
# Results are assigned back to the column instead of calling
# `full_dat[col].fillna(..., inplace = True)`: that chained in-place form is
# deprecated and leaves full_dat unchanged under pandas copy-on-write.
full_dat['Embarked'] = full_dat['Embarked'].fillna(full_dat['Embarked'].mode()[0])
full_dat['Fare'] = full_dat['Fare'].fillna(full_dat['Fare'].median())

# Age: fill with the mean age of the passenger's (Pclass, Sex, Embarked) group.
# transform('mean') returns one value per row on full_dat's own index, whereas
# groupby(...).apply(...) prepends the group keys to the index on pandas >= 2.0
# and can then no longer be assigned back as a column.
group_mean_age = full_dat.groupby(['Pclass', 'Sex', 'Embarked'])['Age'].transform('mean')
full_dat['Age'] = full_dat['Age'].fillna(group_mean_age)

In [None]:
# new feature : Family size
# SibSp + Parch + 1 (the +1 presumably counts the passenger themself)
full_dat['Family_size'] = full_dat['SibSp'] + full_dat['Parch'] + 1


# new feature : Fare_bin
# Fare cut into 5 quantile-based bins
full_dat['Fare_bin'] = pd.qcut(full_dat['Fare'], q = 5)


# new feature : Cabin group
# first character of the Cabin string; 'Z' stands in for a missing Cabin
cabin_filled = full_dat['Cabin'].fillna('Z')
full_dat['Cabin_group'] = cabin_filled.map(lambda cabin: cabin[0])


In [None]:
# drop columns---#
# Name / Ticket / PassengerId are not used as features; Cabin and Fare have
# been replaced by the derived Cabin_group and Fare_bin columns.
full_dat = full_dat.drop(columns = ['Name', 'Ticket', 'Cabin', 'PassengerId', 'Fare'])

In [None]:
#one-hot encoding---#
categorical_cols = ['Pclass', 'Sex', 'Embarked', 'Fare_bin', 'Cabin_group']
one_hot_dat = pd.get_dummies(full_dat, columns = categorical_cols)
one_hot_dat.head()


#normalization---#
# pop() removes the label column from one_hot_dat in place and returns it,
# so one_hot_dat holds only feature columns from here on
survived_ = one_hot_dat.pop('Survived')

std_s = StandardScaler()
normalize_dat = std_s.fit_transform(one_hot_dat)

In [None]:
#train test split---#
# rows without a Survived label are the ones that came from test.csv
test_index = survived_.isna()
labelled = ~test_index

train_x, test_x = normalize_dat[labelled], normalize_dat[test_index]
train_y = survived_[labelled]

# hold out 20% of the labelled rows for validation
t_x, v_x, t_y, v_y = train_test_split(
    train_x,
    train_y,
    test_size = 0.2,
    shuffle = True,
    random_state = 412,
)

## Build Model

In [None]:
# Fit a decision tree on the training split and report accuracy on both splits.
# random_state is fixed because the tree permutes candidate features at every
# split; without it the scores and feature importances change on each run.
dt_model = DecisionTreeClassifier(random_state = 412)
dt_model.fit(t_x, t_y)

# score() returns mean accuracy for a classifier
print('training score (decision tree) : {:.3f}'.format(dt_model.score(t_x, t_y)))
print('validation score (decision tree) : {:.3f}'.format(dt_model.score(v_x, v_y)))

In [None]:
# confusion matrix on the validation split (rows: true class, columns: predicted class)
confusion_matrix(y_true = v_y, y_pred = dt_model.predict(v_x))

## Feature importance

In [None]:
# one_hot_dat no longer contains Survived, so its columns line up one-to-one
# with the features the tree was trained on
importance_by_feature = zip(one_hot_dat.columns, dt_model.feature_importances_)
for feature_name, importance in importance_by_feature:
    print('{}:{:.3f}'.format(feature_name, importance))

---

## Supervised learning 4.0

After the example and practice, you should be able to:

- know how to create features / do feature engineering
- use the feature importances of a tree-based model to investigate which features are useful
