# Automate feature engineering process

# Define necessary functions and classes

In [1]:
# class for imputing missing values on each group
# group by key: groupby_columns
# column to be imputed: impute_column

# group_by_imputer = GroupByImputer(strategy="median")
# group_by_imputer.fit(data_train, groupby_columns=['Pclass', 'Sex'], impute_column='Age')
# data_train = group_by_imputer.transform(data_train)
# data_test = group_by_imputer.transform(data_test)

class GroupByImputer():
    """Impute missing values of one column separately inside each group of rows.

    ``fit`` learns one imputer per group (a fitted scikit-learn imputer for
    numeric columns, the most frequent value for every other dtype) and
    ``transform`` applies the imputer of each row's group to that row.

    Fitted attributes (set by ``fit``):
        group_by_imputers_: pandas Series indexed by group key, holding the
            per-group imputer (estimator, most frequent value, or None when a
            non-numeric group had no observed value).
        groupby_columns_: the grouping column name(s) passed to ``fit``.
        impute_column_: the name of the imputed column.
    """

    def __init__(self, missing_values="NaN", strategy="mean", 
                 axis=0, verbose=0, copy=True):
        # Only `strategy` is read by the methods below; the other arguments
        # are stored unchanged so the constructor signature stays compatible.
        self.missing_values = missing_values
        self.strategy = strategy
        self.axis = axis
        self.verbose = verbose
        self.copy = copy
    
    def fit(self, X, groupby_columns, impute_column, y=None):
        """Learn one imputer per group of ``X``.

        Args:
            X: DataFrame containing ``groupby_columns`` and ``impute_column``.
            groupby_columns: column name or list of column names to group by.
            impute_column: name of the column whose missing values are imputed.
            y: ignored.

        Returns:
            self
        """
        # Select only the imputed column before apply(): the group frames then
        # never contain the grouping columns, which is all _imputer_fit needs.
        self.group_by_imputers_ = (
            X.groupby(groupby_columns)[[impute_column]]
            .apply(lambda x: self._imputer_fit(x, impute_column, self.strategy))
        )
        
        self.groupby_columns_ = groupby_columns
        self.impute_column_ = impute_column
        
        return self
    
    @staticmethod
    def _make_numeric_imputer(strategy):
        """Build an unfitted scikit-learn imputer for numeric data.

        ``sklearn.preprocessing.Imputer`` was removed in scikit-learn 0.22, so
        ``sklearn.impute.SimpleImputer`` is preferred and the legacy class is
        only used when the newer module is not available.
        """
        try:
            from sklearn.impute import SimpleImputer
        except ImportError:
            from sklearn.preprocessing import Imputer
            return Imputer(strategy=strategy)
        return SimpleImputer(strategy=strategy)
    
    def _imputer_fit(self, x, impute_column, strategy):
        """Fit the imputer for a single group frame ``x``.

        The dtype is taken from the data being fitted (not from a global
        frame): numeric/boolean columns get a scikit-learn imputer, everything
        else gets the most frequent observed value (None if nothing observed).
        """
        values = x[impute_column]
        
        if getattr(values.dtype, 'kind', 'O') in 'biufc':
            # int or float
            imputer = self._make_numeric_imputer(strategy)
            imputer.fit(x[[impute_column]])
            return imputer
        
        # object string (or any other non-numeric dtype): use the mode
        counts = values.value_counts()
        return counts.index[0] if len(counts) else None
    
    
    def transform(self, X):
        """Return a copy of ``X`` with the fitted column imputed group by group.

        Row order, index and columns of ``X`` are preserved and ``X`` itself is
        not modified. Rows whose group was not seen during ``fit`` are left
        unchanged.
        """
        group_columns = self.groupby_columns_
        if isinstance(group_columns, str):
            group_columns = [group_columns]
        
        result = X.copy()
        for key, imputer in self.group_by_imputers_.items():
            if imputer is None:
                # nothing was learned for this group
                continue
            key_values = key if isinstance(key, tuple) else (key,)
            
            # boolean mask of the rows that belong to this group
            in_group = None
            for group_column, key_value in zip(group_columns, key_values):
                matches = X[group_column] == key_value
                in_group = matches if in_group is None else (in_group & matches)
            if in_group is None or not in_group.any():
                continue
            
            imputed = self._imputer_transform(X.loc[in_group], imputer)
            result.loc[in_group, self.impute_column_] = imputed[self.impute_column_].values
        return result
    
    def _imputer_transform(self, x, imputer=None):
        """Impute one group frame ``x`` and return the imputed copy.

        When ``imputer`` is omitted it is looked up through ``x.name`` (the
        group key that ``groupby.apply`` attaches to each group frame).
        """
        if imputer is None:
            imputer = self.group_by_imputers_[x.name]
        
        column = self.impute_column_
        x = x.copy()
        if imputer is None or not x[column].isnull().any():
            return x
        
        if hasattr(imputer, 'transform'):
            # int or float: fitted scikit-learn imputer
            filled = imputer.transform(x[[column]])
            # scikit-learn drops a column that was entirely missing at fit
            # time; in that case there is no statistic to fill with.
            if filled.shape[1] == 1:
                x[column] = filled.ravel()
        else:
            # object string: fill with the most frequent value of the group
            x[column] = x[column].fillna(imputer)
        
        return x

## 1. Start

### Import Modules

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from collections import OrderedDict

from sklearn import preprocessing

from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.svm import SVC, LinearSVC

import category_encoders as ce

from xgboost import XGBClassifier



### Read Data

In [3]:
data_train = pd.read_csv('./data/train.csv')
data_test = pd.read_csv('./data/test.csv')

In [4]:
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [5]:
data_train.describe(percentiles=[0.2,0.8], include='all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Porter, Mr. Walter Chamberlain",male,,,,347082.0,,C23 C25 C27,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
20%,179.0,0.0,1.0,,,19.0,0.0,0.0,,7.8542,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
80%,713.0,1.0,3.0,,,41.0,1.0,1.0,,39.6875,,


## 2. Preprocessing

### Missing Values

In [6]:
## TODO: MICE implementation

In [7]:
# columns with missing values
columns_with_missing_values = list(data_train.columns[data_train.isnull().any()])
# number of missing values on each column
print(data_train[columns_with_missing_values].isnull().sum())

Age         177
Cabin       687
Embarked      2
dtype: int64


In [8]:
# Impute missing values with median value on each (Pclass, Sex) group for Age column
age_group_by_imputer = GroupByImputer(strategy="median")
age_group_by_imputer.fit(data_train, groupby_columns=['Pclass', 'Sex'], impute_column='Age')
data_train = age_group_by_imputer.transform(data_train)

In [9]:
# Impute missing values with median value on each (Pclass, Embarked) group for Fare column
fare_group_by_imputer = GroupByImputer(strategy="median")
fare_group_by_imputer.fit(data_train, groupby_columns=['Pclass', 'Embarked'], impute_column='Fare')

<__main__.GroupByImputer at 0x7f9af3f37550>

In [10]:
# Impute missing values for Embarked column
# Hypothesis: Fare is correlated with Pclass and Embarked

In [11]:
data_train[['Pclass', 'Embarked', 'Fare']].groupby(['Pclass', 'Embarked']).median()

Unnamed: 0_level_0,Unnamed: 1_level_0,Fare
Pclass,Embarked,Unnamed: 2_level_1
1,C,78.2667
1,Q,90.0
1,S,52.0
2,C,24.0
2,Q,12.35
2,S,13.5
3,C,7.8958
3,Q,7.75
3,S,8.05


In [12]:
data_train.loc[data_train['Embarked'].isnull(), :]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


In [13]:
# For Pclass 1, the group median Fare closest to 80 is 78.2667 (Embarked = C), so impute 'C'
data_train.loc[data_train['Embarked'].isnull(), 'Embarked'] = 'C'

In [14]:
# # Handling on Cabin: 0 for null and 1 for not null
# data_train['Cabin_flg'] = 0
# data_train.loc[data_train.Cabin.notnull(), 'Cabin_flg'] = 1

### Feature Transformation

In [15]:
# # Convert SibSp with more than 2 into 'more_than_2'
# data_train['SibSp_flg'] = data_train['SibSp']
# data_train.loc[data_train['SibSp'] >= 2, 'SibSp_flg'] = 'more_than_2'
# data_train['SibSp_flg'] = data_train['SibSp_flg'].astype(str)

In [16]:
# # Convert Parch with more than 2 into 'more_than_2'
# data_train['Parch_flg'] = data_train['Parch']
# data_train.loc[data_train['Parch'] >= 2, 'Parch_flg'] = 'more_than_2'
# data_train['Parch_flg'] = data_train['Parch_flg'].astype(str)

### Feature Generation

In [17]:
# Extract Title from Name
def generate_title(data, column='Name', new_column='Title', main_levels=['Mr', 'Master', 'Miss', 'Mrs']):
    """Add a title column extracted from a "Surname, Title. Given names" column.

    Args:
        data: DataFrame holding ``column``; it is modified in place.
        column: name of the column containing the full names.
        new_column: name of the column that receives the title.
        main_levels: titles kept as they are; every other extracted title is
            replaced by 'Rare'. The list is only read, never modified.

    Returns:
        The same ``data`` object, with ``new_column`` added or overwritten.
        Names from which no title can be extracted keep a missing value.
    """
    # extract title from name: the text between the first comma and the next period
    titles = data[column].str.extract(r',\s*([^\.]*)\s*\.', expand=False)
    # Mr: Mr; Master: Master; Miss: Miss; Mrs: Mrs; Others: Rare
    # Compare whole values instead of building a regex from the rare titles:
    # that is independent of the pandas `regex` default, needs no escaping and
    # is also correct when there are no rare titles at all.
    is_rare = titles.notnull() & ~titles.isin(main_levels)
    data[new_column] = titles.mask(is_rare, 'Rare')
    return data

data_train = generate_title(data_train)

In [18]:
# Combine SibSp and Parch and passenger as Fsize(Family Size)
data_train['Fsize'] = data_train['SibSp'] + data_train['Parch'] + 1

In [19]:
# Create new feature to divide Age into 'child' and 'adult'
data_train['Age_New'] = 'adult'
data_train.loc[data_train['Age'] < 18, 'Age_New'] = 'child'

### Preparation for Modeling

In [20]:
# Feature list
numerical_variables = ['Fsize', 'Fare']
categorical_variables = ['Pclass', 'Sex', 'Embarked', 'Title', 'Age_New']
# features = numerical_variables + categorical_variables

In [21]:
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       891 non-null object
Title          891 non-null object
Fsize          891 non-null int64
Age_New        891 non-null object
dtypes: float64(2), int64(6), object(7)
memory usage: 104.5+ KB


In [22]:
# OneHotEncoder
ohe = ce.one_hot.OneHotEncoder(cols=categorical_variables)
ohe.fit(data_train)
# Transform training data
data_train = ohe.transform(data_train)
# Remove columns with name containing '-1'(all 0)
data_train = data_train[[c for c in data_train.columns if '-1' not in c]]

### Train Models

In [23]:
# Dummy categorical variables created by OneHotEncoding
# Match on the "<variable>_" prefix rather than on any substring, so that a
# column is only collected for the variable it was actually generated from.
# NOTE(review): assumes the encoder names its outputs "<variable>_<suffix>" - confirm
dummy_categorical_variables = [
    c
    for categorical_variable in categorical_variables
    for c in data_train.columns
    if c.startswith(categorical_variable + '_')
]

In [24]:
X_train = data_train[dummy_categorical_variables + numerical_variables]
y_train = data_train['Survived']

In [25]:
classifiers = OrderedDict()
classifiers['Logistic Regression'] = LogisticRegression()
classifiers['Decision Tree'] = DecisionTreeClassifier()
classifiers['Random Forest'] = RandomForestClassifier()
classifiers['AdaBoost'] = AdaBoostClassifier()
classifiers['Gradient Boosting'] = GradientBoostingClassifier()
classifiers['Naive Bayes'] = GaussianNB()
classifiers['XGBoost'] = XGBClassifier()
# Report the cross-validated performance of each classifier.
# The rows are collected in a list and the frame is built once:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
cv_records = []
for clf_name, classifier in classifiers.items():
    cv_scores = cross_val_score(classifier, X_train, y_train, cv=5)
    cv_records.append({'classifier': clf_name, 'cv_scores_mean': cv_scores.mean()})
cv_result_df = pd.DataFrame(cv_records, columns=['classifier', 'cv_scores_mean'])

In [26]:
cv_result_df

Unnamed: 0,classifier,cv_scores_mean
0,Logistic Regression,0.824943
1,Decision Tree,0.802521
2,Random Forest,0.804812
3,AdaBoost,0.818233
4,Gradient Boosting,0.824981
5,Naive Bayes,0.78229
6,XGBoost,0.807016


In [27]:
# RandomForest
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

### Parameter Tuning

### Transformation on Test Data

In [28]:
# Append a placeholder target column so the test data has the same columns as the training data
data_test['Survived'] = 0

In [29]:
# Impute missing values with median value on each (Pclass, Sex) group for Age column
data_test = age_group_by_imputer.transform(data_test)

In [30]:
# Impute missing values with median value on each (Pclass, Embarked) group for Fare column
data_test = fare_group_by_imputer.transform(data_test)

In [31]:
# Combine SibSp and Parch and passenger as Fsize(Family Size)
data_test['Fsize'] = data_test['SibSp'] + data_test['Parch'] + 1

In [32]:
# Extract Title from Name
data_test = generate_title(data_test)

In [33]:
# Create new feature to divide Age into 'child' and 'adult'
data_test['Age_New'] = 'adult'
data_test.loc[data_test['Age'] < 18, 'Age_New'] = 'child'

In [34]:
# Transform test data
data_test = ohe.transform(data_test)
# Remove columns with name containing '-1'(all 0)
data_test = data_test[[c for c in data_test.columns if '-1' not in c]]

In [31]:
# for categorical_column in categorical_variables:
#     lb = LabelBinarizerDict[categorical_column]
#     
#     # dummy dataframe 
#     dummy_df = pd.DataFrame(lb.transform(data_test[categorical_column]), index=None)
#     
#     # column names of dummy variables
#     dummy_column_names = [categorical_column + '_' + str(lb_class) for lb_class in list(lb.classes_)]
#     if len(dummy_column_names) == 2:
#         dummy_column_names = [dummy_column_names[0]]
#     # assign column names to dummy dataframe
#     dummy_df.columns = dummy_column_names
#     
#     data_test = pd.concat([data_test, dummy_df], axis=1)

In [35]:
X_test = data_test[dummy_categorical_variables + numerical_variables]

In [36]:
# random forest
y_test = clf.predict(X_test)

In [76]:
## submit the result
# The Kaggle Titanic submission file must have exactly the header
# "PassengerId,Survived" (capital S), in that order.
pd.DataFrame(
    {'PassengerId': data_test.PassengerId, 'Survived': y_test},
    columns=['PassengerId', 'Survived'],
).to_csv('solution.csv', header=True, index=False)

In [74]:
group_by_imputer = GroupByImputer(strategy="median")
group_by_imputer.fit(data_train, groupby_columns=['Pclass', 'Sex'], impute_column='Age')

<__main__.GroupByImputer at 0x7f14e2286208>

In [76]:
group_by_imputer.group_by_imputers_[1,'female'].statistics_

array([ 35.])

In [78]:
group_by_imputer.group_by_imputers_

Pclass  Sex   
1       female    Imputer(axis=0, copy=True, missing_values='NaN...
        male      Imputer(axis=0, copy=True, missing_values='NaN...
2       female    Imputer(axis=0, copy=True, missing_values='NaN...
        male      Imputer(axis=0, copy=True, missing_values='NaN...
3       female    Imputer(axis=0, copy=True, missing_values='NaN...
        male      Imputer(axis=0, copy=True, missing_values='NaN...
dtype: object

In [79]:
# fit() stores the grouping columns under the trailing-underscore name
group_by_imputer.groupby_columns_

['Pclass', 'Sex']

In [80]:
a = group_by_imputer.transform(data_train)

In [81]:
a.loc[data_train.Age.isnull(), ].head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
5,6,0,3,"Moran, Mr. James",male,25.0,0,0,330877,8.4583,,Q,Mr
17,18,1,2,"Williams, Mr. Charles Eugene",male,30.0,0,0,244373,13.0,,S,Mr
19,20,1,3,"Masselmani, Mrs. Fatima",female,21.5,0,0,2649,7.225,,C,Mrs
26,27,0,3,"Emir, Mr. Farred Chehab",male,25.0,0,0,2631,7.225,,C,Mr
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,21.5,0,0,330959,7.8792,,Q,Miss


In [82]:
data_train.loc[data_train.Age.isnull(), ].head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,Mr
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0,,S,Mr
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.225,,C,Mrs
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.225,,C,Mr
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q,Miss


In [82]:
def imputer_fit(x, impute_column):
    """Fit a median imputer on a single column of the frame ``x``.

    Returns whatever ``Imputer.fit`` returns for the one-column frame
    ``x[[impute_column]]``.
    """
    median_imputer = preprocessing.Imputer(strategy='median')
    single_column_frame = x[[impute_column]]
    return median_imputer.fit(single_column_frame)

In [83]:
groupby_columns = ['Pclass', 'Sex']
impute_column = 'Age'
data = data_train

In [84]:
imputers = data.groupby(groupby_columns).apply(lambda x: imputer_fit(x, impute_column))

In [85]:
def imputer_transform(x, imputers, impute_column):
    """Impute one column of a group frame with that group's fitted imputer.

    Args:
        x: group frame; ``x.name`` is used as the key into ``imputers``.
        imputers: mapping from group key to an object exposing ``transform``.
        impute_column: name of the column to overwrite with the imputed values.

    Returns:
        The same ``x`` object, modified in place.
    """
    target = [impute_column]
    group_imputer = imputers[x.name]
    x[target] = group_imputer.transform(x[target])
    return x

In [86]:
a = data_test.groupby(groupby_columns).apply(lambda x: imputer_transform(x, imputers, impute_column))

In [94]:
imputers.loc[3, 'male'].statistics_

array([ 25.])

In [89]:
a.loc[data_test['Age'].isnull(), 'Age']

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
10,902,3,"Ilieff, Mr. Ylio",male,25.0,0,0,349220,7.8958,,S
22,914,1,"Flegenheim, Mrs. Alfred (Antoinette)",female,35.0,0,0,PC 17598,31.6833,,S
29,921,3,"Samaan, Mr. Elias",male,25.0,2,0,2662,21.6792,,C
33,925,3,"Johnston, Mrs. Andrew G (Elizabeth Lily"" Watson)""",female,21.5,1,2,W./C. 6607,23.4500,,S
36,928,3,"Roth, Miss. Sarah A",female,21.5,0,0,342712,8.0500,,S
39,931,3,"Hee, Mr. Ling",male,25.0,0,0,1601,56.4958,,S
41,933,1,"Franklin, Mr. Thomas Parham",male,40.0,0,0,113778,26.5500,D34,S
47,939,3,"Shaughnessy, Mr. Patrick",male,25.0,0,0,370374,7.7500,,Q
54,946,2,"Mangiavacchi, Mr. Serafino Emilio",male,30.0,0,0,SC/A.3 2861,15.5792,,C
58,950,3,"Davison, Mr. Thomas Henry",male,25.0,1,0,386525,16.1000,,S


In [135]:
data_test.loc[data_test['Age'].isnull(), ]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
10,902,3,"Ilieff, Mr. Ylio",male,,0,0,349220,7.8958,,S
22,914,1,"Flegenheim, Mrs. Alfred (Antoinette)",female,,0,0,PC 17598,31.6833,,S
29,921,3,"Samaan, Mr. Elias",male,,2,0,2662,21.6792,,C
33,925,3,"Johnston, Mrs. Andrew G (Elizabeth Lily"" Watson)""",female,,1,2,W./C. 6607,23.4500,,S
36,928,3,"Roth, Miss. Sarah A",female,,0,0,342712,8.0500,,S
39,931,3,"Hee, Mr. Ling",male,,0,0,1601,56.4958,,S
41,933,1,"Franklin, Mr. Thomas Parham",male,,0,0,113778,26.5500,D34,S
47,939,3,"Shaughnessy, Mr. Patrick",male,,0,0,370374,7.7500,,Q
54,946,2,"Mangiavacchi, Mr. Serafino Emilio",male,,0,0,SC/A.3 2861,15.5792,,C
58,950,3,"Davison, Mr. Thomas Henry",male,,1,0,386525,16.1000,,S


In [136]:
c = data_test.copy()

In [137]:
c.loc[c['Age'].isnull(), 'Age'] = c.loc[c['Age'].isnull(),  ].apply(lambda x: imputers.loc[tuple(x[groupby_columns])].statistics_[0], axis = 1)

In [139]:
a.equals(data_test)

False

[3, 'female']

Pclass         3
Sex       female
Name: 1, dtype: object

In [29]:
b = a.loc[1, 'male']

In [17]:
imputers.loc[1, 'male'].statistics_

array([ 41.28138614])

In [9]:
a = data.groupby(groupby_columns)

In [10]:
a.groups.keys()

dict_keys([(1, 'female'), (1, 'male'), (2, 'female'), (2, 'male'), (3, 'female'), (3, 'male')])

In [60]:
def imputer_fit(x, impute_column):
    """Fit an imputer with its default settings on one column of ``x``.

    Returns whatever ``Imputer.fit`` returns for the one-column frame
    ``x[[impute_column]]``.
    """
    default_imputer = preprocessing.Imputer()
    column_frame = x[[impute_column]]
    return default_imputer.fit(column_frame)

In [None]:
def groupby_imputer_fit(data, groupby_columns, impute_column, strategy='median'):
    """Fit one imputer per group of ``data``.

    Completes the previously empty function body, following the
    ``imputer_fit`` / ``groupby(...).apply(...)`` experiment in the
    neighbouring cells.

    Args:
        data: DataFrame containing ``groupby_columns`` and ``impute_column``.
        groupby_columns: column name or list of column names to group by.
        impute_column: name of the column the imputers are fitted on.
        strategy: imputation strategy passed to the scikit-learn imputer.

    Returns:
        The result of ``groupby(...).apply(...)``: one fitted imputer per group.
    """
    def _fit_group(group):
        # one-column frame, as the imputer expects 2-D input
        return preprocessing.Imputer(strategy=strategy).fit(group[[impute_column]])

    return data.groupby(groupby_columns).apply(_fit_group)

In [61]:
b = a.apply(lambda x: imputer_fit(x, impute_column))

In [65]:
b.loc[1, 'male'].statistics_

array([ 41.28138614])

In [66]:
b.loc[2, 'male'].statistics_

array([ 30.74070707])

In [38]:
b.index

MultiIndex(levels=[[1, 2, 3], ['female', 'male']],
           labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]],
           names=['Pclass', 'Sex'])

In [55]:
c = b.loc[2,'female']

In [57]:
len(c.statistics_)

347

In [None]:
df.loc['bar', 'two']

In [23]:
type(b)

pandas.core.series.Series

In [26]:
b.index

MultiIndex(levels=[[1, 2, 3], ['female', 'male']],
           labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]],
           names=['Pclass', 'Sex'])

In [49]:
a.apply(lambda x:c.transform(x[impute_column]))



ValueError: X has 94 features per sample, expected 347

In [None]:
# Impute missing values by group
def impute_on_grouped_data(data, groupby_columns, column, strategy='median'):
    """Fill the missing values of ``column`` with a statistic of the row's group.

    The previous body was pasted from the title-extraction code and referenced
    names that do not exist in this function; it is replaced by the group-wise
    imputation the signature describes.

    Args:
        data: input DataFrame (left unmodified).
        groupby_columns: column name or list of column names to group by.
        column: name of the column to impute.
        strategy: 'median', 'mean' or 'most_frequent'.

    Returns:
        A copy of ``data`` in which missing values of ``column`` are replaced
        by the chosen statistic of their group. Values stay missing when the
        group has no observed value.

    Raises:
        ValueError: if ``strategy`` is not one of the supported names.
    """
    grouped = data.groupby(groupby_columns)[column]
    if strategy in ('mean', 'median'):
        fill_values = grouped.transform(strategy)
    elif strategy == 'most_frequent':
        fill_values = grouped.transform(
            lambda s: s.mode().iloc[0] if s.notnull().any() else float('nan')
        )
    else:
        raise ValueError(
            "strategy must be 'median', 'mean' or 'most_frequent', got %r" % (strategy,)
        )

    imputed = data.copy()
    # fill_values is aligned with data's index, so fillna works row by row
    imputed[column] = data[column].fillna(fill_values)
    return imputed

data_train = generate_title(data_train)

In [None]:
imputer = preprocessing.Imputer(missing_values='NaN', strategy=strategy, axis=0)

In [61]:
imputer = preprocessing.Imputer(missing_values='NaN', strategy='median', axis=0)

In [63]:
imputer.fit(data_train[['Age']])

Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)

In [None]:
imputer.transform(data_train[['Age']])

In [69]:
imputer.statistics_

array([ 28.])

In [71]:
imputer.missing_values

'NaN'

In [60]:
data_train.groupby(['Pclass'])

<pandas.core.groupby.DataFrameGroupBy object at 0x7f01263a2198>

In [None]:
import numpy as np
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit([[1, 2], [np.nan, 3], [7, 6]])

X = [[np.nan, 2], [6, np.nan], [7, 6]]
print(imp.transform(X))     

In [None]:
# Define a function (fill missing values with an aggregation result)
def fillna_with_agg_result(grouped_data, column_name, agg_method):
    """Fill missing values of one column with an aggregate of that column.

    Args:
        grouped_data: DataFrame (typically one group of a groupby); its
            ``column_name`` column is overwritten.
        column_name: name of the column to fill.
        agg_method: 'median', 'mean' or 'mode'. Any other value leaves the
            data unchanged.

    Returns:
        The same ``grouped_data`` object.
    """
    column = grouped_data[column_name]
    if agg_method == 'median':
        fill_value = column.median()
    elif agg_method == 'mean':
        fill_value = column.mean()
    elif agg_method == 'mode':
        modes = column.mode()
        if modes.empty:
            # every value is missing: there is no mode to fill with
            return grouped_data
        fill_value = modes.iloc[0]
    else:
        return grouped_data
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # a column selection, which does not update the frame under copy-on-write.
    grouped_data[column_name] = column.fillna(fill_value)
    return grouped_data

In [30]:
data_train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [29]:
# 1. extract title from name (text between the first comma and the next period)
data_train['Title'] = data_train.Name.str.extract(r',\s*([^\.]*)\s*\.', expand=False)
# Mr: Mr
# Master: Master
# Miss: Miss
# Mrs: Mrs
# Others: Rare
# Whole-value comparison instead of a regex built from the rare titles: it does
# not depend on the pandas `regex` default and needs no escaping.
# Names without an extractable title keep a missing value.
data_train['Title'] = data_train['Title'].where(
    data_train['Title'].isin(['Mr', 'Master', 'Miss', 'Mrs']) | data_train['Title'].isnull(),
    'Rare',
)

In [32]:
# Define a function (fill missing values with an aggregation result)
def fillna_with_agg_result(grouped_data, column_name, agg_method):
    """Fill missing values of one column with an aggregate of that column.

    Args:
        grouped_data: DataFrame (typically one group of a groupby); its
            ``column_name`` column is overwritten.
        column_name: name of the column to fill.
        agg_method: 'median', 'mean' or 'mode'. Any other value leaves the
            data unchanged.

    Returns:
        The same ``grouped_data`` object.
    """
    column = grouped_data[column_name]
    if agg_method == 'median':
        fill_value = column.median()
    elif agg_method == 'mean':
        fill_value = column.mean()
    elif agg_method == 'mode':
        modes = column.mode()
        if modes.empty:
            # every value is missing: there is no mode to fill with
            return grouped_data
        fill_value = modes.iloc[0]
    else:
        return grouped_data
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # a column selection, which does not update the frame under copy-on-write.
    grouped_data[column_name] = column.fillna(fill_value)
    return grouped_data

In [33]:
# 2. Fill missing values on Age
# With median value of each group(Pclass, Sex, Title)

# Apply the function on each group
data_train = data_train.groupby(['Pclass', 'Sex', 'Title'], as_index=False).\
apply(lambda grouped_data:fillna_with_agg_result(grouped_data, column_name='Age', agg_method='median'))

In [34]:
# 3. Fill missing values on Embarked
# With mode value
data_train = fillna_with_agg_result(data_train, 'Embarked', 'mode')

In [35]:
# 4. Fill missing values on Fare
# With median value of each group(Pclass)

# Apply the function on each group
data_train = data_train.groupby(['Pclass'], as_index=False).\
apply(lambda grouped_data:fillna_with_agg_result(grouped_data, column_name='Fare', agg_method='median'))

In [36]:
# 5. Handling on Cabin: 0 for null and 1 for not null
data_train['Cabin_flg'] = 0
data_train.loc[data_train.Cabin.notnull(), 'Cabin_flg'] = 1

In [37]:
# 6. Handling on Ticket

In [38]:
# 7. Convert SibSp with more than 2 into 'more_than_2'
data_train['SibSp_flg'] = data_train['SibSp']
data_train.loc[data_train['SibSp'] >= 2, 'SibSp_flg'] = 'more_than_2'
data_train['SibSp_flg'] = data_train['SibSp_flg'].astype(str)

In [39]:
# 8. Convert Parch with more than 2 into 'more_than_2'
data_train['Parch_flg'] = data_train['Parch']
data_train.loc[data_train['Parch'] >= 2, 'Parch_flg'] = 'more_than_2'
data_train['Parch_flg'] = data_train['Parch_flg'].astype(str)

## Preparation for modeling

In [40]:
# Feature list
numerical_variables = ['Age', 'Fare']
categorical_variables = ['Pclass', 'Sex', 'Embarked', 'Title', 'Cabin_flg', 'SibSp_flg', 'Parch_flg']
features = numerical_variables + categorical_variables

In [41]:
data_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,dataset_flg,Title,Cabin_flg,SibSp_flg,Parch_flg
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,train,Mr,0,1,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,train,Mrs,1,1,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,train,Miss,0,0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,train,Mrs,1,1,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,train,Mr,0,0,0


In [42]:
# Fit one LabelBinarizer per categorical column on the training data, append
# its dummy columns to data_train and keep the fitted binarizer for later reuse.
LabelBinarizerDict = dict()
dummy_categorical_variables = list()
for categorical_column in categorical_variables:
    lb = preprocessing.LabelBinarizer()
    # fit data
    lb.fit(data_train[categorical_column])
    # dummy dataframe
    # Reuse data_train's index: concat(axis=1) aligns on the index, so a fresh
    # 0..n-1 index would misalign the rows whenever data_train is indexed differently.
    dummy_df = pd.DataFrame(lb.transform(data_train[categorical_column]), index=data_train.index)
    
    # column names of dummy variables
    dummy_column_names = [categorical_column + '_' + str(lb_class) for lb_class in list(lb.classes_)]
    if len(dummy_column_names) == 2:
        # LabelBinarizer emits a single column for a two-class variable.
        # NOTE(review): that column is 1 for classes_[1], although it is named
        # after classes_[0] here; the name is kept so existing feature names do not change.
        dummy_column_names = [dummy_column_names[0]]
    # assign column names to dummy dataframe
    dummy_df.columns = dummy_column_names
    
    dummy_categorical_variables.extend(dummy_column_names)
    
    data_train = pd.concat([data_train, dummy_df], axis=1)
    
    LabelBinarizerDict[categorical_column] = lb

## train models

In [43]:
X_train = data_train[dummy_categorical_variables + numerical_variables]
y_train = data_train['Survived']

In [44]:
classifiers = OrderedDict()
classifiers['Logistic Regression'] = LogisticRegression()
classifiers['Decision Tree'] = DecisionTreeClassifier()
classifiers['Random Forest'] = RandomForestClassifier()
classifiers['AdaBoost'] = AdaBoostClassifier()
classifiers['Gradient Boosting'] = GradientBoostingClassifier()
classifiers['Naive Bayes'] = GaussianNB()
classifiers['XGBoost'] = XGBClassifier()
# Report the cross-validated performance of each classifier.
# The rows are collected in a list and the frame is built once:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
cv_records = []
for clf_name, classifier in classifiers.items():
    cv_scores = cross_val_score(classifier, X_train, y_train, cv=5)
    cv_records.append({'classifier': clf_name, 'cv_scores_mean': cv_scores.mean()})
cv_result_df = pd.DataFrame(cv_records, columns=['classifier', 'cv_scores_mean'])

In [45]:
cv_result_df

Unnamed: 0,classifier,cv_scores_mean
0,Logistic Regression,0.820474
1,Decision Tree,0.787902
2,Random Forest,0.809269
3,AdaBoost,0.809282
4,Gradient Boosting,0.832815
5,Naive Bayes,0.754306
6,XGBoost,0.827209


In [58]:
# XGBoost
clf = XGBClassifier()
clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

## parameter tuning

## predict on test data

In [46]:
# 1. extract title from name (text between the first comma and the next period)
data_test['Title'] = data_test.Name.str.extract(r',\s*([^\.]*)\s*\.', expand=False)
# Mr: Mr
# Master: Master
# Miss: Miss
# Mrs: Mrs
# Others: Rare
# Whole-value comparison instead of a regex built from the rare titles: it does
# not depend on the pandas `regex` default and needs no escaping.
# Names without an extractable title keep a missing value.
data_test['Title'] = data_test['Title'].where(
    data_test['Title'].isin(['Mr', 'Master', 'Miss', 'Mrs']) | data_test['Title'].isnull(),
    'Rare',
)

In [47]:
# 5. Handling on Cabin: 0 for null and 1 for not null
data_test['Cabin_flg'] = 0
data_test.loc[data_test.Cabin.notnull(), 'Cabin_flg'] = 1

In [48]:
# 7. Convert SibSp with more than 2 into 'more_than_2'
data_test['SibSp_flg'] = data_test['SibSp']
data_test.loc[data_test['SibSp'] >= 2, 'SibSp_flg'] = 'more_than_2'
data_test['SibSp_flg'] = data_test['SibSp_flg'].astype(str)

In [49]:
# 8. Convert Parch with more than 2 into 'more_than_2'
data_test['Parch_flg'] = data_test['Parch']
data_test.loc[data_test['Parch'] >= 2, 'Parch_flg'] = 'more_than_2'
data_test['Parch_flg'] = data_test['Parch_flg'].astype(str)

In [54]:
# Encode the test data with the binarizers fitted on the training data.
# Refitting on the test data could produce a different set or order of dummy
# columns than the model was trained on, so LabelBinarizerDict from the
# training cell is reused here and is not overwritten.
dummy_categorical_variables = list()
for categorical_column in categorical_variables:
    lb = LabelBinarizerDict[categorical_column]
    # dummy dataframe, aligned on data_test's own index for the concat below
    dummy_df = pd.DataFrame(lb.transform(data_test[categorical_column]), index=data_test.index)
    
    # column names of dummy variables (same naming rule as for the training data)
    dummy_column_names = [categorical_column + '_' + str(lb_class) for lb_class in list(lb.classes_)]
    if len(dummy_column_names) == 2:
        dummy_column_names = [dummy_column_names[0]]
    # assign column names to dummy dataframe
    dummy_df.columns = dummy_column_names
    
    dummy_categorical_variables.extend(dummy_column_names)
    
    data_test = pd.concat([data_test, dummy_df], axis=1)

In [57]:
X_test = data_test[dummy_categorical_variables + numerical_variables]

In [62]:
# XGBoost
y_test = clf.predict(X_test)

In [70]:
## submit the result
# The Kaggle Titanic submission file must have exactly the header
# "PassengerId,Survived" (capital S), in that order.
pd.DataFrame(
    {'PassengerId': data_test.PassengerId, 'Survived': y_test},
    columns=['PassengerId', 'Survived'],
).to_csv('solution.csv', header=True, index=False)