In [0]:
!pip install optuna
!pip install featuretools
!pip install heamy

In [0]:
!unzip titanic.zip

In [0]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import optuna
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier , AdaBoostClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC


In [0]:

'''
データのロード
'''
train = pd.read_csv("train.csv")      # (891, 12)
test = pd.read_csv("test.csv")        # (418, 11)

full_data = pd.concat([train,test])

train_len = len(train)      # 891
test_len = len(test)        # 418

In [0]:
def missing_table(df):      # データの欠損を計算
    null_val = df.isnull().sum()
    percent = 100 * df.isnull().sum()/len(df)
    missing_table = pd.concat([null_val, percent], axis=1)
    missing_table_ren_columns = missing_table.rename(
        columns = {0:"欠損数", 1:"%"}
    )
    return missing_table_ren_columns

# print(missing_table(train))
# print(missing_table(test))

In [0]:
train.nunique()

In [0]:
'''
データの前処理
'''
# 欠損値の補正
test["Fare"] = test["Fare"].fillna(test["Fare"].median())
train['Embarked'] = train['Embarked'].fillna('S')
# full_data["Fare"] = full_data["Fare"].fillna(full_data["Fare"].median())
# full_data['Embarked'] = full_data['Embarked'].fillna('S')

# 名前の敬称を抽出
for i in range(len(train)):
  train["Name"][i] = train["Name"][i].split('.')[0].split(', ')[1]
for i in range(len(test)):
  test["Name"][i] = test["Name"][i].split('.')[0].split(', ')[1]
# for i in range(len(full_data)):
#   full_data["Name"][i] = full_data["Name"][i].split('.')[0].split(', ')[1]
full_data = pd.concat([train,test])

# 家族数を計算
# train['family'] = train['SibSp'] + train['Parch']
# test['family'] = test['SibSp'] + test['Parch']
full_data['family'] = full_data['SibSp'] + full_data['Parch']

# ダミーデータを生成
def add_dummy(df):
    df['Pclass'] = df['Pclass'].astype(np.str)
    temp = pd.get_dummies(df[['Sex','Embarked','Pclass','Name']], drop_first = False)
    temp['PassengerId'] = df['PassengerId']
    return pd.merge(df, temp)
# train = add_dummy(train)
# test = add_dummy(test)
full_data = add_dummy(full_data)

In [0]:
x_train_map = full_data[:len(train)].drop(columns=['PassengerId','Name','Sex', 'Ticket','Embarked','Age','Cabin','Pclass'])
x_train = full_data[:len(train)].drop(columns=['Survived', 'PassengerId','Name','Sex', 'Ticket','Embarked','Age','Cabin','Pclass'])
x_test = full_data[len(train):].drop(columns=['Survived','PassengerId','Name','Sex', 'Ticket','Embarked','Age','Cabin','Pclass'])
y_train = full_data[:len(train)]['Survived']

# x_train_demo = full_data.loc[:,[
#     'family',
#     'Fare',
#     'Sex_female',
#     'Sex_male',
#     'Pclass_1',
#     'Pclass_3',
#     'Name_Miss',
#     'Name_Mr'
# ]]
# x_train_demo = x_train_demo[:len(train)]
# x_test_demo = full_data.loc[:,[
#     'family',
#     'Fare',
#     'Sex_female',
#     'Sex_male',
#     'Pclass_1',
#     'Pclass_3',
#     'Name_Miss',
#     'Name_Mr'
# ]]
# x_test_demo = x_test_demo[len(train):]

In [0]:
corrmat = x_train_map.corr()
f, ax = plt.subplots(figsize=(20,12))
sns.heatmap(corrmat, vmax=.8, annot = True, center = 0)

In [0]:
'''
特徴量の選択
'''
from boruta_py import BorutaPy

model = RandomForestClassifier(max_depth=5)
feat_selector = BorutaPy(
    model,
    n_estimators='auto',
    two_step= True,
    verbose=2,
    random_state=42,
    perc=80,
    max_iter=100
)

In [0]:
feat_selector.fit(x_train.values, y_train.values)

In [0]:
x_train_selected = x_train.iloc[:,feat_selector.support_]
x_test_selected = x_test.iloc[:,feat_selector.support_]

In [0]:
# k分割交差検証
from sklearn.model_selection import cross_val_score

def kfold(model, x, y, cv):
  data_s = pd.concat([x, y], axis=1)
  data_s = data_s.sample(frac=1)
  x_s = data_s.drop(columns=['Survived'])
  y_s = data_s['Survived'].as_matrix()
  scores = cross_val_score(model, x_s, y_s, cv=cv)
  print('Cross-Validation scores: {}'.format(scores))
  print('Average score: {}'.format(np.mean(scores)))
  return float(format(np.mean(scores)))

In [254]:
'''
モデルの構築(Stacking)
'''
from heamy.dataset import Dataset
from heamy.estimator import Regressor
from heamy.pipeline import ModelsPipeline
from sklearn.linear_model import LinearRegression

# data_s = pd.concat([x_train_selected, y_train], axis=1)
data_s = pd.concat([x_train, y_train], axis=1)
data_s = data_s.sample(frac=1)
x_s = data_s.drop(columns=['Survived'])
y_s = data_s['Survived'].as_matrix()
(x_s, x_val, y_s, y_val) = train_test_split(x_s, y_s, test_size=0, random_state=666)

# dataset = Dataset(x_s, y_s, x_val)
dataset = Dataset(x_s, y_s, x_test)


Method .as_matrix will be removed in a future version. Use .values instead.



In [0]:
rf_params = {
    'n_jobs': -1,
    'n_estimators': 1000,
    'warm_start': True,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'verbose': 0,
    'random_state':10
}

#Extra Trees Parameters
et_params = {
    'n_jobs': -1,
    'n_estimators':1000,
    'max_depth': 9,
    'min_samples_split': 6,
    'min_samples_leaf': 4,
    'verbose': 0,
    'random_state':10
}

#AdaBoost parameters
ada_params = {
    'n_estimators': 1000,
    'learning_rate' : 0.75,
    'random_state':10
}

#Gradient Boosting parameters
gb_params = {
    'n_estimators': 1000,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0,
    'random_state':10
}

#Support Vector Classifier parameters 
svc_params = {
    'kernel' : 'linear',
    'C' : 0.025,
    'random_state':10
    }
    
#Perceptron Parameters
per_params = {
    'n_iter':50,
    'n_jobs':-1,
    'random_state':10
    }
    
#XGBoost Parameters
xgb_params = {
    'n_estimators':2000,
    'learning_rate':0.1,
    'max_depth':3,
    'min_child_weight':2, 
    'gamma':0.2,
    'subsample':0.85,
    'colsample_bytree':0.5,
    'objective':'binary:logistic',
    'scale_pos_weight':1,
    'reg_alpha':1.05,
    'seed':10
}

In [0]:
# アンサンブルに使うモデルの定義
models = [
    Regressor(dataset=dataset, estimator=RandomForestClassifier, parameters=rf_params, name='rf'),
#     Regressor(dataset=dataset, estimator=KNeighborsClassifier, name='kn'),
    Regressor(dataset=dataset, estimator=ExtraTreesClassifier, parameters=et_params, name='et'),
    Regressor(dataset=dataset, estimator=AdaBoostClassifier, parameters=ada_params, name='ada'),
    Regressor(dataset=dataset, estimator=SVC, parameters=svc_params, name='svc'),
    Regressor(dataset=dataset, estimator=Perceptron, parameters=per_params, name='per'),
    Regressor(dataset=dataset, estimator=XGBClassifier, parameters=xgb_params, name='xgb'),
#     Regressor(dataset=dataset, estimator=GradientBoostingClassifier, parameters=gb_params, name='gb')
]

In [257]:
# pipelineを定義、2nd levelデータセットの作成
pipeline = ModelsPipeline(*models)
stack_ds = pipeline.stack(k=10, seed=0)


n_iter parameter is deprecated in 0.19 and will be removed in 0.21. Use max_iter and tol instead.


n_iter parameter is deprecated in 0.19 and will be removed in 0.21. Use max_iter and tol instead.


n_iter parameter is deprecated in 0.19 and will be removed in 0.21. Use max_iter and tol instead.


n_iter parameter is deprecated in 0.19 and will be removed in 0.21. Use max_iter and tol instead.


n_iter parameter is deprecated in 0.19 and will be removed in 0.21. Use max_iter and tol instead.


n_iter parameter is deprecated in 0.19 and will be removed in 0.21. Use max_iter and tol instead.


n_iter parameter is deprecated in 0.19 and will be removed in 0.21. Use max_iter and tol instead.


n_iter parameter is deprecated in 0.19 and will be removed in 0.21. Use max_iter and tol instead.


n_iter parameter is deprecated in 0.19 and will be removed in 0.21. Use max_iter and tol instead.


n_iter parameter is deprecated in 0.19 and will be removed in 0.21. Use max_iter and tol instead.



In [0]:
# modelを作ってvalidation
stacker = Regressor(dataset=stack_ds, estimator=ExtraTreesClassifier, parameters={
    'n_estimators':2000, 
    'max_depth':4, 
    'min_samples_split':6, 
    'min_samples_leaf':2, 
    'max_features':0.8, 
    'n_jobs':-1, 
    'random_state':10, 
    'verbose':0
})
y_trues, y_preds = stacker.validate(k=10)

In [262]:
for i in range(len(y_trues)):
  print("Accuracy_v:",accuracy_score(y_trues[i], np.round(y_preds[i])))

Accuracy_v: 0.8777777777777778
Accuracy_v: 0.797752808988764
Accuracy_v: 0.8426966292134831
Accuracy_v: 0.7640449438202247
Accuracy_v: 0.8539325842696629
Accuracy_v: 0.8426966292134831
Accuracy_v: 0.8314606741573034
Accuracy_v: 0.8314606741573034
Accuracy_v: 0.7752808988764045
Accuracy_v: 0.8314606741573034


In [0]:
pred_stacker = stacker.predict()

In [0]:
pred_stacker = np.round(pred_stacker).astype(int)

In [0]:
'''
評価
'''
print("Accuracy_v:",accuracy_score(pred_stacker, y_val))

In [0]:
'''
提出
'''
from datetime import datetime, timedelta, timezone
JST = timezone(timedelta(hours=+9), 'JST')
ts = datetime.now(JST).strftime('%y%m%d%H%M')

y_test = pred_stacker

test["Survived"] = y_test.astype(np.int)
test[["PassengerId","Survived"]].to_csv(('submit_'+ts+'_stacker.csv'),index=False)