# タイタニックの乗客の生存を予想

## import

In [2]:
import pandas as pd

## データの取得

In [3]:
df_train = pd.read_csv("train.csv")
df_test    = pd.read_csv("test.csv")

In [4]:
print(type(df_train))
print(df_train.shape) #891人分のデータで12変数
print(df_test.shape) #891人文のデータで11変数(生存の情報がない)
df_train.head()

<class 'pandas.core.frame.DataFrame'>
(891, 12)
(418, 11)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## 訓練データを整形
drop : 選択したカラムを削除 
axis = 1で列方向に削除

iloc : データフレームから抜き出す
型はデータフレームからseriesになる

In [5]:
#使わない変数をなくす
df_train = df_train.drop(['PassengerId','Name','Ticket', 'Cabin', 'Embarked'], axis=1)

In [6]:
#説明変数
X_train = df_train.iloc[:, 1:7]

#目的変数
Y_train = df_train.iloc[:, 0]

In [7]:
#訓練データにだけ、2つの欠損値を最頻出のSで埋める(Emberkedで欠損)
#X_train["Embarked"] = X_train["Embarked"].fillna("S")

#年齢の欠損値を中央値で置き換える
X_train["Age"].fillna(X_train.Age.median(), inplace=True)

In [8]:
#maleとfemaleのデータを0,1に変える
X_train = X_train.replace("male",  0)
X_train = X_train.replace("female", 1)

In [66]:
print(type(X_train))
print(type(Y_train))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


## 訓練データを開発8割、評価2割で分ける
X_dev : 開発用の訓練データの説明変数(712, 8)
Y_dev : 開発用の訓練データの目的変数(712,1)

X_val :  評価用の訓練データの説明変数(179, 8)
Y_val :  評価用の訓練データの目的変数(179,1)

In [9]:
from sklearn.model_selection import train_test_split

X_dev, X_val, Y_dev, Y_val = train_test_split(X_train, Y_train, train_size=0.8, random_state=0)

## データの標準化

In [11]:
from sklearn.preprocessing import StandardScaler

#開発用
scaler = StandardScaler()#変換器の初期化
scaler.fit(X_dev)#開発データに合わせる,ないとエラー
X_dev_scaled = scaler.transform(X_dev)#標準化されたデータが返される

#評価用
scaler.fit(X_val)#開発データに合わせる,ないとエラー
X_val_scaled = scaler.transform(X_val)#標準化されたデータが返される

# ロジスティック回帰

In [12]:
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression(penalty='l2', C=0.1)
classifier.fit(X_dev_scaled, Y_dev)

Y_val_pred = classifier.predict(X_val_scaled)

## マクロ平均AUCで評価

In [13]:
from sklearn.metrics import roc_auc_score
roc_auc_score(Y_val, Y_val_pred, average='macro')

0.78418972332015813

## パイプライン
パイプラインの名前の部分('clf'とか)は
交差検証するときのパラメータ選択のとこで使う

In [14]:
from sklearn.pipeline import Pipeline

steps = [('scaler', StandardScaler()), ('clf', LogisticRegression(penalty='l2'))]#ロジクティック回帰

pipeline = Pipeline(steps)#まとめた

## 交差検証
GridSearchCV(識別器, 最適化したいハイパーパラメータ, 交差検証の回数, 評価関数)
https://qiita.com/SE96UoC5AfUt7uY/items/c81f7cea72a44a7bfd3a
評価法
https://qiita.com/nazoking@github/items/958426da6448d74279c7
make_scorer関数
make_scorer(スコア関数(or損失関数),  **kwargs)

accuracyはスコアではなく評価指標なので、これを交差検証のすこか関数としては使えない

In [15]:
from sklearn.model_selection import GridSearchCV#交差検証によりハイパーパラメータの選択できる
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score

params = {'clf__C': [0.01, 0.1, 1.0, 10, 100]}#正則化パラメータの選択
scorer = make_scorer(roc_auc_score, average='macro')#, needs_proba=True)

predictor = GridSearchCV(pipeline, params, cv=5, scoring = scorer)#cvで交差検証数

In [16]:
#モデルをfit
predictor.fit(X_dev, Y_dev)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'clf__C': [0.01, 0.1, 1.0, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(roc_auc_score, average=macro), verbose=0)

In [17]:
#評価
Y_val_pred = predictor.predict(X_val)
print(roc_auc_score(Y_val, Y_val_pred, average='macro')) #性能評価
print(accuracy_score(Y_val, Y_val_pred))

0.79953886693
0.810055865922


# SVM

## pipeline

In [19]:
from sklearn import svm

estimators = [('scaler', StandardScaler()), ('svm', svm.SVC())]

pl = Pipeline(estimators)

## 交差検証
tolist : 多次元配列でもすべてリストへ変換されます
svmのパラメータ
kernel : ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’ or a callableのどれか
C : 誤差項のペラルティパラメータ
gamma : ‘rbf’, ‘poly’ and ‘sigmoid’のパラメータ

In [20]:
import numpy as np

parameters = {"svm__kernel" : ["linear", "poly", "rbf", "sigmoid"],
              'svm__C': [0.01, 0.1, 1],         #np.logspace(0, 2, 10).tolist(),
              "svm__gamma": [0.01, 0.1, 1]}          #np.logspace(-3, 0, 10).tolist()}

In [21]:
clf = GridSearchCV(pl, parameters, cv=5)
clf.fit(X_dev, Y_dev)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svm', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'svm__C': [0.01, 0.1, 1, 10], 'svm__kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'svm__gamma': [0.01, 0.1, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [22]:
Predict = clf.predict(X_val)
print(roc_auc_score(Y_val, Predict, average='macro')) #性能評価
print(accuracy_score(Y_val, Predict))

0.791436100132
0.810055865922


In [23]:
#全てのパラメータの組み合わせでの結果
#clf.grid_scores_

In [24]:
clf.best_params_

{'svm__C': 1, 'svm__gamma': 0.1, 'svm__kernel': 'rbf'}

# ランダムフォレスト

## pipeline

In [27]:
from sklearn.ensemble import RandomForestClassifier

forest = [('scaler', StandardScaler()), ('forest', RandomForestClassifier())]

pl2 = Pipeline(forest)

## 交差検証

In [56]:
para = {
         'forest__n_estimators'  : [10, 30, 50],
         'forest__min_samples_split' : [3, 5, 10, 15, 20],
         'forest__max_depth' : [3, 10, 20, 30]
}

clf2 = GridSearchCV(pl2, para, cv=5)#, scoring = scorer)#cvで交差検証数

In [57]:
clf2.fit(X_dev, Y_dev)
Pred = clf2.predict(X_val)
#print(roc_auc_score(Y_val, Pred, average='macro')) #性能評価
#print(accuracy_score(Y_val, Pred))

In [60]:
print(clf2.best_params_)
print(roc_auc_score(Y_val, Pred, average='macro')) #性能評価
print(accuracy_score(Y_val, Pred))

{'forest__min_samples_split': 15, 'forest__n_estimators': 50, 'forest__max_depth': 10}
0.808761528327
0.837988826816
