# Kaggle-Titanic


### データの読み込み

In [1]:
import numpy as np
import pandas as pd


In [2]:
def load_csv(data_dir="datasets"):
    """Load the Titanic CSV files and split out the labels and passenger ids.

    Args:
        data_dir: Directory that contains ``train.csv``, ``test.csv`` and
            ``gender_submission.csv``. Defaults to ``"datasets"``, the
            location that used to be hard-coded, so existing calls with no
            argument behave as before.

    Returns:
        A 5-tuple ``(train, Y_train, test, Y_test, PassengerId)``:
        ``train`` is the full ``train.csv`` frame (it still contains the
        ``Survived`` column), ``Y_train`` is that ``Survived`` column,
        ``test`` is the ``test.csv`` frame, ``Y_test`` is the ``Survived``
        column of ``gender_submission.csv`` and ``PassengerId`` is the
        ``PassengerId`` column of ``test.csv`` as an integer NumPy array.
    """
    import os  # local import keeps this cell self-contained

    train = pd.read_csv(os.path.join(data_dir, "train.csv"))
    test1 = pd.read_csv(os.path.join(data_dir, "test.csv"))
    test2 = pd.read_csv(os.path.join(data_dir, "gender_submission.csv"))

    Y_train = train['Survived']
    Y_test = test2['Survived']

    # Kept separately because it is used as the index of the submission file.
    PassengerId = np.array(test1["PassengerId"]).astype(int)

    return train, Y_train, test1, Y_test, PassengerId

In [3]:
# Load the frames and labels, then report their sizes.
X_train, Y_train, X_test, Y_test, PassengerId = load_csv()

# DataFrame.size is the total number of cells (rows x columns), not the row count.
train_cells = X_train.size
test_cells = X_test.size
print("Train Size: {}".format(train_cells))
print("Test Size: {}".format(test_cells))
print("Train Shape: {}".format(X_train.shape))
X_train.head(n=10)

Train Size: 10692
Test Size: 4598
Train Shape: (891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


### 欠測値の確認

In [4]:
# Count the missing values in each column of the training frame.
X_train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
# Count the missing values in each column of the test frame.
X_test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

* これを見ると、"Cabin"はデータセット中のほとんどが欠測値で、予測には使えなさそう。

* "Age"は生存率の予測に重要なので欠測値を補完して使う。



## 特徴量選択

- 予測には、"Pclass", "Sex", "Age", "Fare"と新たに生成する"Family_Size"(同乗した家族の人数)を用いる
    - 家族は同じボートに乗ろうとするため、大家族は不利であった。
- "Age"の欠測値の予測には"Name"特徴量の敬称("Mr","Miss","Mrs","Master")を用いる



### 欠測値の補完

"Age"の欠測部分の補完を行う

"Name"の敬称を取り出す

In [6]:
# Both frames get the same feature engineering. The list holds references,
# so columns added through it appear on X_train / X_test themselves.
datasets = [X_train, X_test]

# The title is the run of letters that follows a space and ends with a period.
# The pattern is a raw string because "\." is an invalid escape sequence in a
# normal string literal; the regex itself is unchanged.
for data in datasets:
    data['Title'] = data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

In [7]:
# Frequency of each title value in the training frame.
X_train['Title'].value_counts()

Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Mlle          2
Col           2
Major         2
Jonkheer      1
Mme           1
Sir           1
Capt          1
Lady          1
Countess      1
Don           1
Ms            1
Name: Title, dtype: int64

In [8]:
# Frequency of each title value in the test frame.
X_test['Title'].value_counts()

Mr        240
Miss       78
Mrs        72
Master     21
Col         2
Rev         2
Ms          1
Dr          1
Dona        1
Name: Title, dtype: int64

やはり敬称(特に"Mr","Miss","Mrs","Master")が使えそう
敬称に数値をマッピングする

In [9]:
# Integer code for each title: the four common titles get their own code,
# "Dr" and "Rev" share code 5, and every other title seen in the data is 4.
title_mapping = {"Mr": 0, "Miss": 1, "Mrs": 2, "Master": 3}
title_mapping.update(dict.fromkeys(["Dr", "Rev"], 5))
title_mapping.update(dict.fromkeys(
    ["Col", "Major", "Mlle", "Countess", "Ms", "Lady",
     "Jonkheer", "Don", "Dona", "Mme", "Capt", "Sir"],
    4,
))

# Replace the title strings by their codes; a title missing from the mapping becomes NaN.
for data in datasets:
    data['Title'] = data['Title'].map(title_mapping)

"Age"の欠測部分を同一敬称内の中央値で埋める

In [10]:
# Fill missing ages with the median age of the rows that share the same Title
# code, computed separately inside each frame.
# The result is assigned back to the column: calling fillna(inplace=True) on
# df["Age"] is chained assignment, which pandas warns about and which does not
# update the frame when Copy-on-Write is enabled.
X_train["Age"] = X_train["Age"].fillna(X_train.groupby("Title")["Age"].transform("median"))
X_test["Age"] = X_test["Age"].fillna(X_test.groupby("Title")["Age"].transform("median"))

In [11]:
# Preview the first rows of the training frame.
X_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,2
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,2
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0


In [12]:
# Preview the first rows of the test frame.
X_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,2
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,2


"Fare"の欠測値を同一"Pclass"内の中央値で埋める

In [13]:
# Fill missing fares with the median fare of the same Pclass, computed
# separately inside each frame.
# Assigned back instead of fillna(inplace=True) on df["Fare"]: that form is
# chained assignment and does not update the frame under Copy-on-Write.
X_train["Fare"] = X_train["Fare"].fillna(X_train.groupby("Pclass")["Fare"].transform("median"))
X_test["Fare"] = X_test["Fare"].fillna(X_test.groupby("Pclass")["Fare"].transform("median"))

### Family_Sizeの作成
"Family_Size"は，基本的には"Name"特徴量のLast Nameでまとめて考える．"Cabin"や"Embarked"が同じで苗字が同じ場合は家族である可能性が高い。逆に、苗字が同じでも"Cabin"や"Embarked"が違えば、家族でない可能性が高い。

これらを考慮して"Family_Size"を作成


In [14]:
# The last name is taken as the run of ASCII letters directly before the first
# such comma in "Name". Only letters are captured, so a surname containing a
# space, hyphen or apostrophe is reduced to its final letter-only part.
# Raw string: "\," is an invalid escape sequence in a normal string literal;
# the regex itself is unchanged.
for data in datasets:
    data['LastName'] = data['Name'].str.extract(r'([A-Za-z]+)\,', expand=False)

In [15]:
# Frequency of each last-name value in the training frame.
X_train['LastName'].value_counts()

Andersson     9
Sage          7
Skoog         6
Goodwin       6
Carter        6
Johnson       6
Panula        6
Rice          5
Lefebre       4
Smith         4
Hart          4
Baclini       4
Harper        4
Ford          4
Kelly         4
Jensen        4
Brown         4
Asplund       4
Harris        4
Williams      4
Gustafsson    4
Palsson       4
Fortune       4
Goldsmith     3
Flynn         3
Laroche       3
Hansen        3
Boulos        3
Hickman       3
Bourke        3
             ..
Foo           1
Heininen      1
Drazenoic     1
Smiljanic     1
Elsbury       1
Woolner       1
Davison       1
Lesurer       1
Reuchlin      1
Leinonen      1
Najib         1
Gaskell       1
Razi          1
Parkes        1
Vestrom       1
Chibnall      1
Bazzani       1
Green         1
Davidson      1
Yousif        1
Connell       1
Stead         1
Uruchurtu     1
Maioni        1
Ayoub         1
Stoytcheff    1
Fischer       1
Lehmann       1
Cumings       1
Shelley       1
Name: LastName, Length: 

In [16]:
# Family_Size = Parch + SibSp + 1 (the +1 presumably counts the passenger
# themself; confirm against the dataset description).
for data in datasets:
    data['Family_Size'] = data['Parch'].add(data['SibSp']).add(1)

# Distribution of family sizes in the training frame.
X_train['Family_Size'].value_counts()

1     537
2     161
3     102
4      29
6      22
5      15
7      12
11      7
8       6
Name: Family_Size, dtype: int64

In [17]:
# Cabin values flagged as shared rooms: an exact match with room4 gives
# Share_Room = 2, an exact match with room3 gives 1, anything else stays 0.
room4 = ["G6", "B96 B98", "C23 C25 C27"]
room3 = ["F33", "C22 C26", "F2", "E101", "D"]

for data in datasets:
    # "U" marks an unknown cabin. The result is assigned back because
    # fillna(inplace=True) on data["Cabin"] is chained assignment.
    data["Cabin"] = data["Cabin"].fillna("U")
    data['Share_Room'] = 0
    # Boolean masks with .loc write to the frame itself. The previous
    # data['Share_Room'][i] = ... form is chained indexing, which raised
    # SettingWithCopyWarning and is not guaranteed to modify the frame.
    # room3 is applied after room4, matching the order of the original loops.
    data.loc[data["Cabin"].isin(room4), 'Share_Room'] = 2
    data.loc[data["Cabin"].isin(room3), 'Share_Room'] = 1

X_train['Share_Room'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0    864
1     15
2     12
Name: Share_Room, dtype: int64

## データセット整形
'Sex'を男性(0)、女性(1)にラベル化

使わない特徴量を学習データから削除

In [18]:
# Encode 'Sex' as an integer label: male -> 0, female -> 1.
sex_mapping = {"male": 0, "female": 1}
for data in datasets:
    data['Sex'] = data['Sex'].map(sex_mapping)

# Columns that are not fed to the model.
drop_features = ['PassengerId', 'Name', 'Ticket', 'SibSp', 'Parch', 'Cabin', 'Embarked', 'Title', 'LastName']

# Drop in place so the frames referenced by `datasets` are updated too.
# The training frame additionally loses its label column 'Survived'.
X_train.drop(columns=drop_features, inplace=True)
X_train.drop(columns=['Survived'], inplace=True)
X_test.drop(columns=drop_features, inplace=True)

In [19]:
# Preview the first rows of the training frame.
X_train.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Family_Size,Share_Room
0,3,0,22.0,7.25,2,0
1,1,1,38.0,71.2833,2,0
2,3,1,26.0,7.925,1,0
3,1,1,35.0,53.1,2,0
4,3,0,35.0,8.05,1,0


In [20]:
# Preview the first rows of the test frame.
X_test.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Family_Size,Share_Room
0,3,0,34.5,7.8292,1,0
1,3,1,47.0,7.0,2,0
2,2,0,62.0,9.6875,1,0
3,3,0,27.0,8.6625,1,0
4,3,1,22.0,12.2875,3,0


In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt  # `plt` was used below but never imported in this notebook

# Project the training features onto their first two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_train)

# Plot.
# fit_transform returns an (n_samples, n_components) array, so the two
# components are the columns. X_pca[0] / X_pca[1] would be the first two
# samples, not the two axes. A scatter plot is used because the samples have
# no meaningful order to connect with a line.
plots = plt.scatter(X_pca[:, 0], X_pca[:, 1], label="training samples")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.grid()
plt.legend()  # needs a labelled artist, supplied by label= above
plt.show()

## 予測

予測を行う
グリッドサーチでロジスティック回帰、ランダムフォレスト、SVCの中からモデル選択

In [21]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

def Model(X_train, Y_train, X_test, PassengerId):
    """Select the best model by grid search and write the submission file.

    A two-step pipeline (``preprocessing`` then ``classifier``) is searched
    with 5-fold cross-validation over logistic regression, SVC and random
    forest candidates. The best parameters and the best cross-validation
    accuracy are printed, then the predictions for ``X_test`` are written to
    ``Titanic_Solution.csv`` with ``PassengerId`` as the index.

    Args:
        X_train: Training features passed to ``GridSearchCV.fit``.
        Y_train: Training labels passed to ``GridSearchCV.fit``.
        X_test: Features to predict for the submission.
        PassengerId: Index values for the submission rows.

    Returns:
        None. Results are printed and written to disk.
    """
    # The initial steps are placeholders; every grid below overrides both.
    pipeline = Pipeline(steps=[('preprocessing', StandardScaler()), ('classifier', SVC())])

    logistic_grid = {
        'classifier': [LogisticRegression()],
        'preprocessing': [StandardScaler(), None],
        'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
    }
    svc_grid = {
        'classifier': [SVC()],
        'preprocessing': [StandardScaler(), None],
        'classifier__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
        'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
    }
    # The random forest is only tried without the scaling step.
    forest_grid = {
        'classifier': [RandomForestClassifier()],
        'preprocessing': [None],
        'classifier__max_features': [1, 2, 3],
        'classifier__n_estimators': [10, 20, 30, 50, 80],
    }

    search = GridSearchCV(estimator=pipeline,
                          param_grid=[logistic_grid, svc_grid, forest_grid],
                          cv=5)
    search.fit(X_train, Y_train)

    print("Best parameters: {}".format(search.best_params_))
    print("Best cross-validation accuracy: {:.2f}".format(search.best_score_))

    # Predict with the best estimator found by the search and save the result.
    predictions = search.predict(X_test)
    Titanic_Solution = pd.DataFrame(data=predictions, index=PassengerId, columns=["Survived"])
    Titanic_Solution.to_csv("Titanic_Solution.csv", index_label=["PassengerId"])

In [22]:
# Run the model search; Model prints the best parameters and score and writes Titanic_Solution.csv.
Model(X_train, Y_train, X_test, PassengerId)

Best parameters: {'classifier': SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False), 'classifier__C': 10, 'classifier__gamma': 0.1, 'preprocessing': StandardScaler(copy=True, with_mean=True, with_std=True)}
Best cross-validation accuracy: 0.83


## 今後

- 学習を邪魔している外れ値の様なものがないか調査
- 階層ベイズモデルor混合モデルを適用
- 欠測値を中央値で補完するのではなく，欠測値自体も予測する(kNN等により)