## ◉ モジュール、データ読み込み

In [1]:
import pandas as pd
import numpy as np

df_train = pd.read_csv("../input/titanic/train.csv")
df_test  = pd.read_csv("../input/titanic/test.csv")

In [2]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
print(f"len: {len(df_train)}")
print("-" * 20)
print(df_train.dtypes)
print("-" * 20)
print(df_train.isnull().sum())

len: 891
--------------------
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object
--------------------
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [4]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
print(f"len: {len(df_test)}")
print("-" * 20)
print(df_test.dtypes)
print("-" * 20)
print(df_test.isnull().sum())

len: 418
--------------------
PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object
--------------------
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


## ◉ データ加工

### ● PassengerId, Ticket, Cabinの削除

In [6]:
df_train.drop(columns=["PassengerId", "Ticket", "Cabin"], inplace=True)
df_test.drop( columns=[               "Ticket", "Cabin"], inplace=True) # testのPassengerIdは評価の際に使うため削除しない

In [7]:
df_train.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S


In [8]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,7.8292,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,7.0,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,9.6875,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,8.6625,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,12.2875,S


### ● trainのEmbarkedが欠損値の行を削除、SexとEmbarkedの値を数値に変換

In [9]:
df_train.dropna(subset=["Embarked"], inplace=True)

print(len(df_train))

889


In [10]:
list_df = [df_train, df_test]

In [11]:
for df in list_df:
    df["Sex"].replace({"male": 0, "female": 1}, inplace=True)
    df["Embarked"].replace({"S": 0, "C": 1, "Q": 2}, inplace=True)

In [12]:
df_train.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,7.25,0
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,71.2833,1
2,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,7.925,0
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,53.1,0
4,0,3,"Allen, Mr. William Henry",0,35.0,0,0,8.05,0


In [13]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,"Kelly, Mr. James",0,34.5,0,0,7.8292,2
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,7.0,0
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,9.6875,2
3,895,3,"Wirz, Mr. Albert",0,27.0,0,0,8.6625,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,12.2875,0


In [14]:
print(df_train.dtypes)
print("-" * 20)
print(df_test.dtypes)

Survived      int64
Pclass        int64
Name         object
Sex           int64
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Embarked      int64
dtype: object
--------------------
PassengerId      int64
Pclass           int64
Name            object
Sex              int64
Age            float64
SibSp            int64
Parch            int64
Fare           float64
Embarked         int64
dtype: object


### ● SibSp, Parchをもとに新しい列、Familyを作成
SibSpは兄弟、姉妹、夫、妻を、Parchは母、父、娘、息子の人数をそれぞれ表しているため、これらの値の合計値をFamilyとする。  
Family作成後、SibSpとParchは削除する。

In [15]:
for df in list_df:
    df["Family"] = df["SibSp"] + df["Parch"]

In [16]:
df_train.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Family
0,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,7.25,0,1
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,71.2833,1,1
2,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,7.925,0,0
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,53.1,0,1
4,0,3,"Allen, Mr. William Henry",0,35.0,0,0,8.05,0,0


In [17]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Family
0,892,3,"Kelly, Mr. James",0,34.5,0,0,7.8292,2,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,7.0,0,1
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,9.6875,2,0
3,895,3,"Wirz, Mr. Albert",0,27.0,0,0,8.6625,0,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,12.2875,0,2


In [18]:
for df in list_df:
    df.drop(columns=["SibSp", "Parch"], inplace=True)

In [19]:
df_train.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Fare,Embarked,Family
0,0,3,"Braund, Mr. Owen Harris",0,22.0,7.25,0,1
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,71.2833,1,1
2,1,3,"Heikkinen, Miss. Laina",1,26.0,7.925,0,0
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,53.1,0,1
4,0,3,"Allen, Mr. William Henry",0,35.0,8.05,0,0


In [20]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,Fare,Embarked,Family
0,892,3,"Kelly, Mr. James",0,34.5,7.8292,2,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,7.0,0,1
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,9.6875,2,0
3,895,3,"Wirz, Mr. Albert",0,27.0,8.6625,0,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,12.2875,0,2


In [21]:
for df in list_df:
    print(df["Family"].dtypes)

int64
int64


### ● Nameから情報を抽出した新しい列Titleを追加
Nameには敬称("Mr."など.で終わる単語)が含まれているため、これを取り出した列としてTitleを作成する。  
各敬称をグループ分けし、その各グループを数値で管理する。  
上記処理の終了後、Nameは削除する。

In [22]:
# 敬称を抽出
for df in list_df:
    df["Title"] = df.Name.str.extract("([A-Za-z]+)\.", expand=False)

In [23]:
# 各敬称のと性別のクロス集計
pd.crosstab(df_train["Title"], df_train["Sex"])

Sex,0,1
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,1,0
Col,2,0
Countess,0,1
Don,1,0
Dr,6,1
Jonkheer,1,0
Lady,0,1
Major,2,0
Master,40,0
Miss,0,181


In [24]:
pd.crosstab(df_test["Title"], df_test["Sex"])

Sex,0,1
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Col,2,0
Dona,0,1
Dr,1,0
Master,21,0
Miss,0,78
Mr,240,0
Mrs,0,72
Ms,0,1
Rev,2,0


In [25]:
# Master, Miss, Mr, Mrsに置き換えられるのもは置換、それ以外はRareに置換する。
for df in list_df:
    df_train["Title"].replace(["Capt", "Col", "Don", "Dr", "Jonkheer", "Lady", "Major", "Rev", "Sir", "Dona"], "Rare", inplace=True)
    df_train["Title"].replace(["Countess", "Mme"], "Mrs", inplace=True)
    df_train["Title"].replace(["Mlle", "Ms"], "Miss", inplace=True)

In [26]:
# 置換後のクロス集計
pd.crosstab(df_train["Title"], df_train["Sex"])

Sex,0,1
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Master,40,0
Miss,0,184
Mr,517,0
Mrs,0,126
Rare,20,2


In [27]:
pd.crosstab(df_test["Title"], df_test["Sex"])

Sex,0,1
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Col,2,0
Dona,0,1
Dr,1,0
Master,21,0
Miss,0,78
Mr,240,0
Mrs,0,72
Ms,0,1
Rev,2,0


In [28]:
# 各敬称を数値に変換
for df in list_df:
    df.replace({"Master": 0, "Miss": 1, "Mr": 2, "Mrs": 3, "Rare": 4}, inplace=True)

In [29]:
df_train.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Fare,Embarked,Family,Title
0,0,3,"Braund, Mr. Owen Harris",0,22.0,7.25,0,1,2
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,71.2833,1,1,3
2,1,3,"Heikkinen, Miss. Laina",1,26.0,7.925,0,0,1
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,53.1,0,1,3
4,0,3,"Allen, Mr. William Henry",0,35.0,8.05,0,0,2


In [30]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,Fare,Embarked,Family,Title
0,892,3,"Kelly, Mr. James",0,34.5,7.8292,2,0,2
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,7.0,0,1,3
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,9.6875,2,0,2
3,895,3,"Wirz, Mr. Albert",0,27.0,8.6625,0,0,2
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,12.2875,0,2,3


In [31]:
# Nameを削除
for df in list_df:
    df.drop(columns="Name", inplace=True)

In [32]:
df_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Family,Title
0,0,3,0,22.0,7.25,0,1,2
1,1,1,1,38.0,71.2833,1,1,3
2,1,3,1,26.0,7.925,0,0,1
3,1,1,1,35.0,53.1,0,1,3
4,0,3,0,35.0,8.05,0,0,2


In [33]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Embarked,Family,Title
0,892,3,0,34.5,7.8292,2,0,2
1,893,3,1,47.0,7.0,0,1,3
2,894,2,0,62.0,9.6875,2,0,2
3,895,3,0,27.0,8.6625,0,0,2
4,896,3,1,22.0,12.2875,0,2,3


### ● Ageの欠損値を補完

In [34]:
guess_ages = np.zeros((2,3))

for df in list_df:
    for i in range(2):
        for j in range(3):
            df_guess = df[(df["Sex"] == i) & (df["Pclass"] == j + 1)]["Age"].dropna()
            age_guess = df_guess.median()
            guess_ages[i, j] = int(age_guess / 0.5 + 0.5 ) * 0.5

    for i in range(2):
        for j in range(3):
            df.loc[(df.Age.isnull()) & (df.Sex == i) & (df.Pclass == j + 1), 'Age'] = guess_ages[i, j]

    df["Age"] = df["Age"].astype(int)

In [35]:
df_train.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
Fare        0
Embarked    0
Family      0
Title       0
dtype: int64

In [36]:
df_test.isnull().sum()

PassengerId    0
Pclass         0
Sex            0
Age            0
Fare           1
Embarked       0
Family         0
Title          0
dtype: int64

### ● Ageを連続値ではなく、年齢帯を表す離散値に変換

In [37]:
for df in list_df:    
    df.loc[df["Age"] <= 16, "Age"] = 0
    df.loc[(df["Age"] > 16) & (df["Age"] <= 32), "Age"] = 1
    df.loc[(df["Age"] > 32) & (df["Age"] <= 48), "Age"] = 2
    df.loc[(df["Age"] > 48) & (df["Age"] <= 64), "Age"] = 3
    df.loc[df["Age"] > 64, "Age"]

In [38]:
df_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Family,Title
0,0,3,0,1,7.25,0,1,2
1,1,1,1,2,71.2833,1,1,3
2,1,3,1,1,7.925,0,0,1
3,1,1,1,2,53.1,0,1,3
4,0,3,0,2,8.05,0,0,2


In [39]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Embarked,Family,Title
0,892,3,0,2,7.8292,2,0,2
1,893,3,1,2,7.0,0,1,3
2,894,2,0,3,9.6875,2,0,2
3,895,3,0,1,8.6625,0,0,2
4,896,3,1,1,12.2875,0,2,3


### ● Fareの欠損値を補完

In [40]:
df_test["Fare"].fillna(df_test["Fare"].dropna().median(), inplace=True)

In [41]:
df_test.isnull().sum()

PassengerId    0
Pclass         0
Sex            0
Age            0
Fare           0
Embarked       0
Family         0
Title          0
dtype: int64

### ● Fareを連続値ではなく、金額帯を表す離散値に変換

In [42]:
for df in list_df:
    df.loc[df["Fare"] <= 7.91, "Fare"] = 0
    df.loc[(df["Fare"] > 7.91) & (df["Fare"] <= 14.454), "Fare"] = 1
    df.loc[(df["Fare"] > 14.454) & (df["Fare"] <= 31), "Fare"]   = 2
    df.loc[df["Fare"] > 31, "Fare"] = 3
    df["Fare"] = df["Fare"].astype(int)

In [43]:
df_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Family,Title
0,0,3,0,1,0,0,1,2
1,1,1,1,2,3,1,1,3
2,1,3,1,1,1,0,0,1
3,1,1,1,2,3,0,1,3
4,0,3,0,2,1,0,0,2


In [44]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Embarked,Family,Title
0,892,3,0,2,0,2,0,2
1,893,3,1,2,0,0,1,3
2,894,2,0,3,1,2,0,2
3,895,3,0,1,1,0,0,2
4,896,3,1,1,1,0,2,3


## ◉ 学習