## ◉ モジュール、データ読み込み

In [1]:
import pandas as pd

# Load the Titanic train and test CSV files into DataFrames.
# Paths are relative to the notebook's working directory.
df_train = pd.read_csv("../input/titanic/train.csv")
df_test  = pd.read_csv("../input/titanic/test.csv")

In [2]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Report the row count, the per-column dtypes and the per-column
# missing-value counts of the training set, separated by divider lines.
divider = "-" * 20
print(
    f"len: {len(df_train)}",
    divider,
    df_train.dtypes,
    divider,
    df_train.isna().sum(),
    sep="\n",
)

len: 891
--------------------
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object
--------------------
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [4]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
# Report the row count, the per-column dtypes and the per-column
# missing-value counts of the test set, separated by divider lines.
divider = "-" * 20
print(
    f"len: {len(df_test)}",
    divider,
    df_test.dtypes,
    divider,
    df_test.isna().sum(),
    sep="\n",
)

len: 418
--------------------
PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object
--------------------
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


## ◉ データ加工

### ● PassengerId, Ticket, Cabinの削除

In [6]:
# Columns removed from both data sets.
shared_unused = ["Ticket", "Cabin"]

# The training set additionally loses PassengerId.
df_train.drop(["PassengerId"] + shared_unused, axis="columns", inplace=True)
# PassengerId is kept in the test set because it is needed at evaluation time.
df_test.drop(shared_unused, axis="columns", inplace=True)

In [7]:
df_train.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S


In [8]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,7.8292,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,7.0,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,9.6875,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,8.6625,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,12.2875,S


### ● trainのEmbarkedが欠損値の行を削除、SexとEmbarkedの値を数値に変換

In [9]:
# Remove, in place, the training rows whose Embarked value is missing.
df_train.dropna(subset=["Embarked"], inplace=True)

# Show the remaining number of training rows.
print(len(df_train))

889


In [10]:
# Encode Sex as an integer feature: male -> 0, female -> 1.
#
# The mapped column is assigned back to the DataFrame instead of calling
# `replace(..., inplace=True)` on `df["Sex"]`. That form is chained assignment:
# it emits a FutureWarning on pandas 2.x and leaves the DataFrame unchanged
# once Copy-on-Write is enabled (the default from pandas 3.0).
#
# NOTE: `map` turns any value that is not a key of the mapping into NaN.
# Sex has no missing values in either set and is assumed to contain only
# "male" / "female", so the result is an int64 column.
sex_codes = {"male": 0, "female": 1}
df_train["Sex"] = df_train["Sex"].map(sex_codes)
df_test["Sex"] = df_test["Sex"].map(sex_codes)

In [11]:
# Encode the port of embarkation as an integer feature: S -> 0, C -> 1, Q -> 2.
#
# The mapped column is assigned back to the DataFrame instead of calling
# `replace(..., inplace=True)` on `df["Embarked"]`. That form is chained
# assignment: it emits a FutureWarning on pandas 2.x and leaves the DataFrame
# unchanged once Copy-on-Write is enabled (the default from pandas 3.0).
#
# NOTE: `map` turns any value that is not a key of the mapping into NaN.
# Training rows with a missing Embarked were dropped beforehand and the test
# set has none, so the result is expected to be an int64 column.
embarked_codes = {"S": 0, "C": 1, "Q": 2}
df_train["Embarked"] = df_train["Embarked"].map(embarked_codes)
df_test["Embarked"] = df_test["Embarked"].map(embarked_codes)

In [12]:
df_train.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,7.25,0
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,71.2833,1
2,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,7.925,0
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,53.1,0
4,0,3,"Allen, Mr. William Henry",0,35.0,0,0,8.05,0


In [13]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,"Kelly, Mr. James",0,34.5,0,0,7.8292,2
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,7.0,0
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,9.6875,2
3,895,3,"Wirz, Mr. Albert",0,27.0,0,0,8.6625,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,12.2875,0


In [14]:
# Show the column dtypes of both data sets, train first, with a divider
# line between them, to confirm the encoded columns are numeric.
print(df_train.dtypes, "-" * 20, df_test.dtypes, sep="\n")

Survived      int64
Pclass        int64
Name         object
Sex           int64
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Embarked      int64
dtype: object
--------------------
PassengerId      int64
Pclass           int64
Name            object
Sex              int64
Age            float64
SibSp            int64
Parch            int64
Fare           float64
Embarked         int64
dtype: object


### ● SibSp, Parchをもとに新しい列、Familyを作成
SibSpは兄弟、姉妹、夫、妻の人数を、Parchは母、父、娘、息子の人数をそれぞれ表しているため、これらの値の合計値をFamily(本人を含まない家族の人数)とする。  
Family作成後、SibSpとParchは削除する。

In [15]:
# Add a Family column to each data set: the element-wise sum of SibSp and Parch.
for frame in (df_train, df_test):
    frame["Family"] = frame["SibSp"] + frame["Parch"]

In [16]:
df_train.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Family
0,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,7.25,0,1
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,71.2833,1,1
2,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,7.925,0,0
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,53.1,0,1
4,0,3,"Allen, Mr. William Henry",0,35.0,0,0,8.05,0,0


In [17]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Family
0,892,3,"Kelly, Mr. James",0,34.5,0,0,7.8292,2,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,7.0,0,1
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,9.6875,2,0
3,895,3,"Wirz, Mr. Albert",0,27.0,0,0,8.6625,0,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,12.2875,0,2


In [18]:
# SibSp and Parch are now summarised by Family; remove them in place from both sets.
for frame in (df_train, df_test):
    frame.drop(columns=["SibSp", "Parch"], inplace=True)

In [19]:
df_train.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Fare,Embarked,Family
0,0,3,"Braund, Mr. Owen Harris",0,22.0,7.25,0,1
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,71.2833,1,1
2,1,3,"Heikkinen, Miss. Laina",1,26.0,7.925,0,0
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,53.1,0,1
4,0,3,"Allen, Mr. William Henry",0,35.0,8.05,0,0


In [20]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,Fare,Embarked,Family
0,892,3,"Kelly, Mr. James",0,34.5,7.8292,2,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,7.0,0,1
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,9.6875,2,0
3,895,3,"Wirz, Mr. Albert",0,27.0,8.6625,0,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,12.2875,0,2


In [21]:
# Print the dtype of the new Family column, train first and then test.
for frame in (df_train, df_test):
    print(frame["Family"].dtype)

int64
int64


### ● Nameから情報を抽出した新しい列Titleを追加
Nameには敬称("Mr." など、"." で終わる単語)が含まれているため、これを取り出した列としてTitleを作成する。  
各敬称をグループ分けし、その各グループを数値で管理する。  
上記処理の終了後、Nameは削除する。