In [22]:
# ここから実際の予測を行っていく

# 使用するライブラリとデータを読み込む
import pandas as pd 
import numpy as np 
 
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns 
 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
 
# from dtreeviz.trees import dtreeviz

# Read the training and test CSV files into DataFrames.
train, test = (
    pd.read_csv("./csv_data/train.csv"),
    pd.read_csv("./csv_data/test.csv"),
)

<h2>欠損値処理をおこなう</h2>

In [23]:
# Pre-processing: remove the PassengerId, Name, Ticket and Cabin columns
# from both the training and the test frame.
unused_columns = ["PassengerId", "Name", "Ticket", "Cabin"]
train = train.drop(columns=unused_columns)
test = test.drop(columns=unused_columns)

In [24]:
# Missing-value check: per-column null counts, training frame first,
# then a separator line, then the test frame.
train_null_counts = train.isna().sum()
test_null_counts = test.isna().sum()
print(train_null_counts, "---------", test_null_counts, sep="\n")

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64
---------
Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64


In [25]:
# Display the Embarked column of the training frame.
train.loc[:, "Embarked"]

0        S
1        C
2        S
3        S
4        S
5        Q
6        S
7        S
8        S
9        C
10       S
11       S
12       S
13       S
14       S
15       S
16       Q
17       S
18       S
19       C
20       S
21       S
22       Q
23       S
24       S
25       S
26       C
27       S
28       Q
29       S
30       C
31       C
32       Q
33       S
34       C
35       S
36       C
37       S
38       S
39       C
40       S
41       S
42       C
43       C
44       Q
45       S
46       Q
47       Q
48       C
49       S
50       S
51       S
52       C
53       S
54       C
55       S
56       S
57       C
58       S
59       S
60       C
61     NaN
62       S
63       S
64       C
65       C
66       S
67       S
68       S
69       S
70       S
71       S
72       S
73       C
74       S
75       S
76       S
77       S
78       S
79       S
80       S
81       S
82       Q
83       S
84       S
85       S
86       S
87       S
88       S
89       S
90       S

In [26]:
# Embarked (training frame): inspect the distinct values before imputing.
# Four are expected at this point: S, C, Q and NaN.
train.loc[:, "Embarked"].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [27]:
# Fill the missing Embarked entries with the most frequent value (the mode).
embarked_mode = train["Embarked"].mode().iloc[0]
train["Embarked"] = train["Embarked"].fillna(embarked_mode)

# After filling, only the three values S, C and Q should remain.
train.loc[:, "Embarked"].unique()

array(['S', 'C', 'Q'], dtype=object)

In [28]:
# Fill missing Age values with the median Age of the TRAINING frame.
# The same training-set statistic is applied to the test frame so that both
# frames go through an identical transformation and nothing is estimated
# from the test data itself.
train_age_median = train["Age"].median()
train["Age"] = train["Age"].fillna(train_age_median)
test["Age"] = test["Age"].fillna(train_age_median)

# Confirm that no missing Age values remain (both counts should be 0).
print(train["Age"].isnull().sum())
print(test["Age"].isnull().sum())

0
0


In [29]:
# Fill the missing Fare value(s) in the test frame with the median Fare of
# the TRAINING frame, so the imputation statistic is learned from training
# data only rather than from the test data being imputed.
test["Fare"] = test["Fare"].fillna(train["Fare"].median())

# Confirm that no missing Fare values remain in the test frame.
test["Fare"].isnull().sum()

0

In [30]:
# Check the dtype of every column in the training frame.
train.dtypes

Survived      int64
Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Embarked     object
dtype: object

<h2>ダミー変数をつかい置き換える</h2>

In [31]:
# Encode Sex as an integer in both frames: "male" -> 0, "female" -> 1.
sex_codes = {"male": 0, "female": 1}
for frame in (train, test):
    frame["Sex"] = frame["Sex"].map(sex_codes)

In [32]:
# One-hot encode Embarked into indicator columns
# (Embarked_C / Embarked_Q / Embarked_S for the three categories).
# dtype=int keeps the indicators as 0/1 integers; pandas >= 2.0 would
# otherwise produce boolean columns.
train = pd.get_dummies(train, columns=["Embarked"], dtype=int)
test = pd.get_dummies(test, columns=["Embarked"], dtype=int)

# Each frame is encoded separately, so a category absent from one frame would
# yield a missing indicator column there. Align the test frame to the
# training feature columns (everything except the Survived label), filling
# any indicator that did not occur in the test data with 0.
feature_columns = [col for col in train.columns if col != "Survived"]
test = test.reindex(columns=feature_columns, fill_value=0)

In [33]:
# Inspect the training frame after the preprocessing steps above.
# Lifting the row limit makes pandas render every row of a DataFrame.
pd.options.display.max_rows = None
train

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,0,22.0,1,0,7.25,0,0,1
1,1,1,1,38.0,1,0,71.2833,1,0,0
2,1,3,1,26.0,0,0,7.925,0,0,1
3,1,1,1,35.0,1,0,53.1,0,0,1
4,0,3,0,35.0,0,0,8.05,0,0,1
5,0,3,0,28.0,0,0,8.4583,0,1,0
6,0,1,0,54.0,0,0,51.8625,0,0,1
7,0,3,0,2.0,3,1,21.075,0,0,1
8,1,3,1,27.0,0,2,11.1333,0,0,1
9,1,2,1,14.0,1,0,30.0708,1,0,0


In [35]:
# Inspect the test frame after preprocessing.
test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,3,0,34.5,0,0,7.8292,0,1,0
1,3,1,47.0,1,0,7.0,0,0,1
2,2,0,62.0,0,0,9.6875,0,1,0
3,3,0,27.0,0,0,8.6625,0,0,1
4,3,1,22.0,1,1,12.2875,0,0,1
5,3,0,14.0,0,0,9.225,0,0,1
6,3,1,30.0,0,0,7.6292,0,1,0
7,2,0,26.0,1,1,29.0,0,0,1
8,3,1,18.0,0,0,7.2292,1,0,0
9,3,0,21.0,2,0,24.15,0,0,1
