In [1]:
import pandas as pd
import numpy as np
import csv as csv
from sklearn.ensemble import RandomForestClassifier

In [2]:
#訓練データの読み込み
train_df = pd.read_csv("train.csv",header=0)

In [3]:
#Sexをダミー変数に変換
train_df["Gender"] = train_df["Sex"].map({"female":0,"male":1}).astype(int)

In [4]:
#train_dfの最初3つのデータを表示
train_df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Gender
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0


In [5]:
#train_df.Ageには欠損値があるので補間する必要がある
train_df.Age[15:20]

15    55.0
16     2.0
17     NaN
18    31.0
19     NaN
Name: Age, dtype: float64

In [6]:
#trian_df.Age.isnull()はnullならばTrueとするリストを返す
train_df.Age.isnull()[15:20]

15    False
16    False
17     True
18    False
19     True
Name: Age, dtype: bool

In [7]:
#欠損しているデータをNaNで表示
train_df.Age[train_df.Age.isnull()][:5]

5    NaN
17   NaN
19   NaN
26   NaN
28   NaN
Name: Age, dtype: float64

In [8]:
#年齢の欠損値は年齢の平均値で補完する
median_age = train_df["Age"].dropna().median()#欠損していないデータの平均値
if len(train_df.Age[train_df.Age.isnull()])>0:#もし欠損しているデータがあれば
    train_df.loc[(train_df.Age.isnull()),"Age"] = median_age

In [9]:
#学習に必要無い列を削除する
train_df = train_df.drop(["Name","Ticket","Sex","SibSp","Parch","Fare","Cabin","Embarked","PassengerId"],axis=1)

In [10]:
train_df.head(3)

Unnamed: 0,Survived,Pclass,Age,Gender
0,0,3,22.0,1
1,1,1,38.0,0
2,1,3,26.0,0


In [11]:
#テストデータの読み込み, Sexをダミー変数に変換
test_df = pd.read_csv("test.csv",header=0)
test_df["Gender"] = test_df["Sex"].map({"female":0,"male":1}).astype(int)

In [12]:
#年齢の欠損値は年齢の平均値で補間する
median_age = test_df["Age"].dropna().median()
if len(test_df.Age[test_df.Age.isnull()])>0:
    test_df.loc[(test_df.Age.isnull()),"Age"] = median_age

In [13]:
#テストデータのPassengerId列を退避させ, テストデータの不要な列を削除する
ids = test_df["PassengerId"].values
test_df = test_df.drop(["Name","Ticket","Sex","SibSp","Parch","Fare","Cabin","Embarked","PassengerId"],axis=1)

In [14]:
test_df.head(3)

Unnamed: 0,Pclass,Age,Gender
0,3,34.5,1
1,3,47.0,0
2,2,62.0,1


In [15]:
train_df.values

array([[  0.,   3.,  22.,   1.],
       [  1.,   1.,  38.,   0.],
       [  1.,   3.,  26.,   0.],
       ..., 
       [  0.,   3.,  28.,   0.],
       [  1.,   1.,  26.,   1.],
       [  0.,   3.,  32.,   1.]])

In [16]:
#ランダムフォレストで予測
train_data = train_df.values
test_data = test_df.values

In [17]:
train_data[0::,1::]

array([[  3.,  22.,   1.],
       [  1.,  38.,   0.],
       [  3.,  26.,   0.],
       ..., 
       [  3.,  28.,   0.],
       [  1.,  26.,   1.],
       [  3.,  32.,   1.]])

In [18]:
train_data[0::,0]

array([ 0.,  1.,  1.,  1.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.,  0.,
        0.,  0.,  1.,  0.,  1.,  0.,  1.,  0.,  1.,  1.,  1.,  0.,  1.,
        0.,  0.,  1.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,
        1.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,
        1.,  1.,  0.,  1.,  1.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,
        1.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,
        1.,  1.,  0.,  1.,  1.,  0.,  1.,  1.,  0.,  0.,  1.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  1.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  1.,  1.,  0.,
        0.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.,  1.,
        0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,
        1.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,
        0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0

In [19]:
#学習
model = RandomForestClassifier(n_estimators=100)
output = model.fit(train_data[0::,1::],train_data[0::,0]).predict(test_data).astype(int)

In [20]:
#結果を"titanic_submit.csvとして出力
submit_file = open("titanic_submit.csv","w")
file_object = csv.writer(submit_file)
file_object.writerow([bytes("PassengerId",'UTF-8'),bytes("Survived",'UTF-8')])
file_object.writerows(zip(ids,output))
submit_file.close()

In [22]:
submit = pd.read_csv("titanic_submit.csv",header=0)

In [23]:
submit.head(3)

Unnamed: 0,b'PassengerId',b'Survived'
0,892,0
1,893,0
2,894,1
