In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

In [3]:
# 学習データの読み込み
train_df=pd.read_csv("/Users/tasuke0630/making/DataScience/kaggle/Taitanic/train.csv")

# テストデータの読み込み
test_df=pd.read_csv("/Users/tasuke0630/making/DataScience/kaggle/Taitanic/test.csv")

# 学習データとテストデータを連結する
df=pd.concat([train_df,test_df],ignore_index=True)
#nullのCSV化
df_null=df.isnull().sum()
df_null.to_csv("null.csv")

# 元データをコピー
df2=df.copy()

In [4]:
#null_csvを見ながら,nullのデータの確認
#sns.countplot("Survived",data=df2)
#sns.countplot("Age",data=df2)
#sns.countplot("Fare",data=df2)
#sns.countplot("Cabin",data=df2)
#sns.countplot("Embarked",data=df2)

In [5]:
print(df2.Cabin.mode())
print(df2.Fare.mode())

Age_mdeian=df2.Age.median()
df2.Age=df2.Age.fillna(Age_mdeian)

0    C23 C25 C27
dtype: object
0    8.05
dtype: float64


In [6]:
# 欠損値の補完
df2.Fare=df2.Fare.fillna(8.05)
df2.Cabin=df2.Cabin.fillna("C23")
df2.Embarked=df2.Embarked.fillna("S")

In [7]:
df2.isnull().sum()

PassengerId      0
Survived       418
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin            0
Embarked         0
dtype: int64

In [8]:
#使わないカラムを削除する
df3=df2.drop(columns=["Name","Ticket"])
df3.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,1309.0,891.0,1309.0,1309.0,1309.0,1309.0,1309.0
mean,655.0,0.383838,2.294882,29.503186,0.498854,0.385027,33.276193
std,378.020061,0.486592,0.837836,12.905241,1.041658,0.86556,51.743584
min,1.0,0.0,1.0,0.17,0.0,0.0,0.0
25%,328.0,0.0,2.0,22.0,0.0,0.0,7.8958
50%,655.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,982.0,1.0,3.0,35.0,1.0,0.0,31.275
max,1309.0,1.0,3.0,80.0,8.0,9.0,512.3292


In [9]:
#カテゴリカル変数の数値変換
tmp_Sex=pd.get_dummies(df3["Sex"],prefix="Sex")
df3=pd.concat([df3,tmp_Sex],axis=1).drop(columns="Sex")

tmp_Cabin=pd.get_dummies(df3["Cabin"],prefix="Cabin")
df3=pd.concat([df3,tmp_Cabin],axis=1).drop(columns="Cabin")

tmp_Embarked=pd.get_dummies(df3["Embarked"],prefix="Embarked")
df3=pd.concat([df3,tmp_Embarked],axis=1).drop(columns="Embarked")
df3.sample(10)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Cabin_A10,...,Cabin_F G73,Cabin_F2,Cabin_F33,Cabin_F38,Cabin_F4,Cabin_G6,Cabin_T,Embarked_C,Embarked_Q,Embarked_S
497,498,0.0,3,28.0,0,0,15.1,0,1,0,...,0,0,0,0,0,0,0,0,0,1
633,634,0.0,1,28.0,0,0,0.0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
517,518,0.0,3,28.0,0,0,24.15,0,1,0,...,0,0,0,0,0,0,0,0,1,0
441,442,0.0,3,20.0,0,0,9.5,0,1,0,...,0,0,0,0,0,0,0,0,0,1
965,966,,1,35.0,0,0,211.5,1,0,0,...,0,0,0,0,0,0,0,1,0,0
168,169,0.0,1,28.0,0,0,25.925,0,1,0,...,0,0,0,0,0,0,0,0,0,1
884,885,0.0,3,25.0,0,0,7.05,0,1,0,...,0,0,0,0,0,0,0,0,0,1
412,413,1.0,1,33.0,1,0,90.0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
1187,1188,,2,1.0,1,2,41.5792,1,0,0,...,0,0,0,0,0,0,0,1,0,0
1288,1289,,1,48.0,1,1,79.2,1,0,0,...,0,0,0,0,0,0,0,1,0,0


In [10]:
# 学習データに分割した結果を変数trainに格納する
train=df3[~df3.Survived.isnull()]

# テストデータに分割した結果を変数trainに格納する
test=df3[df3.Survived.isnull()]

#trainのデータ
train_x=train.drop(columns=["Survived","PassengerId"])
train_y=train["Survived"]

In [12]:
clf=RandomForestClassifier(random_state=0,
                                       n_estimators=300,
                                       criterion="gini",
                                       max_depth=6,
                                       oob_score=True,
                                       warm_start=False,
                                      class_weight=None)

In [13]:
clf.fit(train_x,train_y)

y_pred_train=clf.predict(train_x)
print(classification_report(train_y,y_pred_train))

# 学習結果の検証
X_test=test.drop(columns=["Survived","PassengerId"])
y_predict=clf.predict(X_test)

print("Model score: %.3f"% clf.score(X_test, y_predict))

              precision    recall  f1-score   support

         0.0       0.83      0.95      0.88       549
         1.0       0.89      0.69      0.77       342

    accuracy                           0.85       891
   macro avg       0.86      0.82      0.83       891
weighted avg       0.85      0.85      0.84       891

Model score: 1.000


In [14]:
#type(y_predict)
result=pd.DataFrame(y_predict)

#列名に名前をつける
result2=result.rename(columns={result.columns[0]: "Survived"})

#PassngerIdを合わせる
l=[]
l2=[]
t_id=test_df.loc[:,"PassengerId"]
for i in range(len(t_id)):
               tmp={
                   "PassengerId":t_id[i]
               }
               l.append(tmp);
               
l2=pd.DataFrame(l);
result3=pd.concat([l2,result2],axis=1)


#予測結果を整数に変換する
result3.Survived=result3.Survived.astype(int)
result3.PassengerId=result3.PassengerId.astype(int)

In [15]:
# CSVの作成
result3.to_csv("predict5.csv",index=False)