In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import classification_report
from catboost import CatBoostClassifier
import joblib

In [2]:
# 学習データの読み込み
train_df=pd.read_csv("/Users/tasuke0630/making/DataScience/kaggle/Taitanic/train.csv")

# テストデータの読み込み
test_df=pd.read_csv("/Users/tasuke0630/making/DataScience/kaggle/Taitanic/test.csv")

# 学習データとテストデータを連結する
df=pd.concat([train_df,test_df],ignore_index=True)
#nullのCSV化
df_null=df.isnull().sum()
df_null.to_csv("null.csv")

# 元データをコピー
df2=df.copy()

In [3]:
#null_csvを見ながら,nullのデータの確認
#sns.countplot("Survived",data=df2)
#sns.countplot("Age",data=df2)
#sns.countplot("Fare",data=df2)
#sns.countplot("Cabin",data=df2)
#sns.countplot("Embarked",data=df2)

In [4]:
print(df2.Cabin.mode())
print(df2.Fare.mode())

Age_mdeian=df2.Age.median()
df2.Age=df2.Age.fillna(Age_mdeian)

0    C23 C25 C27
dtype: object
0    8.05
dtype: float64


In [5]:
# 欠損値の補完
df2.Fare=df2.Fare.fillna(8.05)
df2.Cabin=df2.Cabin.fillna("C23")
df2.Embarked=df2.Embarked.fillna("S")

In [6]:
df2.isnull().sum()

PassengerId      0
Survived       418
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin            0
Embarked         0
dtype: int64

In [7]:
#使わないカラムを削除する
df3=df2.drop(columns=["Name","Ticket"])
df3.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,1309.0,891.0,1309.0,1309.0,1309.0,1309.0,1309.0
mean,655.0,0.383838,2.294882,29.503186,0.498854,0.385027,33.276193
std,378.020061,0.486592,0.837836,12.905241,1.041658,0.86556,51.743584
min,1.0,0.0,1.0,0.17,0.0,0.0,0.0
25%,328.0,0.0,2.0,22.0,0.0,0.0,7.8958
50%,655.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,982.0,1.0,3.0,35.0,1.0,0.0,31.275
max,1309.0,1.0,3.0,80.0,8.0,9.0,512.3292


In [8]:
#カテゴリカル変数の数値変換
tmp_Sex=pd.get_dummies(df3["Sex"],prefix="Sex")
df3=pd.concat([df3,tmp_Sex],axis=1).drop(columns="Sex")

tmp_Cabin=pd.get_dummies(df3["Cabin"],prefix="Cabin")
df3=pd.concat([df3,tmp_Cabin],axis=1).drop(columns="Cabin")

tmp_Embarked=pd.get_dummies(df3["Embarked"],prefix="Embarked")
df3=pd.concat([df3,tmp_Embarked],axis=1).drop(columns="Embarked")
df3.sample(10)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Cabin_A10,...,Cabin_F G73,Cabin_F2,Cabin_F33,Cabin_F38,Cabin_F4,Cabin_G6,Cabin_T,Embarked_C,Embarked_Q,Embarked_S
639,640,0.0,3,28.0,1,0,16.1,0,1,0,...,0,0,0,0,0,0,0,0,0,1
465,466,0.0,3,38.0,0,0,7.05,0,1,0,...,0,0,0,0,0,0,0,0,0,1
939,940,,1,60.0,0,0,76.2917,1,0,0,...,0,0,0,0,0,0,0,1,0,0
178,179,0.0,2,30.0,0,0,13.0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1123,1124,,3,21.0,1,0,6.4958,0,1,0,...,0,0,0,0,0,0,0,0,0,1
819,820,0.0,3,10.0,3,2,27.9,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1163,1164,,1,26.0,1,0,136.7792,1,0,0,...,0,0,0,0,0,0,0,1,0,0
150,151,0.0,2,51.0,0,0,12.525,0,1,0,...,0,0,0,0,0,0,0,0,0,1
825,826,0.0,3,28.0,0,0,6.95,0,1,0,...,0,0,0,0,0,0,0,0,1,0
126,127,0.0,3,28.0,0,0,7.75,0,1,0,...,0,0,0,0,0,0,0,0,1,0


In [9]:
# 学習データに分割した結果を変数trainに格納する
train=df3[~df3.Survived.isnull()]

# テストデータに分割した結果を変数trainに格納する
test=df3[df3.Survived.isnull()]

#trainのデータ
train_x=train.drop(columns=["Survived","PassengerId"])
train_y=train["Survived"]

In [10]:
clf=CatBoostClassifier(loss_function = "Logloss",
                         n_estimators=80,
                         depth=5,
                         learning_rate=0.2)

In [17]:
clf.fit(train_x,train_y)

Y_pred_train=clf.predict(train_x)
print(classification_report(train_y, Y_pred_train))

X_test=test.drop(columns=["Survived","PassengerId"])
y_predict=clf.predict(X_test)

print("Stacking model score: %.4f"% clf.score(X_test,y_predict))

0:	learn: 0.5660227	total: 1.36ms	remaining: 107ms
1:	learn: 0.5132858	total: 2.75ms	remaining: 107ms
2:	learn: 0.4734936	total: 4.49ms	remaining: 115ms
3:	learn: 0.4523602	total: 5.86ms	remaining: 111ms
4:	learn: 0.4379946	total: 7.72ms	remaining: 116ms
5:	learn: 0.4210371	total: 9.12ms	remaining: 112ms
6:	learn: 0.4075378	total: 10.4ms	remaining: 109ms
7:	learn: 0.3976191	total: 11.8ms	remaining: 106ms
8:	learn: 0.3954955	total: 13.2ms	remaining: 104ms
9:	learn: 0.3899812	total: 14.5ms	remaining: 101ms
10:	learn: 0.3890448	total: 15.8ms	remaining: 99ms
11:	learn: 0.3843194	total: 17.2ms	remaining: 97.5ms
12:	learn: 0.3820100	total: 18.6ms	remaining: 96ms
13:	learn: 0.3798198	total: 20ms	remaining: 94.4ms
14:	learn: 0.3775550	total: 23ms	remaining: 99.7ms
15:	learn: 0.3764266	total: 24.9ms	remaining: 99.5ms
16:	learn: 0.3740977	total: 27.3ms	remaining: 101ms
17:	learn: 0.3719582	total: 31.1ms	remaining: 107ms
18:	learn: 0.3699304	total: 33.3ms	remaining: 107ms
19:	learn: 0.3685219	tot

In [18]:
result=pd.DataFrame(y_predict)

#列名に名前をつける
result2=result.rename(columns={result.columns[0]: "Survived"})

#PassngerIdを合わせる
l=[]
l2=[]
t_id=test_df.loc[:,"PassengerId"]
for i in range(len(t_id)):
               tmp={
                   "PassengerId":t_id[i]
               }
               l.append(tmp);
               
l2=pd.DataFrame(l);
result3=pd.concat([l2,result2],axis=1)


#予測結果を整数に変換する
result3.Survived=result3.Survived.astype(int)
result3.PassengerId=result3.PassengerId.astype(int)

In [19]:
# CSVの作成
result3.to_csv("predict7.csv",index=False)