In [1]:
# Titanic competition (LightGBM)

In [2]:
# Imports
import pandas as pd
# Load the train and test CSVs from the working directory.
# (Alternative location kept from the original notebook:
#  '../input/train.csv' and '../input/test.csv'.)
df1, df2 = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))
# Count missing values per column in the training frame
df1.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [3]:
# Count missing values per column in the test frame
df2.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [4]:
# Show the training rows whose Embarked value is missing
df1.loc[df1['Embarked'].isna()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


In [5]:
# Embarked values for PassengerId 62 and 830 before imputation
df1['Embarked'].loc[df1['PassengerId'].isin([62, 830])]

61     NaN
829    NaN
Name: Embarked, dtype: object

In [6]:
# Impute the missing Embarked values for PassengerId 62 and 830 with 'C'
# NOTE(review): the choice of 'C' is hard-coded; its rationale is not shown in this notebook.
df1['Embarked'] = df1['Embarked'].mask(df1['PassengerId'].isin([62, 830]), 'C')

In [7]:
# Verify the imputed Embarked values
df1['Embarked'].loc[df1['PassengerId'].isin([62, 830])]

61     C
829    C
Name: Embarked, dtype: object

In [8]:
# Mean Fare per Pclass in the test frame
df2.groupby('Pclass')['Fare'].mean()

Pclass
1    94.280297
2    22.202104
3    12.459678
Name: Fare, dtype: float64

In [9]:
# Show the test rows whose Fare is missing
df2.loc[df2['Fare'].isna()]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
152,1044,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S


In [10]:
# Fare for PassengerId 1044 before imputation
df2['Fare'].loc[df2['PassengerId'].eq(1044)]

152   NaN
Name: Fare, dtype: float64

In [11]:
# Impute the missing Fare of PassengerId 1044 (a Pclass 3 passenger) with 12.45,
# a hard-coded approximation of the Pclass 3 mean fare computed earlier (12.459678)
df2['Fare'] = df2['Fare'].mask(df2['PassengerId'].eq(1044), 12.45)

In [12]:
# Verify the imputed Fare value
df2['Fare'].loc[df2['PassengerId'].eq(1044)]

152    12.45
Name: Fare, dtype: float64

In [13]:
# Helper for missing-value imputation (fills Age based on Pclass)
def impute_age(cols):
    """Return the age for one passenger row, imputing it when missing.

    Parameters
    ----------
    cols : sequence or pandas.Series
        Two values in positional order: ``Age`` first, ``Pclass`` second
        (e.g. a row of ``df[['Age', 'Pclass']]`` passed by ``apply(axis=1)``).

    Returns
    -------
    The original age when it is not null; otherwise 39, 30 or 25 for
    Pclass 1, 2 or 3 respectively. If the age is null and Pclass is none of
    those, the null age is returned unchanged instead of an implicit ``None``.
    """
    # Unpack by position. Integer keys such as cols[0] on a label-indexed
    # Series are deprecated in recent pandas, so iterate over the values.
    age, pclass = list(cols)[:2]
    if pd.isnull(age):
        # Per-class fill values; unknown classes keep the null age.
        return {1: 39, 2: 30, 3: 25}.get(pclass, age)
    # Age is present: return it unchanged.
    return age


In [14]:
# Apply the same preprocessing to both frames (df1 and df2), updating each in place
for df in (df1, df2):
    # Fill missing ages with the per-class values from impute_age
    df['Age'] = df[['Age', 'Pclass']].apply(impute_age, axis=1)
    # Encode the string categories as integers
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
    df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
    # Bucket Fare into four integer codes:
    # <= 7.91 -> 0, (7.91, 14.454] -> 1, (14.454, 31] -> 2, > 31 -> 3
    df['Fare'] = pd.cut(
        df['Fare'],
        bins=[float('-inf'), 7.91, 14.454, 31, float('inf')],
        labels=False,
    ).astype(int)
    # Bucket Age into four integer codes:
    # <= 16 -> 0, (16, 32] -> 1, (32, 48] -> 2, > 48 -> 3
    df['Age'] = pd.cut(
        df['Age'],
        bins=[float('-inf'), 16, 32, 48, float('inf')],
        labels=False,
    ).astype(int)
    # New feature FamilySize: the passenger plus SibSp and Parch
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    # New feature IsAlone: 1 when FamilySize is 1, otherwise 0
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

In [15]:
# Drop the columns that are not used as features (training frame)
df1 = df1.drop(columns=['Name', 'Cabin', 'Ticket', 'SibSp', 'Parch'])

In [16]:
# Preview the first rows of the training frame after preprocessing and column removal
df1.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Fare,Embarked,FamilySize,IsAlone
0,1,0,3,0,1,0,0,2,0
1,2,1,1,1,2,3,1,2,0
2,3,1,3,1,1,1,0,1,1
3,4,1,1,1,2,3,0,2,0
4,5,0,3,0,2,1,0,1,1


In [17]:
# Drop the columns that are not used as features (test frame)
df2 = df2.drop(columns=['Name', 'Cabin', 'Ticket', 'SibSp', 'Parch'])

In [18]:
# Preview the first rows of the test frame after preprocessing and column removal
df2.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Embarked,FamilySize,IsAlone
0,892,3,0,2,0,2,1,1
1,893,3,1,2,0,0,2,0
2,894,2,0,3,1,2,1,1
3,895,3,0,1,1,0,1,1
4,896,3,1,1,1,0,3,0


In [19]:
# Build the training labels (ay), training features (ax) and test features (ex)
ay = df1['Survived']
ax = df1.drop(columns=['PassengerId', 'Survived'])
ex = df2.drop(columns=['PassengerId'])

In [20]:
# Build the machine-learning model
import lightgbm as lgb
# NOTE(review): num_leaves=300 is well above LightGBM's documented default (31);
# confirm this is intentional.
m = lgb.LGBMClassifier(objective='binary', learning_rate=0.1, num_leaves=300)

# To use a random forest instead, replace the model above with:
#   from sklearn.ensemble import RandomForestClassifier
#   m = RandomForestClassifier(random_state=0)

# Fit on the training data, then predict labels for the test data
m.fit(ax, ay)
py = m.predict(ex)

In [21]:
# Assemble the submission frame: test PassengerId next to the predicted Survived label
s = df2[['PassengerId']].assign(Survived=py)
# Write it to CSV without the index column
s.to_csv('s1117_2.csv', index=False)