In [35]:
import pickle
import warnings
import logging

import pandas as pd
import luigi
from luigi.util import requires
from sklearn import datasets
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# pandas系
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_colwidth', 300)
pd.options.display.float_format = '{:.5f}'.format

logger = logging.getLogger()

In [9]:
df = datasets.fetch_openml("titanic", version=1, as_frame=True, return_X_y=False).frame

In [10]:
def sklearn_oh_encoder(df, cols, drop_col=False):
        """カテゴリ変換
        sklearnのOneHotEncoderでEncodingを行う

        Args:
            df: カテゴリ変換する対象のデータフレーム
            cols (list of str): カテゴリ変換する対象のカラムリスト
            drop_col (bool): エンコード対象のカラムを削除するか否か

        Returns:
            pd.Dataframe: dfにカテゴリ変換したカラムを追加したデータフレーム
        """
        output_df = df.copy()
        for col in cols:
            ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
            ohe_df = pd.DataFrame((ohe.fit_transform(output_df[[col]])), columns=ohe.categories_[0])
            ohe_df = ohe_df.add_prefix(f'{col}_')
            # 元のDFに結合
            output_df = pd.concat([output_df, ohe_df], axis=1)
            if drop_col:
                output_df = output_df.drop(col, axis=1)
        return output_df

In [11]:
# 欠損値処理
df.loc[:, 'age'] = df['age'].fillna(df['age'].mean())
df.loc[:, 'fare'] = df['fare'].fillna(df['fare'].mean())

# カテゴリエンコード
categorical_cols = ["pclass", "sex", "embarked"]
df = sklearn_oh_encoder(df=df, cols=categorical_cols, drop_col=True)
logger.info(f'After Data shape: {df.shape}')

# 学習に使用するカラムのみを出力
use_cols = [
    'survived',
    'age',
    'sibsp',
    'parch',
    'fare',
    'pclass_1.0',
    'pclass_2.0',
    'pclass_3.0',
    'sex_female',
    'sex_male',
    'embarked_C',
    'embarked_Q',
    'embarked_S',
    'embarked_nan'
]
df = df[use_cols]

In [12]:
df

Unnamed: 0,survived,age,sibsp,parch,fare,pclass_1.0,pclass_2.0,pclass_3.0,sex_female,sex_male,embarked_C,embarked_Q,embarked_S,embarked_nan
0,1,29.00000,0.00000,0.00000,211.33750,1.00000,0.00000,0.00000,1.00000,0.00000,0.00000,0.00000,1.00000,0.00000
1,1,0.91670,1.00000,2.00000,151.55000,1.00000,0.00000,0.00000,0.00000,1.00000,0.00000,0.00000,1.00000,0.00000
2,0,2.00000,1.00000,2.00000,151.55000,1.00000,0.00000,0.00000,1.00000,0.00000,0.00000,0.00000,1.00000,0.00000
3,0,30.00000,1.00000,2.00000,151.55000,1.00000,0.00000,0.00000,0.00000,1.00000,0.00000,0.00000,1.00000,0.00000
4,0,25.00000,1.00000,2.00000,151.55000,1.00000,0.00000,0.00000,1.00000,0.00000,0.00000,0.00000,1.00000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,0,14.50000,1.00000,0.00000,14.45420,0.00000,0.00000,1.00000,1.00000,0.00000,1.00000,0.00000,0.00000,0.00000
1305,0,29.88113,1.00000,0.00000,14.45420,0.00000,0.00000,1.00000,1.00000,0.00000,1.00000,0.00000,0.00000,0.00000
1306,0,26.50000,0.00000,0.00000,7.22500,0.00000,0.00000,1.00000,0.00000,1.00000,1.00000,0.00000,0.00000,0.00000
1307,0,27.00000,0.00000,0.00000,7.22500,0.00000,0.00000,1.00000,0.00000,1.00000,1.00000,0.00000,0.00000,0.00000


In [13]:
train, test = train_test_split(df, test_size=0.3, shuffle=True, stratify=df['survived'], random_state=42)

In [14]:
train

Unnamed: 0,survived,age,sibsp,parch,fare,pclass_1.0,pclass_2.0,pclass_3.0,sex_female,sex_male,embarked_C,embarked_Q,embarked_S,embarked_nan
306,0,54.00000,0.00000,1.00000,77.28750,1.00000,0.00000,0.00000,0.00000,1.00000,0.00000,0.00000,1.00000,0.00000
927,0,29.88113,1.00000,0.00000,14.45420,0.00000,0.00000,1.00000,0.00000,1.00000,1.00000,0.00000,0.00000,0.00000
642,0,13.00000,4.00000,2.00000,31.38750,0.00000,0.00000,1.00000,0.00000,1.00000,0.00000,0.00000,1.00000,0.00000
1294,0,28.50000,0.00000,0.00000,16.10000,0.00000,0.00000,1.00000,0.00000,1.00000,0.00000,0.00000,1.00000,0.00000
1015,0,55.50000,0.00000,0.00000,8.05000,0.00000,0.00000,1.00000,0.00000,1.00000,0.00000,0.00000,1.00000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
577,1,31.00000,0.00000,0.00000,21.00000,0.00000,1.00000,0.00000,1.00000,0.00000,0.00000,0.00000,1.00000,0.00000
1080,1,29.88113,0.00000,0.00000,7.75000,0.00000,0.00000,1.00000,0.00000,1.00000,0.00000,1.00000,0.00000,0.00000
117,1,30.00000,0.00000,0.00000,56.92920,1.00000,0.00000,0.00000,1.00000,0.00000,1.00000,0.00000,0.00000,0.00000
759,1,36.00000,1.00000,0.00000,17.40000,0.00000,0.00000,1.00000,1.00000,0.00000,0.00000,0.00000,1.00000,0.00000


In [15]:
test

Unnamed: 0,survived,age,sibsp,parch,fare,pclass_1.0,pclass_2.0,pclass_3.0,sex_female,sex_male,embarked_C,embarked_Q,embarked_S,embarked_nan
1289,0,21.00000,1.00000,0.00000,6.49580,0.00000,0.00000,1.00000,0.00000,1.00000,0.00000,0.00000,1.00000,0.00000
352,1,40.00000,1.00000,1.00000,39.00000,0.00000,1.00000,0.00000,1.00000,0.00000,0.00000,0.00000,1.00000,0.00000
1189,1,4.00000,1.00000,1.00000,16.70000,0.00000,0.00000,1.00000,1.00000,0.00000,0.00000,0.00000,1.00000,0.00000
78,1,64.00000,0.00000,2.00000,83.15830,1.00000,0.00000,0.00000,1.00000,0.00000,1.00000,0.00000,0.00000,0.00000
974,0,30.00000,1.00000,0.00000,16.10000,0.00000,0.00000,1.00000,0.00000,1.00000,0.00000,0.00000,1.00000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
349,1,24.00000,0.00000,0.00000,13.00000,0.00000,1.00000,0.00000,1.00000,0.00000,0.00000,0.00000,1.00000,0.00000
39,0,48.00000,0.00000,0.00000,50.49580,1.00000,0.00000,0.00000,0.00000,1.00000,1.00000,0.00000,0.00000,0.00000
603,1,35.00000,1.00000,1.00000,20.25000,0.00000,0.00000,1.00000,1.00000,0.00000,0.00000,0.00000,1.00000,0.00000
574,0,29.00000,1.00000,0.00000,21.00000,0.00000,1.00000,0.00000,0.00000,1.00000,0.00000,0.00000,1.00000,0.00000


In [16]:
target_col = 'survived'
X_train = train.drop(target_col, axis=1)
y_train = train[target_col]

In [17]:
X_train

Unnamed: 0,age,sibsp,parch,fare,pclass_1.0,pclass_2.0,pclass_3.0,sex_female,sex_male,embarked_C,embarked_Q,embarked_S,embarked_nan
306,54.00000,0.00000,1.00000,77.28750,1.00000,0.00000,0.00000,0.00000,1.00000,0.00000,0.00000,1.00000,0.00000
927,29.88113,1.00000,0.00000,14.45420,0.00000,0.00000,1.00000,0.00000,1.00000,1.00000,0.00000,0.00000,0.00000
642,13.00000,4.00000,2.00000,31.38750,0.00000,0.00000,1.00000,0.00000,1.00000,0.00000,0.00000,1.00000,0.00000
1294,28.50000,0.00000,0.00000,16.10000,0.00000,0.00000,1.00000,0.00000,1.00000,0.00000,0.00000,1.00000,0.00000
1015,55.50000,0.00000,0.00000,8.05000,0.00000,0.00000,1.00000,0.00000,1.00000,0.00000,0.00000,1.00000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
577,31.00000,0.00000,0.00000,21.00000,0.00000,1.00000,0.00000,1.00000,0.00000,0.00000,0.00000,1.00000,0.00000
1080,29.88113,0.00000,0.00000,7.75000,0.00000,0.00000,1.00000,0.00000,1.00000,0.00000,1.00000,0.00000,0.00000
117,30.00000,0.00000,0.00000,56.92920,1.00000,0.00000,0.00000,1.00000,0.00000,1.00000,0.00000,0.00000,0.00000
759,36.00000,1.00000,0.00000,17.40000,0.00000,0.00000,1.00000,1.00000,0.00000,0.00000,0.00000,1.00000,0.00000


In [18]:
y_train

306     0
927     0
642     0
1294    0
1015    0
       ..
577     1
1080    1
117     1
759     1
2       0
Name: survived, Length: 916, dtype: category
Categories (2, object): ['0', '1']

In [27]:
X_valid = test.drop(target_col, axis=1)
y_valid = test[target_col]

In [28]:
X_valid

Unnamed: 0,age,sibsp,parch,fare,pclass_1.0,pclass_2.0,pclass_3.0,sex_female,sex_male,embarked_C,embarked_Q,embarked_S,embarked_nan
1289,21.00000,1.00000,0.00000,6.49580,0.00000,0.00000,1.00000,0.00000,1.00000,0.00000,0.00000,1.00000,0.00000
352,40.00000,1.00000,1.00000,39.00000,0.00000,1.00000,0.00000,1.00000,0.00000,0.00000,0.00000,1.00000,0.00000
1189,4.00000,1.00000,1.00000,16.70000,0.00000,0.00000,1.00000,1.00000,0.00000,0.00000,0.00000,1.00000,0.00000
78,64.00000,0.00000,2.00000,83.15830,1.00000,0.00000,0.00000,1.00000,0.00000,1.00000,0.00000,0.00000,0.00000
974,30.00000,1.00000,0.00000,16.10000,0.00000,0.00000,1.00000,0.00000,1.00000,0.00000,0.00000,1.00000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
349,24.00000,0.00000,0.00000,13.00000,0.00000,1.00000,0.00000,1.00000,0.00000,0.00000,0.00000,1.00000,0.00000
39,48.00000,0.00000,0.00000,50.49580,1.00000,0.00000,0.00000,0.00000,1.00000,1.00000,0.00000,0.00000,0.00000
603,35.00000,1.00000,1.00000,20.25000,0.00000,0.00000,1.00000,1.00000,0.00000,0.00000,0.00000,1.00000,0.00000
574,29.00000,1.00000,0.00000,21.00000,0.00000,1.00000,0.00000,0.00000,1.00000,0.00000,0.00000,1.00000,0.00000


In [29]:
y_valid

1289    0
352     1
1189    1
78      1
974     0
       ..
349     1
39      0
603     1
574     0
764     0
Name: survived, Length: 393, dtype: category
Categories (2, object): ['0', '1']

# 学習

In [30]:
model = RandomForestClassifier(random_state=1)
model.fit(X_train, y_train)

RandomForestClassifier(random_state=1)

In [32]:
y_pred = model.predict(X_valid)

In [34]:
#テストデータの正解率
accuracy_score(y_valid, y_pred)

0.7786259541984732

In [37]:
print(classification_report(y_valid, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.82      0.82       243
           1       0.71      0.71      0.71       150

    accuracy                           0.78       393
   macro avg       0.77      0.76      0.77       393
weighted avg       0.78      0.78      0.78       393



In [40]:
with open('data/processing_titanic_train.pkl', 'rb', encoding="utf-8", errors='ignore') as f:
    valid = pickle.load(f)

ValueError: binary mode doesn't take an encoding argument

In [43]:
with open('data/processing_titanic_test.pkl', mode='rb') as f:
    mystr = pickle.load(f)

In [44]:
mystr

Unnamed: 0,survived,age,sibsp,parch,fare,pclass_1.0,pclass_2.0,pclass_3.0,sex_female,sex_male,embarked_C,embarked_Q,embarked_S,embarked_nan
1289,0,21.00000,1.00000,0.00000,6.49580,0.00000,0.00000,1.00000,0.00000,1.00000,0.00000,0.00000,1.00000,0.00000
352,1,40.00000,1.00000,1.00000,39.00000,0.00000,1.00000,0.00000,1.00000,0.00000,0.00000,0.00000,1.00000,0.00000
1189,1,4.00000,1.00000,1.00000,16.70000,0.00000,0.00000,1.00000,1.00000,0.00000,0.00000,0.00000,1.00000,0.00000
78,1,64.00000,0.00000,2.00000,83.15830,1.00000,0.00000,0.00000,1.00000,0.00000,1.00000,0.00000,0.00000,0.00000
974,0,30.00000,1.00000,0.00000,16.10000,0.00000,0.00000,1.00000,0.00000,1.00000,0.00000,0.00000,1.00000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
349,1,24.00000,0.00000,0.00000,13.00000,0.00000,1.00000,0.00000,1.00000,0.00000,0.00000,0.00000,1.00000,0.00000
39,0,48.00000,0.00000,0.00000,50.49580,1.00000,0.00000,0.00000,0.00000,1.00000,1.00000,0.00000,0.00000,0.00000
603,1,35.00000,1.00000,1.00000,20.25000,0.00000,0.00000,1.00000,1.00000,0.00000,0.00000,0.00000,1.00000,0.00000
574,0,29.00000,1.00000,0.00000,21.00000,0.00000,1.00000,0.00000,0.00000,1.00000,0.00000,0.00000,1.00000,0.00000
