## データの読み込み

In [211]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides any deprecation or convergence
# warnings emitted by later cells -- consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [212]:
from google.colab import drive

# Mount Google Drive so the competition files are reachable from the Colab runtime.
drive.mount('/content/drive')

# Directory on the mounted Drive that holds the competition CSV files.
path = "/content/drive/MyDrive/GCI2023/01.competition1/data"

# Read both splits in one pass; each file name is appended to `path` unchanged.
df_train, df_test = [pd.read_csv(path + csv_name)
                     for csv_name in ('/train.csv', '/test.csv')]

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## データの前処理

### 欠損値の処理

In [213]:
# Count the missing values in every column of the training split.
df_train.isna().sum()

PassengerId      0
Perished         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [214]:
# Count the missing values in every column of the test split.
df_test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [215]:
# Pool the Age and Fare columns of both splits. Only `fare` is used below (to
# get one shared mean); `age` is kept available for a pooled-mean alternative.
age = pd.concat([df_train['Age'], df_test['Age']])
fare = pd.concat([df_train['Fare'], df_test['Fare']])

# Age: impute each split with the mode of its own observed ages. `mode()` can
# return several values on a tie, so tied modes are averaged. The fill values
# are computed here instead of being typed in as literals, so they stay
# correct if the input data changes.
# The filled column is assigned back (rather than calling
# `fillna(..., inplace=True)` on a column selection) so the fill still takes
# effect when pandas Copy-on-Write is active.
train_value = df_train['Age'].dropna().mode()
df_train['Age'] = df_train['Age'].fillna(train_value.mean())

test_value = df_test['Age'].dropna().mode().mean()
df_test['Age'] = df_test['Age'].fillna(test_value)

# Fare: impute with the mean fare over train and test combined.
fare_mean = fare.mean()
df_train['Fare'] = df_train['Fare'].fillna(fare_mean)
df_test['Fare'] = df_test['Fare'].fillna(fare_mean)


In [216]:
# Remove the Cabin column entirely from both splits (the whole column is
# dropped, not just its missing entries).
for frame in (df_train, df_test):
    frame.drop(columns='Cabin', inplace=True)

In [217]:
# Fill missing embarkation ports with 'S'.
# NOTE(review): 'S' is presumably the most frequent port -- confirm against the data.
# Assign the filled column back instead of calling `fillna(..., inplace=True)`
# on a column selection, so the fill still takes effect under pandas
# Copy-on-Write.
df_train['Embarked'] = df_train['Embarked'].fillna('S')
df_test['Embarked'] = df_test['Embarked'].fillna('S')

### 列の処理

In [218]:
# Drop the Name and Ticket columns from both splits.
for frame in (df_train, df_test):
    frame.drop(columns=['Name', 'Ticket'], inplace=True)

In [219]:
# Sex: encode as a 0/1 dummy variable (male -> 0, female -> 1) in both splits.
sex_codes = {'male': 0, 'female': 1}
for frame in (df_train, df_test):
    frame.replace({'Sex': sex_codes}, inplace=True)

In [220]:
# Embarked: one-hot encode the port of embarkation (C / Q / S).
# Train and test are encoded together so both splits receive the same set of
# dummy columns in the same order.
embarked = pd.concat([df_train['Embarked'], df_test['Embarked']])
embarked_ohe = pd.get_dummies(embarked)

# Split the encoded rows back by position. The boundary is taken from the
# actual length of the training frame rather than a hard-coded row count, and
# `.iloc` makes the positional intent explicit (the concatenated index
# contains duplicate labels).
n_train = len(df_train)
embarked_ohe_train = embarked_ohe.iloc[:n_train]
embarked_ohe_test = embarked_ohe.iloc[n_train:]

# Append the dummy columns and remove the original categorical column.
df_train = pd.concat([df_train, embarked_ohe_train], axis=1).drop(columns='Embarked')
df_test = pd.concat([df_test, embarked_ohe_test], axis=1).drop(columns='Embarked')

## モデルの構築

In [221]:
# Training frame: column 0 is skipped, column 1 is the label, and everything
# from column 2 onward is a feature.
train_features = df_train.iloc[:, 2:]
train_labels = df_train.iloc[:, 1]
X = train_features.to_numpy()
y = train_labels.to_numpy()

# The test frame has no label column, so its features start at column 1.
X_test = df_test.iloc[:, 1:].to_numpy()

In [222]:
# Hold out 30% of the training data as a validation set so overfitting can be
# detected by comparing training accuracy with validation accuracy.
split_parts = train_test_split(X, y, test_size=0.3, random_state=25)
X_train, X_valid, y_train, y_valid = split_parts

In [223]:
# Build the baseline random forest model.
rfc_settings = dict(max_depth=10, min_samples_leaf=1, n_estimators=100,
                    n_jobs=-1, random_state=25)
rfc = RandomForestClassifier(**rfc_settings)
rfc.fit(X_train, y_train)

In [224]:
# Accuracy check.
# max_depth (maximum depth of each decision tree) and min_samples_leaf (minimum
# number of samples in a leaf node) are hyper-parameters.
# A very large max_depth increases the tendency to overfit, while a max_depth
# that is too small gives low accuracy (underfitting).
# The saved output of this cell shows 0.945 on the training rows versus 0.825
# on the held-out rows; a gap like this points to overfitting that should be
# addressed. The line labelled "Test Score" is computed on the validation split.
rfc_train_acc = round(rfc.score(X_train, y_train), 3)
rfc_valid_acc = round(rfc.score(X_valid, y_valid), 3)

print('Train Score: {}'.format(rfc_train_acc))
print(' Test Score: {}'.format(rfc_valid_acc))

Train Score: 0.945
 Test Score: 0.825


In [225]:
# Grid search evaluates the model's accuracy for every combination in a
# discrete set of hyper-parameter values, so that several hyper-parameters can
# be tuned at the same time.
param_grid = {
    'max_depth': [3, 5, 7],
    'min_samples_leaf': [1, 2, 4],
}

# Enumerate the combinations with max_depth as the outer (slowest-changing) key.
combinations = [(depth, leaf)
                for depth in param_grid['max_depth']
                for leaf in param_grid['min_samples_leaf']]

for max_depth, min_samples_leaf in combinations:
    rfc_grid = RandomForestClassifier(
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        n_estimators=100,
        n_jobs=-1,
        random_state=25,
    )
    rfc_grid.fit(X_train, y_train)

    grid_train_acc = round(rfc_grid.score(X_train, y_train), 3)
    grid_valid_acc = round(rfc_grid.score(X_valid, y_valid), 3)  # printed as "Test Score"

    print('max_depth: {}, min_samples_leaf: {}'.format(max_depth, min_samples_leaf))
    print('    Train Score: {}, Test Score: {}'.format(grid_train_acc, grid_valid_acc))

max_depth: 3, min_samples_leaf: 1
    Train Score: 0.841, Test Score: 0.813
max_depth: 3, min_samples_leaf: 2
    Train Score: 0.839, Test Score: 0.817
max_depth: 3, min_samples_leaf: 4
    Train Score: 0.843, Test Score: 0.813
max_depth: 5, min_samples_leaf: 1
    Train Score: 0.86, Test Score: 0.81
max_depth: 5, min_samples_leaf: 2
    Train Score: 0.857, Test Score: 0.81
max_depth: 5, min_samples_leaf: 4
    Train Score: 0.852, Test Score: 0.806
max_depth: 7, min_samples_leaf: 1
    Train Score: 0.907, Test Score: 0.817
max_depth: 7, min_samples_leaf: 2
    Train Score: 0.881, Test Score: 0.806
max_depth: 7, min_samples_leaf: 4
    Train Score: 0.872, Test Score: 0.802


In [226]:
# Grid search combined with cross-validation (cv=5): the data is split into
# five folds, and each candidate is scored on a held-out fold after training on
# the remaining ones; the fold scores are averaged to compare candidates.
param_grid = {
    'max_depth': [3, 5, 7, 9, 11],
    'min_samples_leaf': [1, 2, 4, 6],
}

base_forest = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=25)
rfc_gs = GridSearchCV(base_forest, param_grid, cv=5)
rfc_gs.fit(X, y)

best_cv_score = round(rfc_gs.best_score_, 3)
print('Best Parameters: {}'.format(rfc_gs.best_params_))
print('CV Score: {}'.format(best_cv_score))

Best Parameters: {'max_depth': 11, 'min_samples_leaf': 2}
CV Score: 0.837


## 特徴量エンジニアリング

特征工程（feature engineering）是指通过对自变量进行加工，尝试构造出对模型学习有效的变量（特征）的过程，大致有两个方向。\
一个方向是使用与问题设定无关的通用方法，如对分类数据进行编码或创建交叉项等。\
另一个方向是基于领域知识进行特征工程。领域知识是指与每个特定问题相关的背景知识。


In [227]:
def add_engineered_features(df):
    """Return a copy of ``df`` with the engineered feature columns appended.

    Added columns, in this order:
      - ``Family``:  SibSp + Parch
      - ``Afare``:   Age * Fare
      - ``Afamily``: Age * Family
      - ``Age2``:    Age squared
      - ``Family2``: Family squared

    Using one helper for both splits guarantees identical column names and
    column order in train and test (the test frame previously carried a
    misspelled ``Afamliy`` column).
    """
    out = df.copy()
    out['Family'] = df['SibSp'] + df['Parch']
    out['Afare'] = df['Age'] * df['Fare']
    out['Afamily'] = df['Age'] * out['Family']
    out['Age2'] = df['Age'] ** 2
    out['Family2'] = out['Family'] ** 2
    return out


# Build the feature-engineered frames; the original frames are left untouched.
df_fe_train = add_engineered_features(df_train)
df_fe_test = add_engineered_features(df_test)

In [228]:
# Refit the random forest on the feature-engineered data, using the
# hyper-parameters selected by the grid search.
# Training frame: column 0 is skipped, column 1 is the label, the rest are features.
X_fe_train = df_fe_train.iloc[:, 2:].values
y_fe_train = df_fe_train.iloc[:, 1].values

# The test frame has no label column, so its features start at column 1.
X_fe_test = df_fe_test.iloc[:, 1:].values

# Note: X_fe_train / y_fe_train are rebound here to the 70% training portion.
X_fe_train, X_fe_valid, y_fe_train, y_fe_valid = train_test_split(
    X_fe_train, y_fe_train, test_size=0.3, random_state=25)

# Take max_depth / min_samples_leaf from the fitted GridSearchCV object rather
# than retyping them, so this model cannot drift out of sync with the search.
rfc_fe = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=25,
                                **rfc_gs.best_params_)
rfc_fe.fit(X_fe_train, y_fe_train)

# The line labelled "Test Score" is computed on the validation split.
print('Train Score: {}'.format(round(rfc_fe.score(X_fe_train, y_fe_train), 3)))
print(' Test Score: {}'.format(round(rfc_fe.score(X_fe_valid, y_fe_valid), 3)))

Train Score: 0.926
 Test Score: 0.799


## 他のモデル

In [259]:
# Logistic regression trained on the feature-engineered training split.
# NOTE(review): the features are not scaled and the default iteration limit is
# used; with warnings silenced, a convergence warning would not be visible --
# verify.
lr = LogisticRegression(random_state=25)
lr.fit(X_fe_train, y_fe_train)

lr_train_acc = round(lr.score(X_fe_train, y_fe_train), 3)
lr_valid_acc = round(lr.score(X_fe_valid, y_fe_valid), 3)  # printed as "Test Score"

print('Logistic Regression \n')
print('Train Score: {}'.format(lr_train_acc))
print(' Test Score: {}'.format(lr_valid_acc))

Logistic Regression 

Train Score: 0.811
 Test Score: 0.776


In [265]:
# Multilayer perceptron model.
# This model is fit on X_train (the features without the engineered columns),
# unlike the random forest and logistic regression above, which use X_fe_train.
hidden_layers = (100, 100, 10)
mlpc = MLPClassifier(hidden_layer_sizes=hidden_layers, random_state=25)
mlpc.fit(X_train, y_train)

mlpc_train_acc = round(mlpc.score(X_train, y_train), 3)
mlpc_valid_acc = round(mlpc.score(X_valid, y_valid), 3)  # printed as "Test Score"

print('Multilayer Perceptron \n')
print('Train Score: {}'.format(mlpc_train_acc))
print(' Test Score: {}'.format(mlpc_valid_acc))

Multilayer Perceptron 

Train Score: 0.831
 Test Score: 0.784


## モデルのアンサンブリング

アンサンブリング：将多个模型组合成一个模型的方法。集成方法包括Bagging、Boosting、Stacking等多种技术，但最简单的方法是将多个模型的预测值进行加权平均，得到最终的预测值。

In [266]:
# Average the class probabilities predicted by the three models and pick, for
# each row, the class with the highest mean probability.
# Each model is given the test matrix matching the features it was trained on:
# the random forest and logistic regression use X_fe_test, the MLP uses X_test.
rfc_pred = rfc_fe.predict_proba(X_fe_test)
lr_pred = lr.predict_proba(X_fe_test)
mlpc_pred = mlpc.predict_proba(X_test)

# NOTE(review): assumes all three models expose the classes in the same column
# order -- confirm via each model's `classes_`.
proba_total = rfc_pred + lr_pred + mlpc_pred
pred_proba = proba_total / 3
pred = np.argmax(pred_proba, axis=1)

## 予測の出力・提出

In [268]:
# `path` is rebound here to the competition root folder (one level above the
# data directory used when loading train/test).
path = "/content/drive/MyDrive/GCI2023/01.competition1/"

# Load the submission template that the predictions will be written into.
submission_csv = path + 'gender_submission.csv'
submission = pd.read_csv(submission_csv)

In [269]:
# `pred` is the argmax column index of the averaged probabilities. Assuming the
# models' classes are the 0/1 values of the training label, 1 - pred flips the
# prediction into the opposite outcome.
# NOTE(review): confirm the submission file expects a 'Survived' column and
# that its row order matches df_test.
survived = 1 - pred
submission['Survived'] = survived

In [270]:
from google.colab import files

# Save the submission inside the Colab runtime.
# The saved CSV file is deleted when the runtime ends, so it is downloaded
# right away.
output_name = 'submission.csv'
submission.to_csv(output_name, index=False)
files.download(output_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>