In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
import matplotlib.pyplot as plt
%matplotlib inline

# 读取数据

In [4]:
df = pd.read_csv("./data/泰坦尼克号/titanic_train.csv")


In [5]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [7]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


# 处理数据

## 删除多余的特征

In [8]:
df.drop(['Name', 'Ticket', 'Cabin'], inplace=True, axis=1)

In [9]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


## 将非数值型的转换为数值型

In [10]:
df['Sex'] = (df['Sex'] == 'male').astype('int')

In [11]:
# 查看Embarked的取值
df['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [12]:
labels = {
    'S': 0,
    'C': 1,
    'Q': 2
}
df['Embarked'] = df['Embarked'].map(labels)

## 处理缺失值

In [13]:
# 查看每个特征缺失值的数量
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Sex              0
Age            177
SibSp            0
Parch            0
Fare             0
Embarked         2
dtype: int64

In [14]:
# 将Embarked的缺失值数据删除
df.drop(df[df['Embarked'].isnull()].index, inplace=True)

In [15]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Sex              0
Age            177
SibSp            0
Parch            0
Fare             0
Embarked         0
dtype: int64

### 使用随机森林对Age的缺失值进行填补

In [16]:
df_y_train = df['Age'][df['Age'].notnull()]

In [17]:
df_y_test = df['Age'][df['Age'].isnull()]

In [18]:
df_x = df.iloc[:, df.columns != 'Age']

In [19]:
df_x_train = df_x.loc[df_y_train.index, :]

In [20]:
df_x_test = df_x.loc[df_y_test.index, :]

In [21]:
df_x_test

Unnamed: 0,PassengerId,Survived,Pclass,Sex,SibSp,Parch,Fare,Embarked
5,6,0,3,1,0,0,8.4583,2.0
17,18,1,2,1,0,0,13.0000,0.0
19,20,1,3,0,0,0,7.2250,1.0
26,27,0,3,1,0,0,7.2250,1.0
28,29,1,3,0,0,0,7.8792,2.0
...,...,...,...,...,...,...,...,...
859,860,0,3,1,0,0,7.2292,1.0
863,864,0,3,0,8,2,69.5500,0.0
868,869,0,3,1,0,0,9.5000,0.0
878,879,0,3,1,0,0,7.8958,0.0


In [22]:
pd.DataFrame(df_y_train).info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 0 to 890
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Age     712 non-null    float64
dtypes: float64(1)
memory usage: 11.1 KB


In [23]:
# 进行训练并填补异常值
clf = RandomForestRegressor(random_state=0, n_estimators=100)
clf = clf.fit(df_x_train, df_y_train)
df_y_test_p = clf.predict(df_x_test).astype('int')

In [24]:
df.loc[df_y_test.index, 'Age'] = df_y_test_p

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  889 non-null    int64  
 1   Survived     889 non-null    int64  
 2   Pclass       889 non-null    int64  
 3   Sex          889 non-null    int32  
 4   Age          889 non-null    float64
 5   SibSp        889 non-null    int64  
 6   Parch        889 non-null    int64  
 7   Fare         889 non-null    float64
 8   Embarked     889 non-null    float64
dtypes: float64(3), int32(1), int64(5)
memory usage: 106.0 KB


# 训练模型

In [26]:
y = df['Survived']
x = df.drop('Survived', axis=1)

In [27]:
Xtrain, Xtest, Ytrain, Ytest = train_test_split(x, y, test_size=0.3)

In [28]:
rfc = RandomForestClassifier(random_state=0)
rfc = rfc.fit(Xtrain, Ytrain)
rfc.score(Xtest, Ytest)

0.8239700374531835

# 处理需要预测的数据并写入文件

In [29]:
df_test = pd.read_csv("./data/泰坦尼克号/titanic_test.csv")

In [30]:
df_test.drop(['Name', 'Ticket', 'Cabin'], inplace=True, axis=1)

In [31]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,male,34.5,0,0,7.8292,Q
1,893,3,female,47.0,1,0,7.0,S
2,894,2,male,62.0,0,0,9.6875,Q
3,895,3,male,27.0,0,0,8.6625,S
4,896,3,female,22.0,1,1,12.2875,S


In [32]:
df_test['Sex'] = (df_test['Sex'] == 'male').astype('int')

In [33]:
df_test['Embarked'] = df_test['Embarked'].map(labels)

In [34]:
df_test

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,1,34.5,0,0,7.8292,2
1,893,3,0,47.0,1,0,7.0000,0
2,894,2,1,62.0,0,0,9.6875,2
3,895,3,1,27.0,0,0,8.6625,0
4,896,3,0,22.0,1,1,12.2875,0
...,...,...,...,...,...,...,...,...
413,1305,3,1,,0,0,8.0500,0
414,1306,1,0,39.0,0,0,108.9000,1
415,1307,3,1,38.5,0,0,7.2500,0
416,1308,3,1,,0,0,8.0500,0


In [35]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Sex          418 non-null    int32  
 3   Age          332 non-null    float64
 4   SibSp        418 non-null    int64  
 5   Parch        418 non-null    int64  
 6   Fare         417 non-null    float64
 7   Embarked     418 non-null    int64  
dtypes: float64(2), int32(1), int64(5)
memory usage: 24.6 KB


## 对缺失值进行处理

In [36]:
df_test['Fare'] = df_test['Fare'].fillna(df_test['Fare'].mean())

In [37]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Sex          418 non-null    int32  
 3   Age          332 non-null    float64
 4   SibSp        418 non-null    int64  
 5   Parch        418 non-null    int64  
 6   Fare         418 non-null    float64
 7   Embarked     418 non-null    int64  
dtypes: float64(2), int32(1), int64(5)
memory usage: 24.6 KB


In [38]:
df_demo = df_test
y_demo = df_demo['Age']
df_demo = df_demo.iloc[:, df_demo.columns != 'Age']
# 分离训练集和预测集
y_demo_train = y_demo.loc[y_demo.notnull()]
y_demo_test = y_demo.loc[y_demo.isnull()]
x_demo_train = df_demo.loc[y_demo_train.index, :]
x_demo_test = df_demo.loc[y_demo_test.index, :]

rfc_demo = RandomForestRegressor(random_state=0, n_estimators=100)
rfc_demo = rfc_demo.fit(x_demo_train, y_demo_train)
y_demo_test = rfc_demo.predict(x_demo_test).astype('int')

In [39]:
df_test.loc[df_test['Age'].isnull(), 'Age'] = y_demo_test

### 未调参测试

In [40]:
rfc = RandomForestClassifier(random_state=0)
rfc = rfc.fit(x, y)
y_submit = rfc.predict(df_test)

In [41]:
submit = pd.DataFrame(df_test['PassengerId'])
submit['Survived'] = y_submit

In [42]:
submit.set_index('PassengerId', inplace=True)

In [43]:
submit

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,0
...,...
1305,0
1306,1
1307,0
1308,0


In [44]:
submit.to_csv('./data/泰坦尼克号/titanic_test_submit.csv')

**未调参数的Kaggle提交分数为0.75598**

### 调参测试

In [214]:
# 使用网格搜索寻找最优参数
params = {
    'criterion': ('gini', 'entropy'),
    'n_estimators': [*range(15, 20)],
    'max_depth': [*range(1, 15)],
    'min_samples_leaf': [*range(1, 10)],
    'min_impurity_decrease': [*np.linspace(0, 0.5, 20)],
    'max_features': [*range(2, 8)]
}
clf_gs = RandomForestClassifier(random_state=0)
SG = GridSearchCV(clf_gs, params, cv=10)
SG = SG.fit(x, y)
SG.best_score_

0.850370275791624

In [215]:
SG.best_params_

{'criterion': 'entropy',
 'max_depth': 11,
 'max_features': 6,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 3,
 'n_estimators': 18}

In [236]:
rfc = RandomForestClassifier(random_state=0,
                             criterion='entropy',
                             max_depth=11,
                             min_samples_leaf=3,
                             max_features=6,
                             n_estimators=18)
rfc = rfc.fit(Xtrain, Ytrain)
rfc.score(Xtest, Ytest)

0.8576779026217228

In [242]:
rfc = RandomForestClassifier(random_state=0,
                             criterion='entropy',
                             max_depth=11,
                             min_samples_leaf=3,
                             max_features=6,
                             n_estimators=18)
rfc = rfc.fit(x, y)
predict = rfc.predict(df_test)

In [243]:
predict

array([0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [244]:
submit = pd.DataFrame(df_test['PassengerId'])
submit['Survived'] = predict
submit.set_index('PassengerId', inplace=True)

In [245]:
submit

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,1
...,...
1305,0
1306,1
1307,0
1308,0


In [246]:
submit.to_csv('./data/泰坦尼克号/titanic_test_submit2.csv')

**这次的分数为0.76076，比没调参的时候还要搞一些，说明使用网格搜索调参后更加符合比赛数据的预测，但是依旧不是很理想，所以需要在进行手动调参，以当前模型过拟合的方向进行调参**

**调节剪枝问题进行调参**

|参数|分数|
|:-:|:-:|
|random_state=0,criterion='gini',max_depth=10,min_samples_leaf=4,max_features=6,n_estimators=18| 0.77033|
|random_state=0,criterion='gini',max_depth=10,min_samples_leaf=5,max_features=6,n_estimators=18|0.78947|
|random_state=0,criterion='gini',max_depth=9,min_samples_leaf=4,max_features=6,n_estimators=18|0.75119|
|random_state=0,criterion='gini',max_depth=10,min_samples_leaf=5,max_features=7,n_estimators=18|0.78947|
|random_state=0,criterion='gini',max_depth=9,min_samples_leaf=5,max_features=7,n_estimators=18|0.78468|
|random_state=0,criterion='gini',max_depth=8,min_samples_leaf=5,max_features=7,n_estimators=18|0.78947|
|random_state=0,criterion='gini',max_depth=8,min_samples_leaf=5,max_features=7,n_estimators=20|0.78468|
|random_state=0,criterion='entropy',max_depth=8,min_samples_leaf=5,max_features=7,n_estimators=18|0.78468|
|random_state=0,criterion='gini',max_depth=5,min_samples_leaf=5,max_features=7,n_estimators=18|0.77990|
|random_state=0,criterion='gini',max_depth=6,min_samples_leaf=5,max_features=7,n_estimators=18|0.78468|
|random_state=0,criterion='gini',max_depth=7,min_samples_leaf=5,max_features=7,n_estimators=18|0.78468|

In [66]:
rfc = RandomForestClassifier(random_state=0,
                             criterion='entropy',
                             max_depth=5,
                             min_samples_leaf=7,
                             max_features=7,
                             n_estimators=18)
rfc = rfc.fit(x, y)
predict = rfc.predict(df_test)
submit = pd.DataFrame(df_test['PassengerId'])
submit['Survived'] = predict
submit.set_index('PassengerId', inplace=True)
submit

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,0
...,...
1305,0
1306,1
1307,0
1308,0


In [67]:
submit.to_csv('./data/泰坦尼克号/titanic_test_submit3.csv')

**以上参数得到的分数为0.78947**