In [26]:
# 泰坦尼克号乘客生存预测

In [27]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [28]:
# 获取数据
titanic = pd.read_csv("../data/titanic.csv")
titanic

Unnamed: 0,row.names,pclass,survived,name,age,embarked,home.dest,room,ticket,boat,sex
0,1,1st,1,"Allen, Miss Elisabeth Walton",29.0000,Southampton,"St Louis, MO",B-5,24160 L221,2,female
1,2,1st,0,"Allison, Miss Helen Loraine",2.0000,Southampton,"Montreal, PQ / Chesterville, ON",C26,,,female
2,3,1st,0,"Allison, Mr Hudson Joshua Creighton",30.0000,Southampton,"Montreal, PQ / Chesterville, ON",C26,,(135),male
3,4,1st,0,"Allison, Mrs Hudson J.C. (Bessie Waldo Daniels)",25.0000,Southampton,"Montreal, PQ / Chesterville, ON",C26,,,female
4,5,1st,1,"Allison, Master Hudson Trevor",0.9167,Southampton,"Montreal, PQ / Chesterville, ON",C22,,11,male
...,...,...,...,...,...,...,...,...,...,...,...
1308,1309,3rd,0,"Zakarian, Mr Artun",,,,,,,male
1309,1310,3rd,0,"Zakarian, Mr Maprieder",,,,,,,male
1310,1311,3rd,0,"Zenn, Mr Philip",,,,,,,male
1311,1312,3rd,0,"Zievens, Rene",,,,,,,female


In [29]:
# 筛选特征值与目标值
x = titanic[["pclass", "age", "sex"]]
y = titanic["survived"]
x

Unnamed: 0,pclass,age,sex
0,1st,29.0000,female
1,1st,2.0000,female
2,1st,30.0000,male
3,1st,25.0000,female
4,1st,0.9167,male
...,...,...,...
1308,3rd,,male
1309,3rd,,male
1310,3rd,,male
1311,3rd,,female


In [30]:
y

0       1
1       0
2       0
3       0
4       1
       ..
1308    0
1309    0
1310    0
1311    0
1312    0
Name: survived, Length: 1313, dtype: int64

In [31]:
# 缺失值处理
x["age"].fillna(x["age"].mean(), inplace=True)
x

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x["age"].fillna(x["age"].mean(), inplace=True)


Unnamed: 0,pclass,age,sex
0,1st,29.000000,female
1,1st,2.000000,female
2,1st,30.000000,male
3,1st,25.000000,female
4,1st,0.916700,male
...,...,...,...
1308,3rd,31.194181,male
1309,3rd,31.194181,male
1310,3rd,31.194181,male
1311,3rd,31.194181,female


In [32]:
# 特征值转成字典
x = x.to_dict(orient="records")
x

[{'pclass': '1st', 'age': 29.0, 'sex': 'female'},
 {'pclass': '1st', 'age': 2.0, 'sex': 'female'},
 {'pclass': '1st', 'age': 30.0, 'sex': 'male'},
 {'pclass': '1st', 'age': 25.0, 'sex': 'female'},
 {'pclass': '1st', 'age': 0.9167, 'sex': 'male'},
 {'pclass': '1st', 'age': 47.0, 'sex': 'male'},
 {'pclass': '1st', 'age': 63.0, 'sex': 'female'},
 {'pclass': '1st', 'age': 39.0, 'sex': 'male'},
 {'pclass': '1st', 'age': 58.0, 'sex': 'female'},
 {'pclass': '1st', 'age': 71.0, 'sex': 'male'},
 {'pclass': '1st', 'age': 47.0, 'sex': 'male'},
 {'pclass': '1st', 'age': 19.0, 'sex': 'female'},
 {'pclass': '1st', 'age': 31.19418104265403, 'sex': 'female'},
 {'pclass': '1st', 'age': 31.19418104265403, 'sex': 'male'},
 {'pclass': '1st', 'age': 31.19418104265403, 'sex': 'male'},
 {'pclass': '1st', 'age': 50.0, 'sex': 'female'},
 {'pclass': '1st', 'age': 24.0, 'sex': 'male'},
 {'pclass': '1st', 'age': 36.0, 'sex': 'male'},
 {'pclass': '1st', 'age': 37.0, 'sex': 'male'},
 {'pclass': '1st', 'age': 47.0, 

In [33]:
# 数据集划分
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=22)

In [34]:
# 字典特征抽取
transfer = DictVectorizer(sparse=False)
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)
x_train

array([[19.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [28.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [17.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       ...,
       [31.19418104,  0.        ,  0.        ,  1.        ,  0.        ,
         1.        ],
       [36.        ,  1.        ,  0.        ,  0.        ,  1.        ,
         0.        ],
       [31.19418104,  0.        ,  0.        ,  1.        ,  0.        ,
         1.        ]])

In [35]:
# 字典特征抽取后的特征名称
transfer.get_feature_names_out()

array(['age', 'pclass=1st', 'pclass=2nd', 'pclass=3rd', 'sex=female',
       'sex=male'], dtype=object)

In [36]:
# 随机森林估计器
estimator = RandomForestClassifier()
# 网格搜索
param_dict = {"n_estimators": [120,200,300,500,800,1200], "max_depth": [5,8,15,25,30]}
estimator = GridSearchCV(estimator, param_grid=param_dict, cv=3)
estimator.fit(x_train, y_train)

In [37]:
# 模型评估
# 方法1：直接对比真实值和预测值
y_predict = estimator.predict(x_test)
print("y_predict:\n", y_predict)
print("直接对比真实值和预测值：\n", y_test == y_predict)

# 方法2：计算准确率
score = estimator.score(x_test, y_test)
print("准确率为：\n", score)

y_predict:
 [0 0 0 0 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 1 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0
 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1
 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 1
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 1 0 0 0 0 0 1 1 1 1 0 0 0 0 0
 0 0 0 1 1 1 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 1 0
 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 0 0 1 0
 0 1 1 1 0 0 1 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1 0 0 0 0 1]
直接对比真实值和预测值：
 831      True
261      True
1210     True
1155     True
255      True
        ...  
1146     True
1125    False
386      True
1025    False
337      True
Name: survived, Length: 329, dtype: bool
准确率为：
 0.790273556231003


In [38]:
# 最佳参数
print("最佳参数：\n", estimator.best_params_)
# 最佳结果
print("最佳结果：\n", estimator.best_score_)
# 最佳估计器
print("最佳估计器:\n", estimator.best_estimator_)
# 交叉验证结果
print("交叉验证结果：\n", estimator.cv_results_)

最佳参数：
 {'max_depth': 5, 'n_estimators': 200}
最佳结果：
 0.8363821138211381
最佳估计器:
 RandomForestClassifier(max_depth=5, n_estimators=200)
交叉验证结果：
 {'mean_fit_time': array([0.12569483, 0.17203848, 0.27206095, 0.43276342, 0.69782297,
       1.06323838, 0.11102478, 0.18237448, 0.26672649, 0.45243494,
       0.75050187, 1.11791746, 0.11769287, 0.19571034, 0.28139599,
       0.46810516, 0.74483355, 1.10891533, 0.11002461, 0.18370779,
       0.27839581, 0.4587694 , 0.73216422, 1.09191124, 0.10835759,
       0.18137407, 0.28006252, 0.4664379 , 0.74216612, 1.10191361]), 'std_fit_time': array([2.71396712e-02, 1.41456064e-03, 4.24311992e-03, 9.39452047e-03,
       2.78313171e-02, 1.18627418e-02, 8.16437433e-04, 3.09203621e-03,
       2.86792682e-03, 1.02116943e-02, 1.72154288e-02, 1.39157385e-02,
       6.60092704e-03, 5.43740427e-03, 1.88643677e-03, 4.08345235e-03,
       3.77225534e-03, 1.71762632e-02, 2.97360213e-07, 3.40006629e-03,
       3.30032302e-03, 8.38166525e-03, 2.16104684e-03, 1.48207563