In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import GridSearchCV

In [2]:
# 1、准备数据
titanic = pd.read_csv("http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt")

In [4]:
titanic.head()

Unnamed: 0,row.names,pclass,survived,name,age,embarked,home.dest,room,ticket,boat,sex
0,1,1st,1,"Allen, Miss Elisabeth Walton",29.0,Southampton,"St Louis, MO",B-5,24160 L221,2,female
1,2,1st,0,"Allison, Miss Helen Loraine",2.0,Southampton,"Montreal, PQ / Chesterville, ON",C26,,,female
2,3,1st,0,"Allison, Mr Hudson Joshua Creighton",30.0,Southampton,"Montreal, PQ / Chesterville, ON",C26,,(135),male
3,4,1st,0,"Allison, Mrs Hudson J.C. (Bessie Waldo Daniels)",25.0,Southampton,"Montreal, PQ / Chesterville, ON",C26,,,female
4,5,1st,1,"Allison, Master Hudson Trevor",0.9167,Southampton,"Montreal, PQ / Chesterville, ON",C22,,11,male


In [5]:
# 2、基本的数据处理
x = titanic[["pclass", "age", "sex"]]
y = titanic["survived"]

In [7]:
x.head()

Unnamed: 0,pclass,age,sex
0,1st,29.0,female
1,1st,2.0,female
2,1st,30.0,male
3,1st,25.0,female
4,1st,0.9167,male


In [8]:
y.head()

0    1
1    0
2    0
3    0
4    1
Name: survived, dtype: int64

In [10]:
# 处理缺失值
x["age"].fillna(x["age"].mean(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [14]:
# 把特征转换成字典形式
x = x.to_dict(orient="records")

In [18]:
# 划分数据集
x_train, x_test, y_train, y_test = train_test_split(x, y)

In [20]:
# 特征工程：字典的特征抽取
transfer = DictVectorizer()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

In [29]:
# 决策树预估器流程
# 1）实例化一个估计器类
estimator = DecisionTreeClassifier(criterion="entropy")
# 2）调用fit(训练集)
estimator.fit(x_train, y_train)

# 5、模型评估
# 方法1：比对真实值和预测值
y_predict = estimator.predict(x_test)
print("比对真实值和预测值：\n", y_test == y_predict)
# 方法2：直接计算准确率
score = estimator.score(x_test, y_test)
print("准确率为：\n", score)

# 可视化决策树
export_graphviz(estimator, out_file="titanic.dot", feature_names=transfer.get_feature_names())

比对真实值和预测值：
 441      True
545      True
963      True
1044     True
1291     True
939      True
94       True
1293    False
1115     True
576      True
455      True
1278     True
532      True
555      True
1064     True
353      True
1061     True
518     False
435      True
632      True
1150     True
1235    False
627     False
305      True
623      True
202      True
146     False
1298     True
1146     True
1294     True
        ...  
165      True
1008     True
119      True
783     False
207     False
899      True
76       True
610     False
583      True
418      True
288      True
653      True
442     False
1065     True
428      True
129     False
758      True
282      True
15      False
427      True
473     False
88       True
55       True
748     False
1256     True
622      True
965      True
1038     True
1284     True
853      True
Name: survived, Length: 329, dtype: bool
准确率为：
 0.7507598784194529


In [32]:
from sklearn.ensemble import RandomForestClassifier
# 随机森林预估器流程
estimator = RandomForestClassifier()
# 选择合适的超参数 - 网格搜索
param_dict = {"n_estimators": [120,200,300,500,800,1200], "max_depth":[5,8,15,25,30]}
estimator = GridSearchCV(estimator, param_grid=param_dict, cv=3)
estimator.fit(x_train, y_train)

# 模型评估
# 方法1：比对真实值和预测值
y_predict = estimator.predict(x_test)
print("比对真实值和预测值：\n", y_test == y_predict)
# 方法2：直接计算准确率
score = estimator.score(x_test, y_test)
print("准确率为：\n", score)

# 结果分析
print("在交叉验证中验证的最好结果:\n", estimator.best_score_)
print("最好的参数模型:\n", estimator.best_estimator_)
print("每次交叉验证后的验证集准确率结果和训练集准确率结果:\n", estimator.cv_results_)

比对真实值和预测值：
 441      True
545      True
963      True
1044     True
1291     True
939      True
94       True
1293    False
1115     True
576      True
455      True
1278     True
532      True
555      True
1064     True
353      True
1061     True
518     False
435      True
632      True
1150     True
1235    False
627     False
305      True
623      True
202      True
146     False
1298     True
1146     True
1294     True
        ...  
165      True
1008     True
119      True
783     False
207     False
899      True
76       True
610      True
583      True
418      True
288      True
653      True
442     False
1065     True
428      True
129     False
758      True
282      True
15       True
427      True
473     False
88       True
55       True
748     False
1256     True
622      True
965      True
1038     True
1284     True
853      True
Name: survived, Length: 329, dtype: bool
准确率为：
 0.7750759878419453
在交叉验证中验证的最好结果:
 0.8414634146341463
最好的参数模型:
 RandomForestClassifier