In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn import metrics

In [2]:
train_data = pd.read_csv('./train.csv')
test_data = pd.read_csv('./test.csv')

In [3]:
print('查看数据信息：列名、非空个数、类型等')
print(train_data.info())
print('-'*30)
print('查看数据摘要')
print(train_data.describe())
print('-'*30)
print('查看离散数据分布')
print(train_data.describe(include=['O']))
print('-'*30)
print('查看前5条数据')
print(train_data.head())
print('-'*30)
print('查看后5条数据')
print(train_data.tail())

查看数据信息：列名、非空个数、类型等
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None
------------------------------
查看数据摘要
       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%

In [4]:
# 使用平均年龄来填充年龄中的nan值
train_data['Age'].fillna(train_data['Age'].mean(), inplace=True)
test_data['Age'].fillna(test_data['Age'].mean(),inplace=True)
# 使用票价的均值填充票价中的nan值
train_data['Fare'].fillna(train_data['Fare'].mean(), inplace=True)
test_data['Fare'].fillna(test_data['Fare'].mean(),inplace=True)

In [5]:
print(train_data['Embarked'].value_counts())

S    644
C    168
Q     77
Name: Embarked, dtype: int64


In [6]:
# 使用登录最多的港口来填充登录港口的nan值
train_data['Embarked'].fillna('S', inplace=True)
test_data['Embarked'].fillna('S',inplace=True)

In [7]:
# 特征选择
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
train_features = train_data[features]
train_labels = train_data['Survived']
test_features = test_data[features]
print('特征值')
print(train_features)

特征值
     Pclass     Sex        Age  SibSp  Parch      Fare Embarked
0         3    male  22.000000      1      0    7.2500        S
1         1  female  38.000000      1      0   71.2833        C
2         3  female  26.000000      0      0    7.9250        S
3         1  female  35.000000      1      0   53.1000        S
4         3    male  35.000000      0      0    8.0500        S
5         3    male  29.699118      0      0    8.4583        Q
6         1    male  54.000000      0      0   51.8625        S
7         3    male   2.000000      3      1   21.0750        S
8         3  female  27.000000      0      2   11.1333        S
9         2  female  14.000000      1      0   30.0708        C
10        3  female   4.000000      1      1   16.7000        S
11        1  female  58.000000      0      0   26.5500        S
12        3    male  20.000000      0      0    8.0500        S
13        3    male  39.000000      1      5   31.2750        S
14        3  female  14.000000      

In [8]:
dvec=DictVectorizer(sparse=False)
train_features=dvec.fit_transform(train_features.to_dict(orient='record'))
print(dvec.feature_names_)
test_features=dvec.transform(test_features.to_dict(orient='record'))

['Age', 'Embarked=C', 'Embarked=Q', 'Embarked=S', 'Fare', 'Parch', 'Pclass', 'Sex=female', 'Sex=male', 'SibSp']


## 使用TPOT方法

In [9]:
from tpot import TPOTClassifier

In [10]:
tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2)
tpot.fit(train_features, train_labels)

HBox(children=(IntProgress(value=0, description='Optimization Progress', max=120, style=ProgressStyle(descript…

Generation 1 - Current best internal CV score: 0.83504910122644
Generation 2 - Current best internal CV score: 0.83504910122644
Generation 3 - Current best internal CV score: 0.83504910122644
Generation 4 - Current best internal CV score: 0.8384200295979802
Generation 5 - Current best internal CV score: 0.8384200295979802

Best pipeline: XGBClassifier(input_matrix, learning_rate=0.1, max_depth=7, min_child_weight=3, n_estimators=100, nthread=1, subsample=0.7000000000000001)


TPOTClassifier(config_dict=None, crossover_rate=0.1, cv=5,
        disable_update_check=False, early_stop=None, generations=5,
        max_eval_time_mins=5, max_time_mins=None, memory=None,
        mutation_rate=0.9, n_jobs=1, offspring_size=None,
        periodic_checkpoint_folder=None, population_size=20,
        random_state=None, scoring=None, subsample=1.0, template=None,
        use_dask=False, verbosity=2, warm_start=False)

In [11]:
acc_tpot = tpot.score(train_features, train_labels)
print(u'score准确率为 %.4lf' % acc_tpot)

tpot.export('tpot_mnist_pipeline.py')

score准确率为 0.9024


## 使用XGbosst模型（tpot方法得到模型）

In [19]:
from xgboost import XGBClassifier

In [20]:
xgb = XGBClassifier(
    learning_rate=0.1, 
    max_depth=7, 
    min_child_weight=3, 
    n_estimators=100, 
    nthread=1, subsample=0.7000000000000001)

# 决策树训练
xgb.fit(train_features, train_labels)

# 得到XGBClassifier准确率(基于训练集)
acc_xgb = round(xgb.score(train_features, train_labels), 6)
print(u'score准确率为 %.4lf' % acc_xgb)

# 使用K折交叉验证 统计决策树准确率
print(u'cross_val_score准确率为 %.4lf' % np.mean(cross_val_score(xgb, train_features, train_labels, cv=10)))

score准确率为 0.9024
cross_val_score准确率为 0.8284


## 使用决策树模型

In [13]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

In [14]:
# 构造ID3决策树
clf = DecisionTreeClassifier(criterion='entropy')
# 决策树训练
clf.fit(train_features, train_labels)

# 得到决策树准确率(基于训练集)
acc_decision_tree = round(clf.score(train_features, train_labels), 6)
print(u'score准确率为 %.4lf' % acc_decision_tree)

# 使用K折交叉验证 统计决策树准确率
print(u'cross_val_score准确率为 %.4lf' % np.mean(cross_val_score(clf, train_features, train_labels, cv=10)))

score准确率为 0.9820
cross_val_score准确率为 0.7836


## 使用支持向量机

In [15]:
from sklearn.svm import SVC

In [18]:
# 构造ID3决策树
svc = SVC()
# 决策树训练
svc.fit(train_features, train_labels)

# 得到SVC准确率(基于训练集)
acc_SVC = round(svc.score(train_features, train_labels), 6)
print(u'score准确率为 %.4lf' % acc_SVC)

# 使用K折交叉验证 统计决策树准确率
print(u'cross_val_score准确率为 %.4lf' % np.mean(cross_val_score(svc, train_features, train_labels, cv=10)))



score准确率为 0.8900




cross_val_score准确率为 0.7264


## 预测

In [22]:
pred_labels = tpot.predict(test_features)

In [26]:
test_data['Survived '] = pred_labels
test_data

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,892,3,"Kelly, Mr. James",male,34.50000,0,0,330911,7.8292,,Q,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.00000,1,0,363272,7.0000,,S,0
2,894,2,"Myles, Mr. Thomas Francis",male,62.00000,0,0,240276,9.6875,,Q,0
3,895,3,"Wirz, Mr. Albert",male,27.00000,0,0,315154,8.6625,,S,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.00000,1,1,3101298,12.2875,,S,0
5,897,3,"Svensson, Mr. Johan Cervin",male,14.00000,0,0,7538,9.2250,,S,0
6,898,3,"Connolly, Miss. Kate",female,30.00000,0,0,330972,7.6292,,Q,1
7,899,2,"Caldwell, Mr. Albert Francis",male,26.00000,1,1,248738,29.0000,,S,0
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.00000,0,0,2657,7.2292,,C,1
9,901,3,"Davies, Mr. John Samuel",male,21.00000,2,0,A/4 48871,24.1500,,S,0
