In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Plot style, and pandas display options: None removes the row/column display limits.
plt.style.use("fivethirtyeight")
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [None]:
# Interactive help lookup (IPython `?` syntax); nothing later in the notebook depends on it.
pd.DataFrame?

## 数据预处理
### 数据初步分析

In [None]:
# Load the train and test splits from the local data directory.
data_train = pd.read_csv("./data/titanic/train.csv")
data_test = pd.read_csv("./data/titanic/test.csv")
# Both frames in one list, so a later cell can apply the same in-place clean-up to each.
data_set = [data_train, data_test]

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() only appends a stray "None" line.
data_train.info()
# Share of each target class among the training rows.
print(data_train['Survived'].value_counts()/len(data_train))
data_train.head()

In [None]:
# Column dtypes and non-null counts of the test split.
data_test.info()

- PassengerId : 乘客的序号, 类别型变量
- Pclass : 应该是舱位等级, 可能跟身份地位挂钩, 类别型变量
- Name : 姓名, 类别型变量
- Age : 数值型连续变量
- SibSp : 兄弟姐妹的数量, 类别
- Parch : 父母或者孩子的数量, 类别
- Ticket : 票的编号
- Fare : 票价
- Cabin : 船舱号, 大量缺失
- Embarked : 类别型变量 S, C, Q
- Survived : 目标变量, 二分类

接下来查看变量的分布情况:

In [None]:
# One histogram panel per column: the target first, then every candidate feature.
data_col = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
plt.figure(figsize=(12, 12))
for panel, feature in enumerate(data_col, start=1):
    plt.subplot(3, 3, panel)
    data_train[feature].hist(bins=35, color='blue')
    plt.xlabel(feature)
    plt.ylabel('frequent')

查看变量与目标变量之间的关系

In [None]:
# Per column, overlay the histogram of Survived == 0 (blue) and Survived == 1 (red).
plt.figure(figsize=(15, 15))
outcome_colors = ((0, 'blue'), (1, 'red'))
for panel, feature in enumerate(data_col, start=1):
    plt.subplot(3, 3, panel)
    for outcome, shade in outcome_colors:
        subset = data_train[data_train.Survived == outcome][feature]
        subset.hist(bins=35, color=shade, alpha=0.5,
                    label=f'{feature} with Survive == {outcome}')
    plt.legend()
    plt.xlabel(feature)
    plt.ylabel('Frequent')

- Pclass 舱位的等级是一个影响因素, 可以看到Pclass的数值从小到大,存活率逐渐降低.
- Sex 性别是一个影响因素, Lady First
- Age 年龄越小或者越大应该有较大的存活率
- SibSp 太多会团灭
- Parch 为1或者2时有较高的存活率
- Fare 票价, 有一个票价死亡率很高
- Embarked 从C港登船的存活率高于其他港

### 缺失变量的分析

In [None]:
# Bar chart of the number of missing values per column, one figure per split.
for frame, caption in ((data_train, 'Train Set Null Count'),
                       (data_test, 'Test Set Null Count')):
    frame.isnull().sum().plot(kind='bar')
    plt.xlabel(caption)
    plt.show()

首先分析Cabin, Cabin是一个类别型变量,并且应该有很多种类,所以无法进行填值.

根据Cabin是否缺失可以构造出两个变量, 一种是有Cabin数据的, 另一种是没有Cabin.

然后分析它们与Survived之间是否有联系.

In [None]:
# Number of distinct Cabin entries; unique() reports a missing value as one entry of its own.
len(data_train['Cabin'].unique())

In [None]:
# Two complementary 0/1 flags: whether the Cabin entry is a string or not.
has_cabin = data_train.Cabin.apply(lambda value: int(type(value) == str))
data_train['Cabin_Yes'] = has_cabin
data_train['Cabin_No'] = 1 - has_cabin

In [None]:
# Preview the first rows, now including the two cabin flags.
data_train.head()

In [None]:
# Histogram of the Cabin_Yes flag, survivors in red and non-survivors in blue.
plt.figure(figsize=(5, 5))
for outcome, shade in ((1, 'red'), (0, 'blue')):
    flag = data_train[data_train.Survived == outcome]['Cabin_Yes']
    flag.hist(bins=10, color=shade, alpha=0.5, label=f'Survived = {outcome}')
plt.legend()

可以直观的看出, 有船舱的存活率要高一些

票的等级是否和有无船舱有联系?

In [None]:
# Pclass distribution of passengers with a cabin entry (red) and without one (blue).
plt.figure(figsize=(5, 5))
for flag_value, shade, caption in ((1, 'red', 'Cabin Yes'), (0, 'blue', 'Cabin No')):
    classes = data_train[data_train.Cabin_Yes == flag_value]['Pclass']
    classes.hist(bins=10, color=shade, label=caption, alpha=0.5)
plt.legend()
plt.xlabel('Pclass')

每个等级都或多或少有缺失船舱的现象, 每个等级也均有船舱, 这个数据应该是一个干扰数据,
因为有船舱的数据的乘客大多是Pclass = 1的乘客, 因此显得存活率比较高. 
所以, 我们将船舱数据给丢弃.

In [None]:
# Remove the temporary cabin flags from the training frame (only those still present).
helper_cols = [name for name in ('Cabin_Yes', 'Cabin_No') if name in data_train.columns]
for name in helper_cols:
    data_train.drop(columns=[name], inplace=True)

# Drop Cabin from the training and the test frame alike.
for frame in data_set:
    if 'Cabin' not in frame.columns:
        continue
    frame.drop(columns=['Cabin'], inplace=True)

print(data_train.columns, data_test.columns)

接下来要拟合的数据是Age数据

我们将数据用Name中的title分组, 然后再用每个title的平均值,对缺失年龄数据进行填值.

In [None]:
import re
from collections import defaultdict
# Occurrence count of every title in the training names. A defaultdict is used so
# that looking up a title that never occurred yields 0.
TitleMap = defaultdict(int)
# Mean age per title; populated by a later cell.
TitleAge = defaultdict(int)

# The title is the text between the comma and the first following period,
# e.g. "Surname, Mr. Given Name" -> "Mr".
title_pattern = re.compile(".*[,](.*?)[.].*")

title_list = []
for name in data_train['Name']:
    found = title_pattern.match(name)
    if found is None:
        # Report the offending value instead of failing with an opaque
        # AttributeError on `None.group`.
        raise ValueError(f"Cannot extract a title from name: {name!r}")
    title = found.group(1).strip()
    TitleMap[title] += 1
    title_list.append(title)

# Build the Title column on data_train's own index so the column-wise concat
# aligns row by row whatever that index is.
df_title = pd.DataFrame({'Title': title_list}, index=data_train.index)
train_dummies = pd.concat([data_train, df_title], axis = 1)

In [None]:
# Mean Age per title, computed once per title with a group-by instead of
# re-filtering the whole frame for every row.
TitleAge.update(train_dummies.groupby('Title')['Age'].mean().to_dict())

In [None]:
# Fill each missing Age with the mean Age of the passenger's title. The lookup is
# vectorised and aligned on the index, so it does not depend on 0..n-1 row labels.
# dict(TitleAge) is used so that the defaultdict's default of 0 is never applied.
age_by_title = train_dummies['Title'].map(dict(TitleAge))
train_dummies['Age'] = train_dummies['Age'].fillna(age_by_title)

In [None]:
# Age distribution after filling the missing values (train_dummies).
train_dummies['Age'].hist(bins = 35)

In [None]:
# Age distribution of data_train, for comparison with the imputed histogram.
data_train['Age'].hist(bins = 35)

In [None]:
# Extract the title (text between the comma and the first following period)
# from every test-set name, with the same pattern as for the training set.
title_pattern = re.compile(".*[,](.*?)[.].*")

test_title = []
for name in data_test['Name']:
    found = title_pattern.match(name)
    if found is None:
        # Report the offending value instead of failing with an opaque
        # AttributeError on `None.group`.
        raise ValueError(f"Cannot extract a title from name: {name!r}")
    test_title.append(found.group(1).strip())

# Build the Title column on data_test's own index so the concat aligns row by row.
df_title = pd.DataFrame({'Title': test_title}, index=data_test.index)
test_dummies = pd.concat([data_test, df_title], axis = 1)

In [None]:
# Fill missing test-set ages with the training mean age of the passenger's title.
# Titles without a usable training mean (unseen in training, or NaN) fall back to
# the mean Age of train_dummies, so no NaN survives to break the model later on.
known_title_ages = {t: age for t, age in TitleAge.items() if pd.notna(age)}
fallback_age = train_dummies['Age'].mean()
age_by_title = test_dummies['Title'].map(known_title_ages).fillna(fallback_age)
test_dummies['Age'] = test_dummies['Age'].fillna(age_by_title)

In [None]:
# From here on data_train / data_test refer to the frames with the Title column
# and the filled-in ages (same objects as train_dummies / test_dummies, not copies).
data_train = train_dummies
data_test = test_dummies

在训练集中港口还有缺失数据, 我们用常用港口'S' 来填补
在测试集中缺失了一个票价, 我们用平均票价来填补

In [None]:
# Fill the missing ports with 'S'. The result is assigned back explicitly: an
# in-place fillna on a column selection acts on a temporary object under pandas
# Copy-on-Write and would leave data_train unchanged.
data_train['Embarked'] = data_train['Embarked'].fillna('S')

In [None]:
# Remaining missing values per column in the training frame.
data_train.isnull().sum()

测试集中还缺失了一个票价数据, 我们根据训练集中不同Pclass对应的平均票价来填充数据

In [None]:
# Mean training-set Fare for each of the passenger classes 1, 2 and 3.
MeanFare = defaultdict(int)
MeanFare.update({
    pclass: data_train[data_train.Pclass == pclass]['Fare'].mean()
    for pclass in (1, 2, 3)
})

In [None]:
# Row labels of the test passengers without a Fare; each one receives the mean
# Fare of its passenger class.
fare_na = data_test.index[data_test['Fare'].isna()]
for row_label in fare_na:
    passenger_class = data_test.loc[row_label, 'Pclass']
    data_test.loc[row_label, 'Fare'] = MeanFare[passenger_class]

In [None]:
# Remaining missing values per column in the test frame.
data_test.isnull().sum()

至此, 缺失数据全部处理完毕
### 数据深入分析
首先丢弃一些无用项

In [None]:
# Drop PassengerId, Name and Ticket from the training frame.
identifier_cols = ['PassengerId', 'Name', 'Ticket']
train_dummies = data_train.drop(identifier_cols, axis=1)
train_dummies.head()

In [None]:
# Drop PassengerId, Name and Ticket from the test frame as well.
test_dummies = data_test.drop(['PassengerId', 'Name', 'Ticket'], axis=1)
test_dummies.head()

展开 Pclass 以及 Embarked数据

In [None]:
# One-hot encode Pclass, Embarked and Sex in both splits.
categorical_cols = ['Pclass', 'Embarked', 'Sex']
train_onehot, test_onehot = (
    pd.get_dummies(frame, columns=categorical_cols)
    for frame in (train_dummies, test_dummies)
)

In [None]:
# Preview the one-hot encoded training frame.
train_onehot.head()

In [None]:
# Pairwise correlation of the numeric / boolean columns. train_onehot still holds
# the string column 'Title'; it is excluded explicitly because DataFrame.corr()
# raises on non-numeric columns in pandas >= 2.0.
corr_matrix = train_onehot.select_dtypes(include=['number', 'bool']).corr()
plt.figure(figsize=(15, 15))
sns.heatmap(corr_matrix, annot = True, fmt=".2f", cmap="YlGnBu")

In [None]:
# Correlation of every column with Survived, drawn as a bar chart.
plt.figure(figsize=(12, 8))
survived_corr = corr_matrix['Survived']
survived_corr.plot.bar()

性别, 舱位等级, 登船港口, 票价 对是否存活都有一定的影响力

接下来的问题是, 怎么处理Age 以及 SibSp Parch 这三个数据?

首先分析Age和Survived之间的关系:

In [None]:
# Age histogram, survivors in red and non-survivors in blue.
plt.figure(figsize=(10, 10))
for outcome, shade in ((1, 'red'), (0, 'blue')):
    ages = train_onehot[train_onehot.Survived == outcome]['Age']
    ages.hist(bins=35, color=shade, alpha=0.5, label=f'Survived = {outcome}')
plt.legend()
plt.xlabel('Age')
plt.ylabel('freq')

小孩子的存活率貌似较高, 30-40岁的存活率较低, 可能是青年男子

方案一 : 将年龄分组 (0, 15] child and not child (15, ...)

方案二 : 标准化, 首先测试这个方案的效果.

接下来处理 SibSp 和 Parch

In [None]:
# One figure per family-count column, Survived == 0 in blue and Survived == 1 in red.
for feature in ('SibSp', 'Parch'):
    plt.figure(figsize=(8, 8))
    for outcome, shade in ((0, 'blue'), (1, 'red')):
        counts = train_onehot[train_onehot.Survived == outcome][feature]
        counts.hist(bins=35, color=shade, alpha=0.5,
                    label=f'{feature} with Survive == {outcome}')
    plt.legend()
    plt.xlabel(feature)
    plt.ylabel('Frequent')
    plt.show()

将 SibSp 与 Parch 相加为FamilySize

In [None]:
# FamilySize = SibSp + Parch, computed as a column-wise sum rather than a
# row-by-row .loc loop (which also required the index to be exactly 0..n-1).
family_size = train_onehot['SibSp'] + train_onehot['Parch']
train_family = family_size.tolist()

# Keeps train_onehot's index, so a later column-wise concat aligns row by row.
df_family = family_size.to_frame(name='FamilySize')

In [None]:
# Append the FamilySize column to the one-hot encoded training frame.
train_dummies = pd.concat([train_onehot, df_family], axis = 1)

In [None]:
# FamilySize histogram, survivors in red and non-survivors in blue.
plt.figure(figsize=(10, 10))
for outcome, shade in ((1, 'red'), (0, 'blue')):
    sizes = train_dummies[train_dummies.Survived == outcome]['FamilySize']
    sizes.hist(bins=35, color=shade, alpha=0.5, label=f'Survived = {outcome}')
plt.legend()
plt.xlabel('FamilySize')
plt.ylabel('freq')

尝试将FamilySize 分组
alone 0
Moderate 1-3
Large > 3

In [None]:
def _family_bucket(size):
    """Map a family size to 'alone' (0), 'Moderate' (up to 3) or 'Large' (otherwise)."""
    if size == 0:
        return 'alone'
    if size <= 3:
        return 'Moderate'
    return 'Large'


train_dummies['FamilySize'] = train_dummies['FamilySize'].map(_family_bucket)

In [None]:
# One-hot encode the FamilySize column.
train_dummies = pd.get_dummies(data=train_dummies, columns = ['FamilySize'])

In [None]:
# Preview the training frame with the FamilySize dummies.
train_dummies.head()

对测试集做同样的处理

In [None]:
# FamilySize = SibSp + Parch for the test set, as a column-wise sum rather than
# a row-by-row .loc loop (which also required the index to be exactly 0..n-1).
family_size = test_onehot['SibSp'] + test_onehot['Parch']
test_family = family_size.tolist()

# Keeps test_onehot's index so the concat below aligns row by row.
df_family_test = family_size.to_frame(name='FamilySize')

test_dummies = pd.concat([test_onehot, df_family_test], axis = 1)

# Same buckets as for the training set: 0 -> alone, up to 3 -> Moderate, otherwise Large.
test_dummies['FamilySize'] = test_dummies['FamilySize'].map(lambda x: 'alone' if x == 0 else ('Moderate' if x <= 3 else 'Large'))

test_dummies = pd.get_dummies(data=test_dummies, columns = ['FamilySize'])

In [None]:
# Model inputs: drop the raw family counts and the Title column from both splits.
unused_cols = ['SibSp', 'Parch', 'Title']
data_train, data_test = (
    frame.drop(columns=unused_cols) for frame in (train_dummies, test_dummies)
)

In [None]:
# Preview the final test features.
data_test.head()

性别, 票的等级, 票价 一定程度上和Title有挂钩, 所以这里我们为了防止模型臃肿, 将Title丢弃

至此, 我们已经完成了数据预处理步骤, 接下来将数据投入模型训练.

## 训练
首先将票价以及年龄进行标准化

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise Age and Fare; the scaler is fitted on the training frame.
scaled_cols = ['Age', 'Fare']
sc = StandardScaler()
sc.fit(data_train[scaled_cols])
data_train[scaled_cols] = sc.transform(data_train[scaled_cols])

In [None]:
# Apply the scaler fitted on the training frame to the test frame (transform only, no refit).
data_test[['Age', 'Fare']] = sc.transform(data_test[['Age', 'Fare']])

In [None]:
# Mean of the scaled test-set Fare.
data_test['Fare'].mean()

### Baseline

In [None]:
from sklearn.linear_model import LogisticRegression
# Split the training frame into target and features; the test frame has no target column.
y_train = data_train['Survived']
X_train = data_train.drop('Survived', axis=1)
X_test = data_test
LR = LogisticRegression()

In [None]:
# Fit the baseline logistic regression on the training features.
LR.fit(X_train, y_train)

In [None]:
# Predicted labels for the test set.
pred = LR.predict(X_test)

In [None]:
# Fraction of test passengers predicted as survivors.
(pred == 1).mean()

In [None]:
# PassengerId was dropped from the feature frames, so re-read it from the test CSV
# for the submission file.
Test_passengerId = pd.read_csv('./data/titanic/test.csv')['PassengerId']

In [None]:
# Write the submission file: one row per test passenger with the predicted label.
predicted_labels = pred.astype(np.int32)
result = pd.DataFrame({
    'PassengerId': Test_passengerId,
    'Survived': predicted_labels,
})
result.to_csv("./data/titanic/gender_submission.csv", index=False)

In [None]:
# Logistic-regression coefficients, one bar per feature column.
feature_names = list(X_train.columns)
coef = pd.DataFrame(LR.coef_, columns=feature_names)
coef.plot.bar(figsize=(12, 12))
plt.show()

Baseline取得了0.75119的分数

还不错毕竟只是随便用LR拟合的模型


### 在训练集中, 我们进行交叉验证

In [None]:
from sklearn.model_selection import GridSearchCV

# Grid over the regularisation type and strength of the logistic regression,
# scored with 5-fold cross-validation.
params = dict(
    penalty=['l1', 'l2'],
    C=[0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10.0],
    solver=['liblinear'],
)

grid = GridSearchCV(estimator=LR, param_grid=params, cv=5)
grid.fit(X_train, y_train)

In [None]:
# Best cross-validated score and the parameter combination that achieved it.
print(grid.best_score_, grid.best_params_)

In [None]:
# Apply the best grid-search parameters to LR and refit it on the full training set.
best_lr_params = grid.best_params_
LR.set_params(**best_lr_params)
LR.fit(X=X_train, y=y_train)

In [None]:
# Predict with the tuned model and show the fraction predicted as survivors.
pred = LR.predict(X_test)
(pred == 1).mean()

In [None]:
# Overwrite the submission file with the tuned model's predictions.
predicted_labels = pred.astype(np.int32)
result = pd.DataFrame({
    'PassengerId': Test_passengerId,
    'Survived': predicted_labels,
})
result.to_csv("./data/titanic/gender_submission.csv", index=False)

0.75598

略有进步

引入多种分类器 查看效果,

树模型 : 决策树, Adaboost, GBDT, XGBoost, RF

SVM, LR, KNN, GPC, NBC

In [None]:
class EstimatorSelectionHelper:
    """Run one GridSearchCV per estimator and collect the results side by side.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator instance.
    params : dict
        Maps the same names to the parameter grid passed to GridSearchCV.

    Raises
    ------
    ValueError
        If an estimator in ``models`` has no entry in ``params``.
    """

    def __init__(self, models, params):
        # Listed in the insertion order of `models`, so the message is deterministic.
        missing_params = [name for name in models if name not in params]
        if missing_params:
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        # name -> GridSearchCV object, filled by fit()
        self.grid_searches = {}
        # name -> {'score': best_score_, 'params': best_params_}, filled by fit()
        self.best = {}

    def fit(self, X, y, cv=3, n_jobs=3, verbose=1, scoring=None, refit=False):
        """Run a grid search for every registered estimator.

        The keyword arguments are forwarded unchanged to GridSearchCV. For each
        estimator the best score / parameters are stored in ``self.best`` and the
        search object itself in ``self.grid_searches``.
        """
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            model = self.models[key]
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit,
                              return_train_score=True)
            gs.fit(X, y)
            self.best[key] = {'score': gs.best_score_, 'params': gs.best_params_}
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score'):
        """Return one row per (estimator, parameter combination).

        Each row holds the min / mean / max / std of the per-split test scores
        plus the parameter values; rows are sorted by ``sort_by``, descending.

        Raises
        ------
        ValueError
            If no grid search has been stored yet, or a stored search has no
            ``split<i>_test_score`` entries in ``cv_results_``.
        """
        if not self.grid_searches:
            raise ValueError("No grid search results available; call fit() first.")

        def row(key, scores, params):
            d = {
                 'estimator': key,
                 'min_score': min(scores),
                 'max_score': max(scores),
                 'mean_score': np.mean(scores),
                 'std_score': np.std(scores),
            }
            return pd.Series({**params, **d})

        rows = []
        for k, gs in self.grid_searches.items():
            print(k)
            results = gs.cv_results_
            params = results['params']

            # Discover the per-split score columns from cv_results_ itself rather
            # than from `gs.cv`: `cv` may be None, a splitter object or an
            # iterable, in which case range(gs.cv) cannot be used.
            scores = []
            split = 0
            while "split{}_test_score".format(split) in results:
                r = np.asarray(results["split{}_test_score".format(split)])
                scores.append(r.reshape(len(params), 1))
                split += 1
            if not scores:
                raise ValueError("No per-split test scores found for %s." % k)

            # Shape (n_parameter_combinations, n_splits).
            all_scores = np.hstack(scores)
            for p, s in zip(params, all_scores):
                rows.append(row(k, s, p))

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        # Summary columns first, then every parameter column.
        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]

    def best_params(self):
        """Return the dict of best score / parameters per estimator (``self.best``)."""
        return self.best

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

In [None]:
# Candidate classifiers, keyed by display name.
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'GradientBoost': GradientBoostingClassifier(),
    'RandomForest': RandomForestClassifier(),
    'XGBoost': XGBClassifier(),
    'SVC': SVC(),
    'KNN': KNeighborsClassifier(),
    'GPC': GaussianProcessClassifier(),
    'NB': GaussianNB(),
}

# Hyper-parameter grid for each classifier above (same keys as `models`).
params = {
    'Logistic Regression': {
        'penalty': ['l1', 'l2'],
        'C': [0.5, 1.0, 2.0, 5.0, 10.0],
        'solver': ['liblinear'],
    },
    'Decision Tree': {
        'criterion': ['gini', 'entropy'],
        'max_depth': [2, 3, 4, 5],
    },
    'AdaBoost': {
        'n_estimators': [25, 50, 75, 100],
        'learning_rate': [0.5, 1.0],
    },
    'GradientBoost': {
        'n_estimators': [50, 75, 100, 125],
        'learning_rate': [0.1, 0.3, 1.0],
        'max_depth': [2, 3, 4, 5],
    },
    'RandomForest': {
        'n_estimators': [25, 50, 75, 100],
        'criterion': ['gini', 'entropy'],
        'max_depth': [2, 3, 4, 5],
    },
    'XGBoost': {
        'n_estimators': [25, 50, 75, 100],
        'learning_rate': [0.1, 0.3, 1.0],
        'max_depth': [2, 3, 4, 5],
    },
    'SVC': [
        {'kernel': ['rbf'], 'C': [1.0, 2.0], 'gamma': [0.001, 0.005]},
        {'kernel': ['linear'], 'C': [1.0, 2.0]},
    ],
    'KNN': {'n_neighbors': [5, 10, 20, 30]},
    'GPC': {'n_restarts_optimizer': [5]},
    'NB': {},
}

In [None]:
# Grid-search every candidate with 5-fold CV and show all results sorted by mean score.
Helper = EstimatorSelectionHelper(models, params)
Helper.fit(X_train, y_train, cv=5)
Helper.score_summary(sort_by='mean_score')

In [None]:
# Best score and parameter set per estimator name, as collected by Helper.fit().
best_params = Helper.best
best_params

丢弃分类器 NB KNN Decision Tree

保留 LR, AdaB, GBDT, XGB, RF, SVC, GPC 共 7 个 分类器

给它们分配上最优超参数

In [None]:
# Fresh instances of the seven retained classifiers, as (name, estimator) pairs.
best_models = [
    ('RandomForest', RandomForestClassifier()),
    ('GradientBoost', GradientBoostingClassifier()),
    ('AdaBoost', AdaBoostClassifier()),
    ('XGBoost', XGBClassifier()),
    ('SVC', SVC()),
    ('Logistic Regression', LogisticRegression()),
    ('GPC', GaussianProcessClassifier()),
]

# Give every estimator the best parameters found for its name by the grid search.
for name, estimator in best_models:
    tuned = best_params[name]['params']
    estimator.set_params(**tuned)

In [None]:
from sklearn.ensemble import VotingClassifier

# Hard-voting ensemble over the tuned classifiers.
vote_hard = VotingClassifier(estimators = best_models , voting = 'hard')

In [None]:
# Fit the hard-voting ensemble and predict the test set. Only `vote_hard` is
# created in this notebook (no soft-voting ensemble is defined anywhere), so it
# is the only ensemble fitted here.
vote_hard.fit(X_train, y_train)
vote_hard_pred = vote_hard.predict(X_test)

In [None]:
# Fraction of test passengers the ensemble predicts as survivors.
(vote_hard_pred == 1).mean()

### 分析分类结果

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Evaluate the ensemble on the data it was trained on.
pred_train = vote_hard.predict(X_train)
cr = classification_report(y_true=y_train, y_pred=pred_train)
print(cr)
cm = confusion_matrix(y_true=y_train, y_pred=pred_train)
cm

In [None]:
# Overwrite the submission file with the ensemble's predictions.
predicted_labels = vote_hard_pred.astype(np.int32)
result = pd.DataFrame({
    'PassengerId': Test_passengerId,
    'Survived': predicted_labels,
})
result.to_csv("./data/titanic/gender_submission.csv", index=False)

结果是0.79904, 有一定的进步

### 绘制学习曲线

In [None]:
from sklearn.model_selection import learning_curve

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1,
                        train_sizes=np.linspace(.1, 1., 10), verbose=0, plot=True):
    """Compute the learning curve of ``estimator`` and optionally plot it.

    Parameters
    ----------
    estimator, X, y, cv, n_jobs, train_sizes, verbose
        Forwarded unchanged to ``learning_curve``.
    title : str
        Figure title (used only when ``plot`` is true).
    ylim : tuple, optional
        Unpacked into ``plt.ylim`` when given.
    plot : bool
        When false, only the summary values are computed.

    Returns
    -------
    (midpoint, diff)
        For the largest training size, with ``upper = train mean + train std``
        and ``lower = validation mean - validation std``:
        ``midpoint = (upper + lower) / 2`` and ``diff = upper - lower``.
    """
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, verbose=verbose)

    # Mean and spread over the CV folds, one value per training size.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    if plot:
        plt.figure()
        plt.title(title)
        if ylim is not None:
            plt.ylim(*ylim)
        plt.xlabel("Number of Train Samples")
        plt.ylabel("Score")
        # The y-axis is left in its normal orientation: inverting it before the
        # plotting calls and again after them cancels out.
        plt.grid()

        # Shaded band: +/- one standard deviation around each mean curve.
        plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1, color="b")
        plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1, color="r")
        plt.plot(train_sizes, train_scores_mean, 'o-', color="b", label="Score in Train Set")
        plt.plot(train_sizes, test_scores_mean, 'o-', color="r", label="Score in Val Set")

        plt.legend(loc="best")

        plt.draw()

    upper_train = train_scores_mean[-1] + train_scores_std[-1]
    lower_val = test_scores_mean[-1] - test_scores_std[-1]
    midpoint = (upper_train + lower_val) / 2
    diff = upper_train - lower_val
    return midpoint, diff

# Learning curve of the voting ensemble on the training data (default cv and train sizes).
plot_learning_curve(vote_hard, "Learning Curve", X_train, y_train)

 从学习曲线来看, 结果有一定的偏差, 应该尝试挖掘更多的代表性特征.

### 分析Badcase

In [None]:
# Training frame without SibSp, Parch and Title; Survived is kept for the bad-case analysis.
tmp_data = train_dummies.drop(columns = ['SibSp', 'Parch', 'Title'])

In [None]:
# Preview the frame used for the bad-case analysis.
tmp_data.head()

In [None]:
# Training rows whose true label differs from the ensemble's training-set prediction.
Badcase = tmp_data[tmp_data.Survived != pred_train]

### Badcase的数据分布

In [None]:
# One panel per column of the misclassified rows, survivors in red and non-survivors in blue.
plt.figure(figsize=(20, 20))
for panel, feature in enumerate(Badcase.columns, start=1):
    plt.subplot(5, 3, panel)
    for outcome, shade in ((1, 'red'), (0, 'blue')):
        values = Badcase[Badcase.Survived == outcome][feature]
        values.hist(bins=35, color=shade, label=f'Survived = {outcome}', alpha=0.5)
    plt.legend()
    plt.xlabel(feature)
In [None]:
# Correlation matrix of the misclassified rows, drawn as an annotated heatmap.
plt.figure(figsize=(10, 10))
bad_corr = Badcase.corr()
heatmap_style = {'annot': True, 'fmt': ".2f", 'cmap': "YlGnBu"}
sns.heatmap(bad_corr, **heatmap_style)

In [None]:
# Correlation with Survived: misclassified rows (red) against all training rows (blue).
tmp_corr = tmp_data.corr()
plt.figure(figsize=(12, 8))
for corr_frame, shade, caption in ((bad_corr, 'red', 'Bad_case'),
                                   (tmp_corr, 'blue', 'All_case')):
    corr_frame['Survived'].plot(kind='bar', color=shade, label=caption, alpha=0.5)
plt.title('Bad Case Vs All Case')
plt.legend()

比较反常的地方是

- Age属性, Age与存活正相关
- Embarked属性很诡异, 感觉可以将其去掉
- Sex_male与Sex_female完全倒了过来, 训练集的特征 对 sex_male不利
- FamilySize也是相反的情况

感觉要构造一些新的特征, 或者要对特征做一下特殊的处理.

首先, 我们将港口数据给去除

然后中年男子也有存活率, 所以我们将年龄分组, 分为青中老

Pclass 基本符合, 故不做处理

有大量的存活男性被分错, 稍后具体分析一下原因.

In [None]:
# Feature set v1: the same features without the three Embarked dummies.
embarked_cols = ['Embarked_C', 'Embarked_Q', 'Embarked_S']
X_train_v1 = X_train.drop(embarked_cols, axis=1)
X_test_v1 = X_test.drop(embarked_cols, axis=1)
X_train_v1.head()

### 去掉Embarked特征

结果反而变差

In [None]:
# Refit the ensemble on feature set v1 and report its training-set metrics.
vote_hard.fit(X=X_train_v1, y=y_train)
pred_v1 = vote_hard.predict(X_train_v1)
cr = classification_report(y_true=y_train, y_pred=pred_v1)
print(cr)

In [None]:
# Confusion matrix of the v1 predictions on the training set.
cm = confusion_matrix(y_pred=pred_v1, y_true=y_train)
cm

### 年龄离散化分组

In [None]:
# Undo the standardisation of Age and Fare so the age thresholds below apply to the
# original values. NOTE(review): not idempotent - running this cell twice applies
# the inverse transform twice.
X_train_v1[['Age', 'Fare']] = sc.inverse_transform(X_train_v1[['Age', 'Fare']])
X_train_v1.head()

In [None]:
# Bucket Age into Child (<= 15), Young (<= 30), Middle (<= 50) and Old (everything
# else). np.select takes the first matching condition, like the if/elif chain.
ages = X_train_v1['Age']
Age_map = np.select(
    [ages <= 15, ages <= 30, ages <= 50],
    ['Child', 'Young', 'Middle'],
    default='Old',
).tolist()

Age_bin = pd.DataFrame({'Age_bin': Age_map})

X_train_v1 = pd.concat([X_train_v1, Age_bin], axis=1)

In [None]:
# Preview feature set v1 with the new Age_bin column.
X_train_v1.head()

In [None]:
# Feature set v2: Age replaced by the one-hot Age_bin columns; Fare is taken from
# X_train again (the standardised values).
without_age = X_train_v1.drop(columns=['Age'])
X_train_v2 = pd.get_dummies(without_age, columns=['Age_bin'])
X_train_v2['Fare'] = X_train['Fare']

In [None]:
# Preview feature set v2.
X_train_v2.head()

In [None]:
# Refit the ensemble on feature set v2 and report its training-set metrics.
vote_hard.fit(X=X_train_v2, y=y_train)
pred_v2 = vote_hard.predict(X_train_v2)
cr = classification_report(y_true=y_train, y_pred=pred_v2)
print(cr)

In [None]:
# Confusion matrix of the v2 predictions on the training set.
confusion_matrix(y_true=y_train, y_pred=pred_v2)

相较于第一个结果 变差了不少

### 尝试加入新的特征 Title

In [None]:
# One bar chart per title: how many passengers with that title fall in each Survived class.
for title in TitleMap:
    with_title = train_dummies[train_dummies.Title == title]
    with_title['Survived'].value_counts().plot.bar()
    plt.xlabel(title)
    plt.show()

仔细看了看 Title, 前面的分析看来是错误的.

即使是相同的性别下, 不同的title存活率也不相同, 这个特征更加的详细, 那么增加这个特征看看效果.

In [None]:
import copy
# Work on an independent deep copy and merge every title that occurs at most
# 10 times in the training set into a single 'Rare' category.
train_backup = train_dummies.copy(deep=True)
train_backup['Title'] = train_dummies['Title'].map(
    lambda title: title if TitleMap[title] > 10 else 'Rare'
)

In [None]:
# Title categories that remain after the 'Rare' merge.
pd.unique(train_backup['Title'])

In [None]:
# One-hot encode the Title column of the training copy.
train_backup = pd.get_dummies(columns = ['Title'], data = train_backup)

In [None]:
# Preview the training copy with the Title dummies.
train_backup.head()

对测试集做同样的操作

In [None]:
# Same treatment for the test set. The counts come from the training set, and
# TitleMap is a defaultdict, so a title never seen in training counts as 0 and
# becomes 'Rare'.
test_backup = test_dummies.copy(deep=True)
test_backup['Title'] = test_dummies['Title'].map(
    lambda title: title if TitleMap[title] > 10 else 'Rare'
)
pd.unique(test_backup['Title'])

In [None]:
# One-hot encode the Title column of the test copy.
test_backup = pd.get_dummies(columns = ['Title'], data = test_backup)

In [None]:
# Drop the raw family counts from both copies. errors='ignore' keeps the cell
# re-runnable once the columns are gone, without a broad `except` that would
# also hide unrelated failures and could leave the two frames in different states.
raw_family_cols = ['SibSp', 'Parch']
train_backup = train_backup.drop(columns=raw_family_cols, errors='ignore')
test_backup = test_backup.drop(columns=raw_family_cols, errors='ignore')

In [None]:
# Features of the Title experiment (target column removed). The variable name,
# spelled "trian", is referenced by the cells below.
X_trian_backup = train_backup.drop(columns = ['Survived'])

In [None]:
# Refit the shared scaler `sc` on this feature set's Age/Fare and standardise them.
X_trian_backup[['Age', 'Fare']] = sc.fit_transform(X_trian_backup[['Age', 'Fare']])

In [None]:
# Standardise the test copy with the scaler as currently fitted (transform only).
test_backup[['Age', 'Fare']] = sc.transform(test_backup[['Age', 'Fare']])

In [None]:
# Refit the ensemble on the Title feature set and report its training-set metrics.
vote_hard.fit(X=X_trian_backup, y=y_train)
pred_backup = vote_hard.predict(X_trian_backup)
cr = classification_report(y_true=y_train, y_pred=pred_backup)
print(cr)

In [None]:
# Confusion matrix of the Title-feature predictions on the training set.
cm = confusion_matrix(y_pred = pred_backup, y_true = y_train)
cm

In [None]:
# NOTE(review): this cell repeats the fit / predict / report of the earlier cell
# on the same data; consider removing one of the two.
vote_hard.fit(X_trian_backup, y_train)
pred_backup = vote_hard.predict(X_trian_backup)
cr = classification_report(y_train, pred_backup)
print(cr)

In [None]:
# Confusion matrix of the repeated fit.
cm = confusion_matrix(y_pred = pred_backup, y_true = y_train)
cm

In [None]:
# Candidate classifiers for the Title feature set, keyed by display name.
models_backup = {
    'Logistic Regression' : LogisticRegression(),
    'Decision Tree' : DecisionTreeClassifier(),
    'AdaBoost' : AdaBoostClassifier(),
    'GradientBoost' : GradientBoostingClassifier(),
    'RandomForest' : RandomForestClassifier(),
    'XGBoost' : XGBClassifier(),
    'SVC' : SVC(),
    'KNN' : KNeighborsClassifier(),
    'GPC' : GaussianProcessClassifier(),
    'NB' : GaussianNB(),
}

# Hyper-parameter grid per classifier (same keys as `models_backup`).
params_backup = {
    'Logistic Regression' : { 'penalty' : ['l1', 'l2'], 'C' : [0.5, 1.0, 2.0, 5.0, 10.0], 'solver' : ['liblinear'],},
    'Decision Tree' : {'criterion' : ['gini', 'entropy'], 'max_depth' : [2,4,6], },
    'AdaBoost' : {'n_estimators' : [25, 50, 75,100], 'learning_rate' : [0.5, 1.0], },
    'GradientBoost' : {'n_estimators' : [25, 50, 75,100], 'learning_rate' : [0.1, 0.3, 1.0], 'max_depth' : [2, 4, 6]},
    'RandomForest' : {'n_estimators' : [25, 50, 75,100], 'criterion' : ['gini', 'entropy'], 'max_depth' : [2, 4, 6]},
    'XGBoost' : {'n_estimators' : [25, 50,75, 100], 'learning_rate' : [0.1, 0.3, 1.0], 'max_depth' : [2, 4, 6]},
    # gamma is searched for the rbf kernel only: the linear kernel does not use
    # it, so a gamma grid there merely repeats identical fits.
    'SVC' : [ {'kernel' : ['rbf'], 'C' : [1.0, 2.0], 'gamma': [0.001, 0.0001]},
             {'kernel' : ['linear'], 'C' : [1.0, 2.0] }],
    'KNN' : {'n_neighbors' : [5, 10, 20, 30]},
    'GPC' : {'n_restarts_optimizer' : [5]},
    'NB' : {},
}

In [None]:
# Grid-search the candidates on the Title feature set, using the estimators and
# grids defined for this experiment (models_backup / params_backup).
Helper_backup = EstimatorSelectionHelper(models_backup, params_backup)
Helper_backup.fit(X_trian_backup, y_train, cv=5)
Helper_backup.score_summary(sort_by='mean_score')

In [None]:
# Best score and parameter set per estimator name for the Title feature set.
best_params_backup = Helper_backup.best
best_params_backup

In [None]:
# Fresh instances of the classifiers used in the second ensemble, as (name, estimator) pairs.
best_models_backup = [
    ('Logistic Regression', LogisticRegression()),
    ('Decision Tree', DecisionTreeClassifier()),
    ('AdaBoost', AdaBoostClassifier()),
    ('GradientBoost', GradientBoostingClassifier()),
    ('RandomForest', RandomForestClassifier()),
    ('XGBoost', XGBClassifier()),
    ('SVC', SVC()),
]

# Give every estimator the best parameters found for its name on the Title feature set.
for name, estimator in best_models_backup:
    tuned = best_params_backup[name]['params']
    estimator.set_params(**tuned)

In [None]:
# Hard-voting ensemble over the classifiers tuned on the Title feature set.
vote_hard_backup = VotingClassifier(estimators = best_models_backup , voting = 'hard')

In [None]:
# Fit the second ensemble and report its training-set metrics.
vote_hard_backup.fit(X=X_trian_backup, y=y_train)
pred_backup = vote_hard_backup.predict(X_trian_backup)
cr = classification_report(y_train, pred_backup)
print(cr)

In [None]:
# Confusion matrix of the second ensemble on the training set.
cm = confusion_matrix(y_pred=pred_backup, y_true=y_train)
cm

In [None]:
# Test-set predictions of the second ensemble (reuses the name `pred_backup`, which
# held the training-set predictions until now).
pred_backup = vote_hard_backup.predict(test_backup)

In [None]:
# Fraction of test passengers the second ensemble predicts as survivors.
(pred_backup == 1).mean()

In [None]:
# Overwrite the submission file with the second ensemble's predictions.
predicted_labels = pred_backup.astype(np.int32)
result = pd.DataFrame({
    'PassengerId': Test_passengerId,
    'Survived': predicted_labels,
})
result.to_csv("./data/titanic/gender_submission.csv", index=False)