In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Widen pandas' display limits so wide/long frames and long cell values are shown in full.
for option, value in (
    ('display.max_columns', 80),
    ('display.max_rows', 1000),
    ('max_colwidth', 1000),
):
    pd.set_option(option, value)

In [None]:
# Read the three competition files from the working directory.
trainData = pd.read_csv("train.csv")
testData = pd.read_csv("test.csv")
submitData = pd.read_csv("gender_submission.csv")
# pop() hands back the target column and removes it from the feature frame in one step.
y_train = trainData.pop('Survived')

In [None]:
# Stack train rows on top of test rows (fresh 0..n-1 index) so every feature step is applied to both.
allData = pd.concat([trainData, testData], axis=0, ignore_index=True)

In [None]:
# Show column dtypes and non-null counts of the combined frame.
allData.info()

In [None]:
# One-hot encode Pclass (the original column is replaced by Pclass_* indicator columns).
allData = pd.get_dummies(allData, columns=['Pclass'])

In [None]:
# One-hot encode Sex.
allData = pd.get_dummies(allData, columns=['Sex'])

In [None]:
# Combined count of relatives aboard: SibSp plus Parch.
allData['SibSp_Parch'] = allData['SibSp'] + allData['Parch']

In [None]:
# Treat each distinct count as a category: one-hot encode SibSp, Parch and their sum.
allData = pd.get_dummies(allData, columns=['SibSp', 'Parch', 'SibSp_Parch'])

In [None]:
# One-hot encode Embarked; with get_dummies' defaults, rows with a missing value get all-zero indicators.
allData = pd.get_dummies(allData, columns=['Embarked'])

In [None]:
# Extract the honorific from Name: take the text after the last comma, then everything up to
# the first period, and trim whitespace (e.g. "Braund, Mr. Owen Harris" -> "Mr").
# expand=False makes str.extract return a Series; the default DataFrame result has no `.str`
# accessor, so the chained call would fail. Raw strings avoid the invalid "\." escape.
allData['NAME1'] = (
    allData['Name']
    .str.extract(r'.+,(.+)', expand=False)
    .str.extract(r'^(.+?)\.', expand=False)
    .str.strip()
)

In [None]:
# Preview the first rows to check the new NAME1 column.
allData.head()

In [None]:
# Collapse the raw honorifics into a few title groups.
# The previous version called replace() for the 'Mrs' and 'Miss' groups without inplace=True
# or assignment, so 'Mme', 'Ms' and 'Mlle' were never merged. A single mapping assigned back
# to the column applies every group.
# NOTE: this reduces the number of NAME1_* dummy columns produced later; any downstream code
# that hard-codes the total feature count must be adjusted accordingly.
title_groups = {
    'Officer': ['Capt', 'Col', 'Major', 'Dr', 'Rev'],
    'Royalty': ['Jonkheer', 'Don', 'Sir', 'the Countess', 'Dona', 'Lady'],
    'Mrs': ['Mme', 'Ms', 'Mrs'],
    'Miss': ['Mlle', 'Miss'],
}
title_map = {raw: group for group, raws in title_groups.items() for raw in raws}
# Titles not listed above (e.g. 'Mr', 'Master') are left unchanged.
allData['NAME1'] = allData['NAME1'].replace(title_map)

In [None]:
# One-hot encode the title column.
allData = pd.get_dummies(allData, columns=['NAME1'])

In [None]:
# Extract the surname from Name. Names have the form "Surname, Title. Given names", so the
# surname is the part before the first comma. The previous split('.')[1] returned the given
# names after the title, which contradicts the stated intent of grouping by family name.
# NOTE: this changes which Name2_new_* dummy columns are produced later; any downstream code
# that hard-codes the total feature count must be adjusted accordingly.
allData['NAME2'] = allData['Name'].str.split(',').str[0].str.strip()

In [None]:
# Count how often each NAME2 value occurs and attach that count to every row.
Name2_sum = (
    allData['NAME2']
    .value_counts()
    .rename_axis('NAME2')
    .reset_index(name='Name2_sum')
)
allData = allData.merge(Name2_sum, how='left', on='NAME2')

# Values that occur once are pooled under the label 'one'; repeated values keep their own label.
allData['Name2_new'] = allData['NAME2'].where(allData['Name2_sum'] > 1, 'one')
allData = allData.drop(columns='NAME2')

# One-hot encode the pooled labels (the numeric Name2_sum column is kept as a feature).
allData = pd.get_dummies(allData, columns=['Name2_new'])

In [None]:
# The raw Name text is no longer needed once NAME1/NAME2 features have been derived.
allData = allData.drop(columns='Name')

In [None]:
# Per the earlier info() output, Fare has no missing values in train and one in test; display that row.
allData.loc[allData['Fare'].isnull()]

In [None]:
# Fare is related to Pclass and Embarked, so look at the mean fare per (Pclass, Embarked) group
# in the training data to pick a fill value. trainData still holds the raw columns here.
trainData.groupby(by=['Pclass', 'Embarked']).Fare.mean()

In [None]:
# Fill the missing Fare with the training-set mean fare for Pclass == 3 and Embarked == 'S'
# (about 14.644), computed here instead of pasted in as a literal.
# The result is assigned back: fillna(inplace=True) on a column selection may act on a
# temporary copy and leave allData unchanged.
pclass3_s_mask = (trainData['Pclass'] == 3) & (trainData['Embarked'] == 'S')
pclass3_s_mean_fare = trainData.loc[pclass3_s_mask, 'Fare'].mean()
allData["Fare"] = allData["Fare"].fillna(pclass3_s_mean_fare)

In [None]:
# Preview the frame after the Fare fill.
allData.head()

In [None]:
# Take the first whitespace-separated token of each ticket as its prefix.
ticket_first_token = allData['Ticket'].str.split().str[0]
# A purely numeric token means the ticket has no letter prefix; mark those as missing.
allData['Ticket_Letter'] = ticket_first_token.mask(ticket_first_token.str.isnumeric())
allData = allData.drop(columns='Ticket')

In [None]:
# One-hot encode the ticket prefix, dropping the first category's indicator column.
allData = pd.get_dummies(allData, columns=['Ticket_Letter'], drop_first=True)

In [None]:
# Build a feature from whether Age is missing: 1.0 when missing, 0.0 when present.
allData['age_nan'] = allData['Age'].isnull().astype(float)
# Expand the flag into two indicator columns.
allData = pd.get_dummies(allData, columns=['age_nan'])

In [None]:
# Re-check dtypes and non-null counts before imputing Age.
allData.info()

### 通过建立模型预测缺失的Age特征

In [None]:
# Cabin has not been processed yet, so leave it out of the Age-imputation features.
missing_age = allData.drop(columns=['Cabin'])
# Rows with a known Age train the imputation model; rows without Age are the ones to predict.
age_is_known = missing_age['Age'].notnull()
missing_age_train = missing_age.loc[age_is_known]
missing_age_test = missing_age.loc[~age_is_known]

In [None]:
# Separate the target (Age) from the features for the imputation model.
missing_age_Y_train = missing_age_train['Age']
missing_age_X_train = missing_age_train.drop(columns='Age')
missing_age_X_test = missing_age_test.drop(columns='Age')

In [None]:
# Standardize the imputation features. The scaler is fitted only on rows with a known Age
# and the same transform is then applied to the rows to be predicted.
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
missing_age_X_train = ss.fit_transform(missing_age_X_train)
# Store the scaled matrix under missing_age_X_test, the name the prediction step reads.
# Previously it was written to missing_age_test, so the model trained on standardized
# features was asked to predict on the unscaled frame.
missing_age_X_test = ss.transform(missing_age_X_test)

In [None]:
# Fit a Bayesian ridge regression that predicts Age from the other features.
from sklearn.linear_model import BayesianRidge
bayes = BayesianRidge()
bayes.fit(missing_age_X_train, missing_age_Y_train)

In [None]:
# Fill the missing ages with the model's predictions.
# The predictions are assigned positionally, so this relies on missing_age_X_test holding the
# null-Age rows in the same order as they appear in allData.
allData.loc[(allData['Age'].isnull()), 'Age'] = bayes.predict(missing_age_X_test)

In [None]:
# Discretize Age into five bands: up to 10, 10-18, 18-30, 30-50, over 50.
# The outer edges are open-ended because Age now contains regression predictions, which can
# fall at or below 0 (or above 100); with finite outer edges pd.cut would turn those into NaN
# and the row would end up with no age band at all.
allData['Age'] = pd.cut(
    allData['Age'],
    bins=[-np.inf, 10, 18, 30, 50, np.inf],
    labels=[1, 2, 3, 4, 5],
)
# Expand the band into indicator columns.
allData = pd.get_dummies(allData, columns=['Age'])

In [None]:
# Cabin is missing for too many rows to use directly; keep only a missing/present flag
# (1.0 when missing, 0.0 when present).
allData['Cabin_nan'] = allData['Cabin'].isnull().astype(float)
# Expand the flag into indicator columns, then discard the raw Cabin text.
allData = pd.get_dummies(allData, columns=['Cabin_nan'])
allData = allData.drop(columns='Cabin')

In [None]:
# Number of training rows (used to split allData back into train and test).
len(y_train)

In [None]:
# Split the combined frame back into train and test. Train rows were concatenated first, so
# the boundary is the number of training labels (derived here rather than hard-coded as 891).
n_train_rows = len(y_train)
trainData = allData.iloc[:n_train_rows]
testData = allData.iloc[n_train_rows:]

> 线性模型需要使用标准化后的数据建模，而树类模型不需要对数据做标准化。
  处理时注意：标准化器只在训练集上 fit，然后用同一个标准化器分别 transform 训练集和测试集。

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training rows only and reuse it, unchanged, for the test rows.
ss2 = StandardScaler()
trainData_sd = ss2.fit_transform(trainData)
testData_sd = ss2.transform(testData)

# XGBOOST 生成新特征

In [None]:
# Keep copies of the engineered frames; the cells below overwrite trainData/testData with
# arrays that have extra leaf-index columns appended.
temp_train = trainData.copy()
temp_test = testData.copy()

In [None]:
from xgboost.sklearn import XGBClassifier

# Settings for the booster whose leaf indices are used as extra features.
xgb_params = dict(
    booster='gbtree',
    learning_rate=0.1,
    objective='binary:logitraw',   # alternative: binary:logistic
    gamma=0.05,
    subsample=0.4,
    reg_alpha=1e-05,
    n_estimators=50,
    metric=['auc', 'l2'],
    colsample_bytree=0.7,
    silent=1,
    nthread=4,
)
xgb = XGBClassifier(**xgb_params)
xgb.fit(temp_train.values, y_train)

# apply() gives, per sample, the index of the leaf reached in each tree; append these as columns.
new_feature = xgb.apply(temp_train.values)
trainData = np.concatenate((temp_train, new_feature), axis=1)

new_feature_test = xgb.apply(temp_test.values)
testData = np.concatenate((temp_test, new_feature_test), axis=1)

In [None]:
import lightgbm as lgb

# Settings for the LightGBM model whose leaf indices are used as extra features.
gbm = lgb.LGBMClassifier(learning_rate=0.1,
                   boosting_type='gbdt',
                   objective='binary',
                   n_estimators=100,
                   metric=['auc', 'l2'],
                   max_depth=7,
                   bagging_fraction=0.7,
                   is_unbalance=True)

gbm.fit(temp_train.values, y_train)
# LGBMClassifier.apply() is not available in current LightGBM releases; predict(pred_leaf=True)
# is the supported way to get the leaf index reached in every tree for each sample.
new_feature = gbm.predict(temp_train.values, pred_leaf=True)
trainData = np.hstack((trainData, new_feature))

new_feature_test = gbm.predict(temp_test.values, pred_leaf=True)
testData = np.hstack((testData, new_feature_test))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Settings for the random forest whose leaf indices are used as extra features.
rf_params = dict(
    n_estimators=200,
    min_samples_split=90,
    min_samples_leaf=3,
    max_depth=8,
    oob_score=True,
    criterion='gini',
)
rf = RandomForestClassifier(**rf_params)
rf.fit(temp_train.values, y_train)

# One leaf index per tree and sample, appended to the feature matrices built so far.
new_feature = rf.apply(temp_train.values)
trainData = np.concatenate((trainData, new_feature), axis=1)

new_feature_test = rf.apply(temp_test.values)
testData = np.concatenate((testData, new_feature_test), axis=1)

# Voting

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
# This import rebinds the name `xgb` to the xgboost module.
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import lightgbm as lgb

# Two logistic regressions that differ only in regularization strength.
lr1 = LogisticRegression(C=0.1, max_iter=1000)
lr2 = LogisticRegression(C=0.01, max_iter=1000)

xgb_model1 = xgb.XGBClassifier(
    max_depth=6,
    min_samples_leaf=3,
    n_estimators=2000,
    metric=['l2', 'auc'],
)

# Two forests that share every setting except the split criterion.
rf_shared = dict(n_estimators=200, min_samples_leaf=3, max_depth=8, oob_score=True)
rf1 = RandomForestClassifier(criterion='gini', **rf_shared)
rf2 = RandomForestClassifier(criterion='entropy', **rf_shared)

gbdt = GradientBoostingClassifier(
    learning_rate=0.01,
    min_samples_leaf=2,
    max_depth=6,
    n_estimators=200,
)

gbm = lgb.LGBMClassifier(
    boosting_type='gbdt',
    objective='binary',
    metric=['l2', 'auc'],
    num_leaves=100,
    min_data_in_leaf=100,
    learning_rate=0.02,
    bagging_fraction=0.8,
    bagging_freq=5,
    lambda_l1=0.4,
    lambda_l2=0.6,
    max_depth=6,
    is_unbalance=True,
)

# Soft voting: combine the seven models through their predicted class probabilities.
voting_members = [
    ('lr1', lr1),
    ('lr2', lr2),
    ('rf1', rf1),
    ('rf2', rf2),
    ('gbdt', gbdt),
    ('xgb1', xgb_model1),
    ('lgb', gbm),
]
vot = VotingClassifier(estimators=voting_members, voting='soft')
vot.fit(trainData, y_train)

In [None]:
# Write the voting ensemble's test predictions to voting.csv (without the index column).
submitData["Survived"] = vot.predict(testData)
submitData.to_csv('voting.csv', index=False)

# Bagging

In [None]:
from sklearn.ensemble import BaggingClassifier
# Bag three copies of the voting ensemble. The estimator is passed positionally because the
# keyword was renamed from `base_estimator` to `estimator` in newer scikit-learn releases;
# the positional form works with both names.
bagging = BaggingClassifier(vot, n_estimators=3)
bagging.fit(trainData, y_train)

In [None]:
# Write the bagged ensemble's test predictions to bagging.csv (without the index column).
submitData["Survived"] = bagging.predict(testData)
submitData.to_csv('bagging.csv', index=False)

# LightGBM

In [None]:
import lightgbm as lgb

# Hold out the rows from position 600 onward for validation and train on the rows before it.
# Previously the validation rows were also part of the training set, so early stopping was
# monitoring data the model had already been fitted on.
valid_start = 600
lgb_train = lgb.Dataset(trainData[:valid_start], y_train[:valid_start])
lgb_eval = lgb.Dataset(trainData[valid_start:], y_train[valid_start:], reference=lgb_train)

# Training configuration.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': ['auc', 'l2'],
    'num_leaves': 100,
    'min_data_in_leaf': 100,
    'learning_rate': 0.02,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'lambda_l1': 0.4,
    'lambda_l2': 0.6,
    'max_depth': 6,
    'verbose': 5,
    'is_unbalance': True
}

print('Start training...')
# Early stopping is requested through the callback API; the early_stopping_rounds keyword of
# lgb.train() is not accepted by newer LightGBM releases.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=8000,
                valid_sets=[lgb_eval],
                callbacks=[lgb.early_stopping(stopping_rounds=500)])

In [None]:
# Predict on the test set using the best iteration found during training.
pb = gbm.predict(testData, num_iteration=gbm.best_iteration)

In [None]:
# Threshold the predicted scores at 0.5 and write the whole column in one assignment.
# The previous chained form (`submitData["Survived"].loc[mask] = 1`) may modify a temporary
# copy, and it left rows with pb == 0.5 holding the previous model's prediction.
submitData["Survived"] = (pb > 0.5).astype(int)

In [None]:
# Mark rows whose predicted score is below 0.5 as not survived. A single .loc call on the
# frame replaces the chained `submitData["Survived"].loc[...]` assignment, which may modify
# a temporary copy instead of submitData.
submitData.loc[pb < 0.5, "Survived"] = 0

In [None]:
# Write the LightGBM predictions to lgb.csv (without the index column).
submitData.to_csv('lgb.csv', index=False)

# NN

In [None]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()
# A pandas Series has no reshape(); go through NumPy to get the (n_samples, 1) column the
# encoder expects.
y_column = np.asarray(y_train).reshape(-1, 1)
enc.fit(y_column)
# transform() returns a SciPy sparse matrix by default; convert to a dense array for Keras.
y_hot = enc.transform(y_column).toarray()


# Build the neural network.
from keras.models import Sequential
# Import all layers from keras.layers (the file already imports Dropout from there) instead
# of the legacy keras.layers.core module.
from keras.layers import Activation, Dense, Dropout
from keras.metrics import top_k_categorical_accuracy
from keras.callbacks import EarlyStopping
netfile = 'net.model'  # path the trained weights are saved to

def acc_top2(y_true, y_pred):
    """Top-2 categorical accuracy metric (defined for optional use; not passed to compile below)."""
    return top_k_categorical_accuracy(y_true, y_pred, k=2)

# Take the input width from the data instead of hard-coding it, so the network keeps working
# when the feature-engineering steps add or remove columns.
n_features = trainData_sd.shape[1]

# Layer widths are passed positionally (the legacy `output_dim` keyword is not accepted by
# newer Keras); only the first layer needs the input size, later layers infer it.
net = Sequential()
net.add(Dense(256, input_dim=n_features))
net.add(Activation('relu'))
net.add(Dense(512))
net.add(Activation('relu'))
net.add(Dense(1024))
net.add(Activation('relu'))
net.add(Dropout(0.3))
net.add(Dense(1024))
net.add(Activation('relu'))
net.add(Dropout(0.3))
net.add(Dense(1024))  # no activation after this layer, as in the original architecture
net.add(Dense(2))
net.add(Activation('softmax'))
net.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics=['accuracy'])
early_stopping = EarlyStopping(monitor='val_loss', patience=500, verbose=10)

# Hold out 20% of the training rows for validation. Validating on the training data itself
# made val_loss equal to the training loss, so early stopping could not detect overfitting.
net.fit(trainData_sd, y_hot, epochs=8000, batch_size=64, validation_split=0.2, callbacks=[early_stopping])
net.save_weights(netfile)  # saves the weights only, not the full model

In [None]:
# Predict class probabilities (softmax outputs) for the standardized test features.
pred = net.predict(testData_sd)

In [None]:
# For each row, take the column index with the largest probability: argsort orders indices by
# ascending value, so the last position of each row holds the index of the maximum.
pred = pred.argsort()[np.arange(len(pred)), -1]

In [None]:
# Write the network's predicted class indices to nn.csv (without the index column).
submitData["Survived"] = pred
submitData.to_csv('nn.csv', index=False)

# Blend

In [None]:
import numpy as np
# sklearn.cross_validation was removed from scikit-learn; StratifiedKFold lives in model_selection.
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
# KNeighborsClassifier is used in blend() but was never imported.
from sklearn.neighbors import KNeighborsClassifier
from xgboost.sklearn import XGBClassifier
import lightgbm as lgb

def blend(X, y, X_submission, n_folds):
    """Two-level blending: out-of-fold base-model probabilities feed a voting meta-classifier.

    Each base classifier is fitted once per stratified fold. Its class-1 probabilities on the
    held-out part of each fold fill one column of the level-2 training matrix, and the mean of
    its per-fold class-1 probabilities on X_submission fills the matching column of the
    level-2 test matrix. A hard-voting pair of logistic regressions is then fitted on the
    level-2 training matrix.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training features.
    y : array-like, shape (n_samples,)
        Binary training labels.
    X_submission : array-like, shape (n_submission, n_features)
        Features to predict.
    n_folds : int
        Number of stratified folds.

    Returns
    -------
    numpy.ndarray
        Predicted class label for each row of X_submission.
    """
    # Work on plain arrays so that fold indices are always positional, whatever container
    # (DataFrame, Series, ndarray) the caller passes in.
    X = np.asarray(X)
    y = np.asarray(y)
    X_submission = np.asarray(X_submission)

    skf = list(StratifiedKFold(n_splits=n_folds).split(X, y))

    # NOTE(review): min_samples_leaf and num_round are passed to XGBClassifier unchanged from
    # the original; they do not look like XGBoost parameters -- confirm.
    clfs = [RandomForestClassifier(n_estimators=200,min_samples_leaf=3,max_depth=8,oob_score=True, criterion='gini'),
            RandomForestClassifier(n_estimators=200,min_samples_leaf=3,max_depth=8,oob_score=True, criterion='entropy'),
            GradientBoostingClassifier(learning_rate=0.01,min_samples_leaf=2,max_depth=6,n_estimators=200),
            XGBClassifier(learning_rate =0.05, n_estimators=300, max_depth=6, min_samples_leaf=3, num_round=2000),
            KNeighborsClassifier(n_neighbors=5, weights='uniform', p=1)]

    # Level-2 matrices: one column per base classifier.
    dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
    dataset_blend_test = np.zeros((X_submission.shape[0], len(clfs)))

    for j, clf in enumerate(clfs):
        print(j, clf)
        # Per-fold submission probabilities for this classifier, averaged after the loop.
        dataset_blend_test_j = np.zeros((X_submission.shape[0], len(skf)))
        for i, (train, test) in enumerate(skf):
            print("Fold", i)
            clf.fit(X[train], y[train])
            # Out-of-fold probabilities: each training row is predicted by a model that
            # did not see it.
            dataset_blend_train[test, j] = clf.predict_proba(X[test])[:, 1]
            dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:, 1]
        dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)
    print("Blending.")

    lr1 = LogisticRegression(C=0.1, max_iter=1000)
    lr2 = LogisticRegression(C=0.01, max_iter=1000)

    from sklearn.ensemble import VotingClassifier
    clf = VotingClassifier(estimators=[('lr1', lr1), ('lr2', lr2)], voting='hard')
    clf.fit(dataset_blend_train, y)
    return clf.predict(dataset_blend_test)

In [None]:
# Run the blend with 5 stratified folds on the augmented train/test feature matrices.
pred = blend(trainData, y_train, testData, 5)

In [None]:
# Write the blended predictions to blend.csv (without the index column).
submitData["Survived"] = pred
submitData.to_csv('blend.csv', index=False)