In [None]:
# 数据分析
import pandas as pd
import numpy as np

# 机器学习
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import StratifiedKFold,GridSearchCV,train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

# Model persistence.
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in 0.23, so
# import the standalone package first and only fall back to the bundled copy on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

#可视化
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

1、数据总览

In [None]:
# Read the training and test tables from the data directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

In [None]:
# Preview the first rows of the training table (head() defaults to five rows).
train_df.head()

In [None]:
# Print the column summary of each table, mainly to spot columns with missing values.
for heading, frame in (("训练数据", train_df), ("测试数据", test_df)):
    print(heading)
    frame.info()

2、数据分析

In [None]:
# Summary statistics of the numeric columns
train_df.describe()

In [None]:
# Number of rows for each value of the Survived label
train_df['Survived'].value_counts()

In [None]:
# Pairwise correlation matrix via DataFrame.corr(): values near 0 mean no linear
# relationship, values above 0 a positive one, values below 0 a negative one.
# Numeric columns are selected explicitly because pandas >= 2.0 no longer drops text
# columns silently inside corr() and raises on them instead.
train_corr = (
    train_df
    .drop('PassengerId', axis=1)
    .select_dtypes(include='number')
    .corr()
)
train_corr

In [None]:
# Annotated heatmap of the correlation matrix on a 15x9 figure.
fig, ax = plt.subplots(figsize=(15, 9))
a = sns.heatmap(
    train_corr,
    vmin=-1,
    vmax=1,
    annot=True,
    square=True,
    ax=ax,  # the axes just created, which is also the current axes
)

In [None]:
# Survival rate for each passenger class.
# The column is selected with a list: indexing a GroupBy with several bare names
# (['Pclass','Survived']) was deprecated and raises on pandas >= 2.0. Pclass is already
# the group key (the result index), so only Survived needs to be aggregated.
train_df.groupby(['Pclass'])[['Survived']].mean()

In [None]:
# Bar chart of the mean survival rate for each passenger class.
pclass_survival = train_df.groupby('Pclass')[['Survived']].mean()
pclass_survival.plot.bar(color=['r','g','b'])

In [None]:
# Survival rate for each sex.
# Only Survived is aggregated: Sex is the group key (and a text column, so it has no
# mean), and list-style selection is required because indexing a GroupBy with several
# bare names raises on pandas >= 2.0.
train_df.groupby(['Sex'])[['Survived']].mean()

In [None]:
# Bar chart of the mean survival rate for each sex.
sex_survival = train_df.groupby('Sex')[['Survived']].mean()
sex_survival.plot.bar()

In [None]:
# Bar chart of the mean survival rate for each (passenger class, sex) combination.
class_sex_survival = train_df.groupby(['Pclass','Sex'])[['Survived']].mean()
class_sex_survival.plot.bar()

In [None]:
# Number of passengers in each (Sex, Pclass, Survived) combination
train_df.groupby(['Sex','Pclass','Survived'])['Survived'].count()

In [None]:
# Mean survival rate for each number of siblings/spouses aboard (SibSp).
by_sibsp = train_df.groupby(['SibSp'])
by_sibsp[['SibSp','Survived']].mean()

In [None]:
# Bar chart of the mean survival rate for each SibSp value.
sibsp_survival = train_df.groupby('SibSp')[['Survived']].mean()
sibsp_survival.plot.bar()

In [None]:
# Mean survival rate for each number of parents/children aboard (Parch).
by_parch = train_df.groupby('Parch')
by_parch[['Parch','Survived']].mean()

In [None]:
# Bar chart of the mean survival rate for each Parch value.
parch_survival = train_df.groupby('Parch')[['Survived']].mean()
parch_survival.plot.bar()

In [None]:
# Age distribution split by survival outcome (Age contains missing values).
# `height` replaces the former `size` argument of FacetGrid, which newer seaborn
# releases no longer accept.
g = sns.FacetGrid(train_df, col='Survived', height=5)
g.map(plt.hist, 'Age', bins=40)

In [None]:
# Line plot of the mean survival rate at each distinct Age value
train_df.groupby(['Age'])['Survived'].mean().plot()

In [None]:
# Passenger counts per port of embarkation, split by survival outcome.
# The column is passed as `x=`: newer seaborn releases no longer accept the data
# variable as a positional argument.
sns.countplot(x='Embarked', hue='Survived', data=train_df)

3、特征工程

In [None]:
# Stack train and test so the feature engineering is applied once to both
# (note: standardisation still has to be fitted separately later on).
# The test table has no label, so add a placeholder Survived column to align the schemas.
test_df['Survived'] = 0
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
# ignore_index=True gives every row a unique label instead of repeating the labels of
# both source frames, so label-aligned operations on train_test stay unambiguous.
train_test = pd.concat([train_df, test_df], ignore_index=True)

In [None]:
# Only the last expression of a cell is displayed, so the shape has to be printed
# explicitly; the bare `train_test.shape` expression was silently discarded.
print(train_test.shape)
train_test.head()

In [None]:
# One-hot encode Pclass with get_dummies (the original column is replaced by Pclass_* columns)
train_test = pd.get_dummies(train_test,columns=['Pclass'])
train_test.head()

In [None]:
# One-hot encode Sex
train_test = pd.get_dummies(train_test,columns=["Sex"])
train_test.head()

In [None]:
# Preview the combined table after the encoding steps so far
train_test.head()

In [None]:
# Family features: SibSp_Parch is the sum of siblings/spouses and parents/children;
# SibSp, Parch and the sum are then all one-hot encoded as categorical values.
train_test['SibSp_Parch'] = train_test['SibSp']+train_test['Parch']
train_test = pd.get_dummies(train_test,columns=['SibSp','Parch','SibSp_Parch'])
train_test.head()

In [None]:
#train_test.info()
# One-hot encode Embarked
train_test = pd.get_dummies(train_test,columns=["Embarked"])
train_test.head()

In [None]:
# Extract the title from Name: keep the text after the (last) comma, then everything up
# to the first period, and strip the surrounding whitespace.
# Raw strings are used so that `\.` is a regex escape rather than an invalid Python
# string escape (which triggers a warning on recent Python versions).
train_test['Name1'] = (
    train_test['Name']
    .str.extract(r'.+,(.+)', expand=False)
    .str.extract(r'^(.+?)\.', expand=False)
    .str.strip()
)

In [None]:
# Collapse the raw titles into a handful of groups.
# The previous version called Series.replace for the 'Mrs' and 'Miss' groups without
# inplace=True and without assigning the result, so 'Mme', 'Ms' and 'Mlle' were never
# merged. A single mapping applied with one assignment avoids that, and also avoids
# in-place mutation of a column selection. 'Mr' and 'Master' already are their own group.
title_groups = {
    'Officer': ['Capt', 'Col', 'Major', 'Dr', 'Rev'],
    'Royalty': ['Jonkheer', 'Don', 'Sir', 'the Countess', 'Dona', 'Lady'],
    'Mrs': ['Mme', 'Ms', 'Mrs'],
    'Miss': ['Mlle', 'Miss'],
}
title_to_group = {
    raw_title: group
    for group, raw_titles in title_groups.items()
    for raw_title in raw_titles
}
train_test['Name1'] = train_test['Name1'].replace(title_to_group)

In [None]:
# One-hot encode the grouped title column Name1
train_test=pd.get_dummies(train_test,columns=["Name1"])
#train_test.head()

In [None]:
# Surname feature.
# The surname is the part of Name before the comma. The previous split('.')[1] returned
# the text after the title (the given names), which contradicts the stated intent of
# grouping passengers by family name.
train_test['Name2'] = train_test['Name'].str.split(',').str[0].str.strip()

# Count how often each surname occurs and attach the count to every row.
# The merge also leaves train_test with a fresh 0..n-1 index.
Name2_sum = (
    train_test['Name2']
    .value_counts()
    .rename_axis('Name2')
    .reset_index(name='Name2_sum')
)
train_test = pd.merge(train_test, Name2_sum, how='left', on='Name2')

# A surname that occurs only once carries no shared-family information, so all such
# surnames are collapsed into the single category 'one'.
train_test['Name2_new'] = train_test['Name2'].where(train_test['Name2_sum'] > 1, 'one')

# Name and the raw surname are no longer needed once the grouped surname exists.
train_test = train_test.drop(columns=['Name2', 'Name'])

# One-hot encode the grouped surname.
train_test = pd.get_dummies(train_test, columns=['Name2_new'])

#train_test.head()

In [None]:
# Fare: show the rows where Fare is missing
train_test.loc[train_test["Fare"].isnull()]

In [None]:
# Fare is related to Pclass and Embarked, so look at the mean training fare per (Pclass, Embarked) group
train_df.groupby(by=["Pclass","Embarked"]).Fare.mean()

In [None]:
# Fill the missing Fare with the mean training fare of the Pclass=3 / Embarked='S' group
# (the value previously hard-coded here as 14.644083). It is computed from train_df,
# which still holds the raw Pclass and Embarked columns.
fare_fill = train_df.loc[
    (train_df['Pclass'] == 3) & (train_df['Embarked'] == 'S'), 'Fare'
].mean()
# Assign the result back instead of calling fillna(inplace=True) on a column selection,
# which is not guaranteed to modify train_test itself.
train_test['Fare'] = train_test['Fare'].fillna(fare_fill)
# Check: this should now display no rows. `.loc` must be indexed with square brackets;
# the previous `.loc(...)` call with parentheses does not perform a row selection.
train_test.loc[train_test['Fare'].isnull()]

In [None]:
# Ticket prefix: the first whitespace-separated token of every ticket string.
# (str.isnumeric() returns True only when every character of a string is numeric.)
ticket_tokens = train_test['Ticket'].str.split()
train_test['Ticket_Letter'] = ticket_tokens.str[0]

In [None]:
# Purely numeric prefixes carry no category information: blank them out as NaN,
# then discard the raw Ticket column.
ticket_prefix = train_test['Ticket_Letter']
train_test['Ticket_Letter'] = ticket_prefix.mask(ticket_prefix.str.isnumeric())
train_test = train_test.drop('Ticket', axis=1)

In [None]:
# One-hot encode the ticket prefix; NaN values need no special handling here
# (get_dummies leaves all indicator columns at 0 for them). drop_first=True drops the first category.
train_test = pd.get_dummies(train_test,columns=['Ticket_Letter'],drop_first=True)
#train_test.head()

In [None]:
# Age: show the rows where Age is missing
train_test.loc[train_test['Age'].isnull()]

In [None]:
# Feature idea added after the first modelling pass: a missing Age may itself be related
# to the outcome, so look at the survival rate of passengers whose Age is missing.
# The rate is computed on train_df only: in train_test the test rows carry a placeholder
# Survived value, so including them drags the mean down (the figure of 0.19 quoted here
# earlier was computed that way). Note that the mean of Survived is a survival rate,
# not a death rate.
train_df.loc[train_df["Age"].isnull(), "Survived"].mean()

In [None]:
# Indicator feature: 1.0 where Age is missing, 0.0 where it is present, then one-hot encoded.
age_is_missing = train_test["Age"].isnull()
train_test["age_nan"] = age_is_missing.astype(float)
train_test = pd.get_dummies(train_test, columns=["age_nan"])

In [None]:
# Feature table for the age-imputation model: drop the survival label, the raw Cabin
# text and the PassengerId row identifier (an arbitrary ID is not a meaningful
# predictor). Age stays in because it is the regression target.
missing_age = train_test.drop(['Survived', 'Cabin', 'PassengerId'], axis=1)
# Rows with a known Age form the training set; rows with a missing Age are the ones to predict.
missing_age_train = missing_age[missing_age['Age'].notnull()]
missing_age_test = missing_age[missing_age['Age'].isnull()]

In [None]:
# Split the imputation frames into features (everything except Age) and target (Age).
missing_age_Y_train = missing_age_train['Age']
missing_age_X_train = missing_age_train.drop(columns=['Age'])
missing_age_X_test = missing_age_test.drop(columns=['Age'])

In [None]:
# Standardise the imputation features: the scaler is fitted on the rows with a known Age
# and the same transformation is then applied to the rows whose Age is missing.
ss = StandardScaler()
missing_age_X_train = ss.fit_transform(missing_age_X_train)
missing_age_X_test = ss.transform(missing_age_X_test)

In [None]:
# Bayesian ridge regression is used to predict the missing ages
lin = BayesianRidge()

In [None]:
# Fit the age regressor on the standardised rows whose Age is known
lin.fit(missing_age_X_train,missing_age_Y_train)

In [None]:
# Write the predicted ages into the rows whose Age is missing, using .loc
train_test.loc[(train_test['Age'].isnull()), 'Age'] = lin.predict(missing_age_X_test)

In [None]:
# Bucket Age into five stages: up to 10, 10-18, 18-30, 30-50, above 50.
# The outer edges are open-ended: part of the ages come from a linear regression, which
# can predict values at or below 0 (or above 100). With the previous [0, ..., 100] edges
# such rows fell outside every bin, became NaN and ended up with all-zero dummies.
age_bins = [-np.inf, 10, 18, 30, 50, np.inf]
train_test['Age'] = pd.cut(train_test['Age'], bins=age_bins, labels=[1, 2, 3, 4, 5])

train_test = pd.get_dummies(train_test, columns=['Age'])

In [None]:
# Preview the combined table after the Age encoding
train_test.head()

In [None]:
# Cabin is mostly missing, so only its first letter is used as a category.
# Missing cabins stay NaN and therefore get no indicator column of their own.
cabin_letter = train_test['Cabin'].str[0]
train_test['Cabin_nan'] = cabin_letter
train_test = pd.get_dummies(train_test, columns=['Cabin_nan'])

In [None]:
 # Cabin is mostly missing: add a has/has-not indicator (1.0 = missing, 0.0 = present),
# one-hot encode it, then discard the raw Cabin column.
cabin_is_missing = train_test["Cabin"].isnull()
train_test["Cabin_nan"] = cabin_is_missing.astype(float)
train_test = pd.get_dummies(train_test, columns=['Cabin_nan'])
train_test = train_test.drop('Cabin', axis=1)

In [None]:
# Preview the combined table after the Cabin features
train_test.head()

In [None]:
# Feature engineering is done: split the combined table back into train and test.
# The boundary is taken from the training table itself instead of the magic number 891,
# and the split is positional (train rows were stacked first).
n_train = len(train_df)
train_data = train_test.iloc[:n_train]
test_data = train_test.iloc[n_train:]

# Survived is the label and PassengerId is an arbitrary row identifier; neither belongs
# in the feature matrix. Submissions take PassengerId from test_df, not from here.
non_feature_cols = ['Survived', 'PassengerId']
train_data_X = train_data.drop(non_feature_cols, axis=1)
train_data_Y = train_data['Survived']
test_data_X = test_data.drop(non_feature_cols, axis=1)

In [None]:
# Shape of the training feature matrix as (rows, feature columns)
print(train_data_X.shape)

In [None]:
# Standardise the model features: fit the scaler on the training rows only and reuse
# the fitted transformation for the test rows.
ss2 = StandardScaler()
train_data_X_sd = ss2.fit_transform(train_data_X)
test_data_X_sd = ss2.transform(test_data_X)

4、建立基模型

In [None]:
# Logistic regression

# Hold out 10% of the standardised training rows as a dev set.
# NOTE: this rebinds `train_data` (previously the training slice of train_test) to the
# array returned by the split; the SVM cell relies on these four names.
train_data, dev_data, train_label, dev_label = train_test_split(
    train_data_X_sd, train_data_Y, test_size=0.1, random_state=34
)

# 5-fold grid search over the regularisation strength and the iteration cap.
lr1 = LogisticRegression()
param = {'C': [0.001, 0.01, 0.1, 1, 10], "max_iter": [100, 250]}
clf = GridSearchCV(lr1, param, cv=5)
clf.fit(train_data, train_label)

# Report the search results. `grid_scores_` was removed from GridSearchCV in
# scikit-learn 0.20; `cv_results_` holds the per-candidate scores instead.
cv_results = pd.DataFrame(clf.cv_results_)
print(cv_results[['params', 'mean_test_score', 'std_test_score']])
print(clf.best_params_)

# Refit with the parameters the search actually selected rather than hard-coded values.
lr = LogisticRegression(**clf.best_params_)
lr.fit(train_data, train_label)

print(lr.score(dev_data, dev_label))
# Write the submission file.
test_df["Survived"] = lr.predict(test_data_X_sd)
test_df[["PassengerId","Survived"]].set_index("PassengerId").to_csv('../out/LR.csv')

In [None]:
# Random forest on the unscaled features; the out-of-bag score is printed as an
# estimate of generalisation accuracy.
rf = RandomForestClassifier(n_estimators=150, min_samples_leaf=3, max_depth=6, oob_score=True)
rf.fit(train_data_X, train_data_Y)
print(rf.oob_score_)
test_df["Survived"] = rf.predict(test_data_X)
# to_csv returns None when given a path, so its result is no longer bound to a name
# (the former `RF = ...` was always None).
test_df[['PassengerId','Survived']].set_index('PassengerId').to_csv('../out/RF.csv')

# # Save the model
# from sklearn.externals import joblib
# joblib.dump(rf, '../model/rf10.pkl')

In [None]:
# Support vector classifier with default settings, trained on the standardised split
# and scored on the held-out dev rows.
svc = SVC().fit(train_data, train_label)
dev_accuracy = svc.score(dev_data, dev_label)
print(dev_accuracy)
# Predict the test set and write the submission file.
svm_predictions = svc.predict(test_data_X_sd)
test_df["Survived"] = svm_predictions
test_df[["PassengerId","Survived"]].set_index("PassengerId").to_csv('../out/SVM.csv')

In [None]:
# Gradient-boosted decision trees on the unscaled features.

gbdt = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.7,
    max_depth=6,
    min_samples_leaf=2,
).fit(train_data_X, train_data_Y)

# Predict the test set and write the submission file.
gbdt_predictions = gbdt.predict(test_data_X)
test_df["Survived"] = gbdt_predictions
test_df[['PassengerId','Survived']].set_index('PassengerId').to_csv('../out/GBDT.csv')

In [None]:
# XGBoost on the unscaled features.

# `min_samples_leaf` is a scikit-learn tree parameter, not an XGBoost one: XGBClassifier
# only forwards it as an unknown booster parameter and warns that it may not be used,
# so it is dropped here.
# NOTE(review): if a leaf-size constraint is wanted, XGBoost's closest knob is
# `min_child_weight`; confirm the intended value before adding it.
xgb_model = xgb.XGBClassifier(n_estimators=150, max_depth=6)
xgb_model.fit(train_data_X, train_data_Y)

# Predict the test set and write the submission file.
test_df["Survived"] = xgb_model.predict(test_data_X)
test_df[['PassengerId','Survived']].set_index('PassengerId').to_csv('../out/XGB.csv')

5、模型融合

In [None]:
# Hard-voting ensemble of four base models, trained on the standardised features.

lr = LogisticRegression(C=0.1, max_iter=100)
# `min_samples_leaf` (a scikit-learn tree parameter) and `num_round` (the round count of
# the native xgb.train API) are not XGBClassifier parameters; the number of boosting
# rounds is already set through n_estimators, so both are dropped.
xgb_model = xgb.XGBClassifier(max_depth=6, n_estimators=100)
rf = RandomForestClassifier(n_estimators=200, min_samples_leaf=2, max_depth=6, oob_score=True)
gbdt = GradientBoostingClassifier(learning_rate=0.1, min_samples_leaf=2, max_depth=6, n_estimators=100)

vot = VotingClassifier(
    estimators=[('lr', lr), ('rf', rf), ('gbdt', gbdt), ('xgb', xgb_model)],
    voting='hard',
)
vot.fit(train_data_X_sd, train_data_Y)

# Predict the test set and write the submission file.
test_df["Survived"] = vot.predict(test_data_X_sd)
test_df[['PassengerId','Survived']].set_index('PassengerId').to_csv('../out/VOT.csv')

In [None]:
# Stacking ensemble.
# Level 1: each base model produces out-of-fold probabilities for the training rows and
# fold-averaged probabilities for the test rows. Level 2: a logistic regression is
# trained on those probabilities.
X = train_data_X_sd
X_predict = test_data_X_sd
y = train_data_Y
# Plain array view of the labels: the fold indices from StratifiedKFold are positions,
# whereas `y[train_index]` on a Series is a label lookup that only works while the index
# happens to be 0..n-1.
y_values = np.asarray(y)

# LogisticRegression, xgb, RandomForestClassifier, GradientBoostingClassifier and
# StratifiedKFold are already imported in the notebook's first cell, so the duplicate
# imports that used to sit here are gone.
# NOTE(review): `svm` is not used in this cell; the import is kept only in case a later
# cell relies on it.
from sklearn import svm

# Base models used in the first layer.
# (`num_round` is not an XGBClassifier parameter; n_estimators sets the number of rounds.)
clfs = [LogisticRegression(C=0.1, max_iter=100),
        xgb.XGBClassifier(max_depth=6, n_estimators=100),
        RandomForestClassifier(n_estimators=100, max_depth=6, oob_score=True),
        GradientBoostingClassifier(learning_rate=0.3, max_depth=6, n_estimators=100)]

# Stratified K-fold splitter; n_folds is the single source of truth for the fold count.
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds)

# One column per base model.
dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
dataset_blend_test = np.zeros((X_predict.shape[0], len(clfs)))

for j, clf in enumerate(clfs):
    # Train base model j once per fold.
    dataset_blend_test_j = np.zeros((X_predict.shape[0], n_folds))
    for i, (train_index, test_index) in enumerate(skf.split(X, y_values)):
        # Fit on every fold except fold i, then predict fold i: these out-of-fold
        # probabilities become the level-2 training feature for those rows.
        clf.fit(X[train_index], y_values[train_index])
        dataset_blend_train[test_index, j] = clf.predict_proba(X[test_index])[:, 1]
        dataset_blend_test_j[:, i] = clf.predict_proba(X_predict)[:, 1]
    # For the test set, the level-2 feature is the mean prediction of the fold models.
    dataset_blend_test[:, j] = dataset_blend_test_j.mean(axis=1)

# Second-layer model trained on the base-model probabilities.
clf2 = LogisticRegression(C=0.1, max_iter=100)
clf2.fit(dataset_blend_train, y_values)

# Predict the test set and write the submission file.
test_df["Survived"] = clf2.predict(dataset_blend_test)
test_df[['PassengerId','Survived']].set_index('PassengerId').to_csv('../out/Stack.csv')