- 对Titanic数据进行清洗，建模并对乘客生存进行预测。使用之前介绍过的10种模型中的至少2种（包括TPOT）
-  https://www.kaggle.com/c/titanic/data

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier


In [3]:
train_data = pd.read_csv(r'E:\bi_course\biCourse\L3\sat_course\train.csv')
test_data =  pd.read_csv(r'E:\bi_course\biCourse\L3\sat_course\test.csv')

#数据探索
print('训练数据行列数:',train_data.shape, '; 测试数据行列数:',test_data.shape)

print('-'*20,'训练集数据探索','-'*20)
train_data.info()
print('-'*40,'-'*40)
print(train_data.describe())

print('-'*20,'测试集数据探索','-'*20)
test_data.info()
print('-'*40,'-'*40)
print(test_data.describe())

训练数据行列数: (891, 12) ; 测试数据行列数: (418, 11)
-------------------- 训练集数据探索 --------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
---------------------------------------- ----------------------------------------
       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592   

In [4]:
#提取title
def extract_title(name: str):
    title = name.split(', ')[1].split('.')[0]
    return title

#将titel分为四类: 'Mr','Ms','Miss','Honored'
def unify_title(title: str):

    if title in ['Dr','Rev','Major','Col','Lady','Jonkheer','Sir','the Countess','Capt','Don','Dona']: 
        return  'Honored'
    elif title in ['Mlle','Ms']:
        return 'Miss'
    elif title == 'Mme':
        return  'Mrs'
    elif title == 'Master': #不确定master是否应该分为mr
        return  'Mr'
    else :
        return title

#用某一列(groupCol)的分类平均值计算填充的缺失值
def fillna_usingMean(df,groupCol, naCol):
    groupList = list(set(df[groupCol]))
    df_filled_na = pd.DataFrame()
    for group in groupList:
        df_select = df [df[groupCol] == group]
        df_select[naCol].fillna(df_select[naCol].mean(), inplace = True)
        df_filled_na = df_filled_na.append(df_select)

    df = df_filled_na

    return df


#训练不同模型并输出分数报告
def get_model_score(clf, X_train, y_train, X_test, y_test):

    clf.fit(X_train, y_train)
    y_predit = clf.predict(X_test)

    # 模型表现
    report = pd.Series()
    report['clfName'] = str(clf).split('(')[0]
    report['score']  = round(clf.score(X_train, y_train), 4)
    report['cv_score'] = round(np.mean(cross_val_score(clf, X_train, y_train, cv=10)),4)
    report['auc_score'] = round(roc_auc_score(y_test, y_predit),4)
    report['clf'] = clf
    report = pd.DataFrame(report).T

    return report

In [9]:
#数据预处理 

#整合train & test, 对数据进行预处理
train_data['type'] = 'train'
test_data['type'] = 'test'
df = pd.concat([train_data, test_data], axis = 0)


#需要处理的字段: Name(姓名Title混合), age(缺失),Cabin(缺失),Embarked(缺失)
#家庭人数: 配偶, 子女, 亲属 计算登船家庭成员数

df['newTitle']= df['Name'].apply(lambda x: unify_title(extract_title(x))) #提取title ; 姓和名不统一, 暂时不使用该字段作为特征

#fillna
df = fillna_usingMean(df,'newTitle', 'Age')  #用对应title的平均值进行age填充:
df = fillna_usingMean(df,'Pclass', 'Fare') #用对应Pclass的平均值进行fare填充

#drop掉缺失严重的Cabin, Name两列, drop掉有两个缺失值的Embarked所在行
df = df.drop(columns={'Cabin','Name'}).dropna(subset=['Embarked'])

#新增衍生特征
df['familyNum'] = df['Parch'] + df['SibSp'] + 1 #包含本人的家庭成员数

ticket_groupby = df.groupby('Ticket', as_index = False)['PassengerId'].count().rename(columns = {'PassengerId':'PassNumWithSameTicket'})
df = df.merge(ticket_groupby, on = 'Ticket')  #同一个ticket的人数


df['age_mp_Fare'] = df['Age'] * df['Fare']
df['age_mp_Pclass'] = df['Age'] * df['Pclass']

print('数据处理后特征缺失值总数:',sum(df.drop(columns ='Survived').isna().sum())) #排除Survived列

数据处理后特征缺失值总数:0


In [11]:
df

Unnamed: 0,Age,Embarked,Fare,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,type,newTitle,familyNum,PassNumWithSameTicket,age_mp_Fare,age_mp_Pclass
0,58.000000,S,26.5500,0,12,1,female,0,1.0,113783,train,Miss,1,1,1539.90000,58.000000
1,23.000000,S,263.0000,2,89,1,female,3,1.0,19950,train,Miss,6,6,6049.00000,23.000000
2,24.000000,S,263.0000,2,342,1,female,3,1.0,19950,train,Miss,6,6,6312.00000,24.000000
3,28.000000,S,263.0000,2,945,1,female,3,,19950,test,Miss,6,6,7364.00000,28.000000
4,60.000000,S,263.0000,4,961,1,female,1,,19950,test,Mrs,6,6,15780.00000,60.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1302,22.000000,S,7.7750,0,1290,3,male,0,,347065,test,Mr,1,1,171.05000,66.000000
1303,31.000000,Q,7.7333,0,1291,3,male,0,,21332,test,Mr,1,1,239.73230,93.000000
1304,30.014322,S,8.0500,0,1305,3,male,0,,A.5. 3236,test,Mr,1,1,241.61529,90.042965
1305,38.500000,S,7.2500,0,1307,3,male,0,,SOTON/O.Q. 3101262,test,Mr,1,1,279.12500,115.500000


In [7]:
#模型数据准备

#创建x_train, y_train, x_test

df_toTrain = df[df['type'] == 'train']
df_toPredict = df[df['type'] == 'test'] #训练模型后预测

features = ['Age', 'Embarked','Fare','Pclass','Sex','SibSp','newTitle','familyNum','PassNumWithSameTicket','age_mp_Fare']

X_train,X_test, y_train,y_test = train_test_split(df_toTrain[features], df_toTrain['Survived'], train_size = 0.7, random_state =1)   #模型训练集和测试集
x_toPredict = df_toPredict[features] #样本外需要预测的数据

#转化成oneHot Coding
X_train = pd.get_dummies(X_train) 
X_test = pd.get_dummies(X_test)
x_toPredict = pd.get_dummies(x_toPredict) 

#数据规范化
scaler = MinMaxScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)
x_toPredict = scaler.fit_transform(x_toPredict)

#训练不同模型并得到评估结果

clfList = [DecisionTreeClassifier(), LogisticRegression(), GaussianNB(),MultinomialNB(),BernoulliNB(),svm.SVC(), ExtraTreesClassifier(),\
            RandomForestClassifier(), KNeighborsClassifier(), AdaBoostClassifier(),GradientBoostingClassifier(), XGBClassifier(), LinearDiscriminantAnalysis(),]

score_report = pd.DataFrame()

for clf in clfList:

    result = get_model_score(clf, X_train, y_train, X_test, y_test)
    score_report = score_report.append(result)

# score_report
score_report = score_report.sort_values(by = 'auc_score', ascending = False)
score_report

Unnamed: 0,clfName,score,cv_score,auc_score,clf
0,LinearDiscriminantAnalysis,0.791,0.7701,0.8321,"LinearDiscriminantAnalysis(n_components=None, ..."
0,LogisticRegression,0.7974,0.7974,0.8261,"LogisticRegression(C=1.0, class_weight=None, d..."
0,SVC,0.7701,0.7668,0.8181,"SVC(C=1.0, cache_size=200, class_weight=None, ..."
0,MultinomialNB,0.7717,0.7701,0.8171,"MultinomialNB(alpha=1.0, class_prior=None, fit..."
0,KNeighborsClassifier,0.8424,0.7798,0.8161,"KNeighborsClassifier(algorithm='auto', leaf_si..."
0,BernoulliNB,0.7701,0.7685,0.8091,"BernoulliNB(alpha=1.0, binarize=0.0, class_pri..."
0,GradientBoostingClassifier,0.9228,0.7974,0.7931,([DecisionTreeRegressor(criterion='friedman_ms...
0,GaussianNB,0.7717,0.7716,0.7892,"GaussianNB(priors=None, var_smoothing=1e-09)"
0,AdaBoostClassifier,0.8521,0.7942,0.7721,"(DecisionTreeClassifier(class_weight=None, cri..."
0,ExtraTreesClassifier,0.9904,0.7734,0.7701,"(ExtraTreeClassifier(class_weight=None, criter..."


In [45]:
#拿最优模型预测df_toPredict(即test数据集)

bestModel = score_report.iloc[0]['clf']
bestModel.fit(X_train, y_train)

print('bestModel:', bestModel)

sumbmission = pd.DataFrame()
sumbmission['PassengerId'] = df_toPredict['PassengerId']
sumbmission['Survived'] = bestModel.predict(x_toPredict).astype(int) #用最优模型对y值进行预测
sumbmission = sumbmission.set_index('PassengerId').sort_index()

sumbmission.to_csv('zoni_submission.csv', header = True)

print('submission to csv done')

bestModel: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
submission to csv done


In [8]:
#使用TPOT 选择最优模型
from tpot import TPOTClassifier

tpot = TPOTClassifier( generations  = 10, population_size = 30, verbosity = 2)
tpot.fit(X_train, y_train)

print(tpot.score(X_test, y_test))
tpot.export('tpot_titanic_pipeline.py')

Optimization Progress:  18%|█▊        | 59/330 [00:31<02:49,  1.59pipeline/s]Generation 1 - Current best internal CV score: 0.8247741935483871
Optimization Progress:  27%|██▋       | 90/330 [00:44<02:04,  1.93pipeline/s]Generation 2 - Current best internal CV score: 0.8248387096774193
Optimization Progress:  36%|███▌      | 119/330 [01:04<03:47,  1.08s/pipeline]Generation 3 - Current best internal CV score: 0.8248387096774193
Optimization Progress:  45%|████▌     | 150/330 [01:26<03:59,  1.33s/pipeline]Generation 4 - Current best internal CV score: 0.8248387096774193
Optimization Progress:  55%|█████▍    | 180/330 [01:44<02:19,  1.08pipeline/s]Generation 5 - Current best internal CV score: 0.8248387096774193
Optimization Progress:  64%|██████▎   | 210/330 [02:22<01:34,  1.27pipeline/s]Generation 6 - Current best internal CV score: 0.8279870967741936
Optimization Progress:  73%|███████▎  | 240/330 [02:42<00:57,  1.56pipeline/s]Generation 7 - Current best internal CV score: 0.82798709677