In [273]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score

In [274]:
#读取数据
Train = pd.read_csv('train.csv')
Test = pd.read_csv('test.csv')
X = Train.drop('Survived', axis=1)
Y = Train['Survived']
Train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [275]:
Test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [276]:
#合并数据方便处理
X["is_train"] = 1
Test["is_train"] = 0
All = pd.concat((X, Test), axis=0)
All.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Pclass       1309 non-null   int64  
 2   Name         1309 non-null   object 
 3   Sex          1309 non-null   object 
 4   Age          1046 non-null   float64
 5   SibSp        1309 non-null   int64  
 6   Parch        1309 non-null   int64  
 7   Ticket       1309 non-null   object 
 8   Fare         1308 non-null   float64
 9   Cabin        295 non-null    object 
 10  Embarked     1307 non-null   object 
 11  is_train     1309 non-null   int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 132.9+ KB


In [277]:
All = All.drop('PassengerId', axis=1)

In [278]:
pattern = r".*, (.*?)\.."
All['Title'] = All['Name'].str.extract(pattern)
title_counts = All['Title'].value_counts()
rare_title = title_counts[title_counts <= 10].index.tolist()
def replace(n):
    if n in rare_title:
        return "Rare"
    return n
All['Title'] = All['Title'].apply(replace)
All['Title'].value_counts()
All = All.drop("Name", axis=1)

In [279]:
All['Sex'] = All['Sex'].map({'male': 1, 'female': 0})
All.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_train,Title
0,3,1,22.0,1,0,A/5 21171,7.25,,S,1,Mr
1,1,0,38.0,1,0,PC 17599,71.2833,C85,C,1,Mrs
2,3,0,26.0,0,0,STON/O2. 3101282,7.925,,S,1,Miss
3,1,0,35.0,1,0,113803,53.1,C123,S,1,Mrs
4,3,1,35.0,0,0,373450,8.05,,S,1,Mr


In [280]:
All = All.drop('Ticket', axis=1)
All.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,is_train,Title
0,3,1,22.0,1,0,7.25,,S,1,Mr
1,1,0,38.0,1,0,71.2833,C85,C,1,Mrs
2,3,0,26.0,0,0,7.925,,S,1,Miss
3,1,0,35.0,1,0,53.1,C123,S,1,Mrs
4,3,1,35.0,0,0,8.05,,S,1,Mr


In [281]:
All = All.drop('Cabin', axis=1)
All.isnull().sum()

Pclass        0
Sex           0
Age         263
SibSp         0
Parch         0
Fare          1
Embarked      2
is_train      0
Title         0
dtype: int64

In [282]:
X_tran = All[All['is_train'] == 1]
Test_tran = All[All['is_train'] == 0]
X_tran = X_tran.drop('is_train', axis=1)
Test_tran = Test_tran.drop('is_train', axis=1)

In [283]:
#分割训练集与验证集
x_train, x_val, y_train, y_val = train_test_split(X_tran, Y, test_size=0.2, random_state=42)

In [284]:
mode = x_train['Embarked'].mode().iloc[0]
x_train['Embarked'] = x_train['Embarked'].fillna(mode)
x_val['Embarked'] = x_val['Embarked'].fillna(mode)
median = x_train['Fare'].median()
Test_tran['Fare'] = Test_tran['Fare'].fillna(median)

In [285]:
#创建管道
Age_transform = Pipeline(steps=[['SimpleImputer', SimpleImputer(strategy='median')],
                                ['KBinsDiscretizer', KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='quantile')],])
categories_columns = ['Embarked', 'Title']
categories_transform = Pipeline(steps=[('OneHotEncoder', OneHotEncoder(handle_unknown='ignore'))])
transformer = ColumnTransformer(transformers=[('Categories', categories_transform, categories_columns),
                                              ('Numerical', Age_transform, ['Age'])])
clf = Pipeline(steps=[('transform', transformer),
                      ('Classifier', XGBClassifier(max_depth=5, learning_rate=0.1, n_estimators=100))])

In [286]:
#训练模型
clf.fit(x_train, y_train)



0,1,2
,steps,"[('transform', ...), ('Classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('Categories', ...), ('Numerical', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,n_bins,4
,encode,'ordinal'
,strategy,'quantile'
,quantile_method,'warn'
,dtype,
,subsample,200000
,random_state,

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [287]:
#模型评估
y_predict = clf.predict(x_val)
acc = accuracy_score(y_val, y_predict)
precision = precision_score(y_val, y_predict)
recall = recall_score(y_val, y_predict)
f1 = f1_score(y_val, y_predict)
y_train_predict = clf.predict(x_train)
acc_train = accuracy_score(y_train, y_train_predict)
print('Accuracy Train:', acc_train)
print("Accuracy:", acc)
print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)

Accuracy Train: 0.7991573033707865
Accuracy: 0.770949720670391
Precision: 0.7142857142857143
Recall: 0.7432432432432432
F1: 0.7284768211920529


In [288]:
Test_predict = clf.predict(Test_tran)
Test_predict = pd.DataFrame(Test_predict)
Test['Survived'] = Test_predict.values
Submit = Test[['PassengerId', 'Survived']]
Submit.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [289]:
Submit.to_csv('submission1.csv', index=False)