In [4]:
# 导入后续需要用到的库文件
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pylab as plt
# 这个时归一化
from sklearn.preprocessing import StandardScaler
# 划分训练集/测试集
from sklearn.model_selection import train_test_split
# 分类模型
from sklearn.linear_model import LogisticRegression
# 混淆矩阵
from sklearn.metrics import confusion_matrix

In [5]:
# 读取数据并查看
data = pd.read_csv("./Kaggle_Titanic-master/train.csv")
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Survived列即为label值，是否获救，其他可以看成待选择的特征列

In [6]:
# 数据不完整，有部分列的值缺失的情况，检查NA值的情况
data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
# 选取数据集中有用的特征，这里抛弃部分特征
data = data.drop(labels=['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [8]:
print("去除缺失值前：",data.shape)
# 去除有缺失值的行
data = data.dropna()
print("去除缺失值后：",data.shape)

去除缺失值前： (891, 8)
去除缺失值后： (712, 8)


In [9]:
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


可以看到，sex ,embarked列的值为离散值，而非数组，无法进入模型训练，因此还要处理离散值

In [10]:
# 分类变量编码
data_dummy = pd.get_dummies(data[['Sex', 'Embarked']])

In [11]:
data_dummy.head()

Unnamed: 0,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,1,0,0,1
1,1,0,1,0,0
2,1,0,0,0,1
3,1,0,0,0,1
4,0,1,0,0,1


这个函数做了one-hot编码，什么意思呢？就是说Sex这个变量有两个取值，那就变化为一个二维变量（0，0），如果是female, 那第一列为1，即（1，0），如果是male,则为（0，1）

同理，embarked列有三个离散取值，那变化为一个三维变量

In [12]:
# 取剩下的列
data_conti = pd.DataFrame(data, columns=['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], index=data.index)
# 进行拼接
data = data_conti.join(data_dummy)

In [13]:
X = data.iloc[:, 1:]
y = data.iloc[:, 0]
# test_size表示划分比例，train:test = 7 : 3
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [14]:
# 标准化
stdsc = StandardScaler()
X_train_conti_std = stdsc.fit_transform(X_train[['Age', 'SibSp', 'Parch', 'Fare']])
X_test_conti_std = stdsc.fit_transform(X_test[['Age', 'SibSp', 'Parch', 'Fare']])
print(X_train_conti_std.shape)
print(X_test_conti_std.shape)
# 将ndarray转为dataframe
X_train_conti_std = pd.DataFrame(data=X_train_conti_std, columns=['Age', 'SibSp', 'Parch', 'Fare'], index=X_train.index)
X_test_conti_std = pd.DataFrame(data=X_test_conti_std, columns=['Age', 'SibSp', 'Parch', 'Fare'], index=X_test.index)
X_test_conti_std.head()

(498, 4)
(214, 4)


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


Unnamed: 0,Age,SibSp,Parch,Fare
423,-0.20225,0.59205,0.704839,-0.428308
178,-0.068906,-0.581086,-0.521466,-0.45796
305,-2.007726,0.59205,1.931145,2.476491
292,0.331126,-0.581086,-0.521466,-0.460607
592,1.064517,-0.581086,-0.521466,-0.579743


In [15]:
# 有序分类变量Pclass
X_train_cat = X_train[['Pclass']]
X_test_cat = X_test[['Pclass']]
# 无序已编码的分类变量
X_train_dummy = X_train[['Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S']]
X_test_dummy = X_test[['Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S']]
# 拼接为dataframe
X_train_set = [X_train_cat, X_train_conti_std, X_train_dummy]
X_test_set = [X_test_cat, X_test_conti_std, X_test_dummy]
X_train = pd.concat(X_train_set, axis=1)
X_test = pd.concat(X_test_set, axis=1)

In [16]:
X_train.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
202,3,0.348671,-0.543073,-0.501181,-0.50843,0,1,0,0,1
439,2,0.137603,-0.543073,-0.501181,-0.435845,0,1,0,0,1
102,1,-0.565954,-0.543073,0.648995,0.774836,0,1,0,0,1
118,1,-0.354887,-0.543073,0.648995,3.860714,0,1,1,0,0
625,1,2.248277,-0.543073,-0.501181,-0.040291,0,1,0,0,1


In [17]:
# 基于训练集使用逻辑回归建模
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)
 
# 将模型应用于测试集并查看混淆矩阵
y_pred = classifier.predict(X_test)
confusion_matrix = confusion_matrix(y_test, y_pred)
print(confusion_matrix)

[[104  21]
 [ 29  60]]




# 也可以自己手动算

In [18]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

In [19]:
print("Precision: ",precision_score(y_pred=y_pred,y_true=y_test))
print("Recall：",recall_score(y_pred=y_pred,y_true=y_test))
print("accuracy：",accuracy_score(y_pred=y_pred,y_true=y_test))
print("f1 score：",f1_score(y_pred=y_pred,y_true=y_test))

Precision:  0.7407407407407407
Recall： 0.6741573033707865
accuracy： 0.7663551401869159
f1 score： 0.7058823529411764
