In [None]:
# 2. 处理泰坦尼克号数据集 titanic.tgz
# 这将提供两个CSV文件，train.csv和test.csv
# 最后目标：目标是训练一个可以根据其他列预测Survived（是否存活）列的分类器

In [100]:
# 读取压缩文件
import tarfile
with tarfile.open("datasets/titanic.tgz") as titanic_tar:
    titanic_tar.extractall(path="datasets")

In [101]:
import pandas as pd
train_data = pd.read_csv("datasets/titanic/train.csv")
test_data = pd.read_csv("datasets/titanic/test.csv")

## 数据集的字段说明
- PassengerId ：  每位乘客的唯一标识符
- Survived    ：  是否幸存（1：幸存，0：未幸存）
- Pclass      ：  乘客的舱位等级（1-一等舱，2-二等舱，3-三等舱）
- Name        ：  姓名
- Sex         ：  性别，male,female
- Age         :   年龄
- SibSp       ：  与乘客一起旅行的兄弟姐妹或配偶的数量
- Parch       ：  与乘客一起旅行的父母或儿童人数
- Ticket      ：  票证号
- Fare        ：  票价
- Cabin       ：  乘客所住的客舱编号
- Embarked    ：  乘客登船的港口（C-瑟堡（法国），Q-皇后镇（爱尔兰），S-南安普顿（英格兰））

In [6]:
# 简单看下数据
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699113,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526507,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.4167,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [7]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [8]:
test_data.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.1667,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


In [9]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [15]:
# 分离特征和标签, 因为数据集中提供了测试集，所以我们可以不用sklearn库来划分训练集和测试集
X_train, y_train = train_data.loc[:, ~train_data.columns.isin(["Survived"])], train_data["Survived"]
X_train.shape, y_train.shape

((891, 11), (891,))

## 使用一些特征，来对是否存活进行预测

In [25]:
# 分类特征
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
cat_attribute = ["Sex", "Cabin", "Embarked"]
cat_pipeline = make_pipeline(
    OneHotEncoder(handle_unknown="ignore")
)

sta_pipeline = make_pipeline(
    SimpleImputer(),
    StandardScaler()
)

In [26]:
# 列转换器
import numpy as np
from sklearn.compose import ColumnTransformer, make_column_selector
preprocessing = ColumnTransformer([
    ("cat", cat_pipeline, cat_attribute),
    ("standard_sca", sta_pipeline, make_column_selector(dtype_include=np.number))
])

### 用逻辑回归跑

In [27]:
# 分类问题，跑逻辑回归
from sklearn.linear_model import LogisticRegression
logistic_reg = make_pipeline(
    preprocessing, LogisticRegression()
)
logistic_reg.fit(X_train, y_train)

In [54]:
# 对测试集做预测 test.csv,并写到一个新csv文件里
test_data_predict = logistic_reg.predict(test_data)
test_data_copy = test_data.copy()
test_data_copy["Survived"] = test_data_predict
test_data_copy.to_csv("datasets/titanic/test_predict.csv", index=False)

### 用决策树跑

In [56]:
from sklearn.tree import DecisionTreeClassifier
decision_tree_clf_model = make_pipeline(
    preprocessing, DecisionTreeClassifier()
)
decision_tree_clf_model.fit(X_train, y_train)

### 用KNN跑

In [57]:
from sklearn.neighbors import KNeighborsClassifier
knn_model = make_pipeline(
    preprocessing, KNeighborsClassifier()
)
knn_model.fit(X_train, y_train)

## 模型的评估

In [61]:
# 使用 test_augmented.csv进行模型评估
from sklearn.metrics import f1_score, accuracy_score
metric_data = pd.read_csv("datasets/titanic/test_augmented.csv")
y_true = metric_data["Survived"]
X_test = metric_data.loc[:, ~metric_data.columns.isin(["Survived"])]

print("==========逻辑回归==============")
accur_score = accuracy_score(y_true, logistic_reg.predict(X_test))
f1_scores = f1_score(y_true, logistic_reg.predict(X_test))
print(f"模型准确率为：{accur_score * 100:.3f} %")
print(f"模型的F1分数为：{f1_scores:.3f}")

print("==========决策树分类==============")
accur_score = accuracy_score(y_true, decision_tree_clf_model.predict(X_test))
f1_scores = f1_score(y_true, decision_tree_clf_model.predict(X_test))
print(f"模型准确率为：{accur_score * 100:.3f} %")
print(f"模型的F1分数为：{f1_scores:.3f}")

print("==========KNN分类==============")
accur_score = accuracy_score(y_true, knn_model.predict(X_test))
f1_scores = f1_score(y_true, knn_model.predict(X_test))
print(f"模型准确率为：{accur_score * 100:.3f} %")
print(f"模型的F1分数为：{f1_scores:.3f}")

模型准确率为：76.555 %
模型的F1分数为：0.692
模型准确率为：75.120 %
模型的F1分数为：0.653
模型准确率为：77.273 %
模型的F1分数为：0.669


# 一些有趣的尝试
- 男性和女性的存活率谁更高
- 成年、未成年和老年的存活率是否不同
- 根据乘客随行人员的类别和数量来看存活率是否不同

In [87]:
# 防止之前的操作对原数据有改动，所以再读取一遍数据
data = pd.read_csv("datasets/titanic/train.csv")
total_passengers = len(data)
print(f"总乘客数为：{total_passengers}")

总乘客数为：891


In [88]:
# 性别和存活率的关系
male_sur_rate = np.sum((data["Sex"] == "male") & (data["Survived"] == 1)) / total_passengers
female_sur_rate = np.sum((data["Sex"] == "female") & (data["Survived"] == 1)) / total_passengers
print(f"男性存活率为：{male_sur_rate * 100:.3f} %")
print(f"女性存活率为：{female_sur_rate * 100:.3f} %")

男性存活率为：12.233 %
女性存活率为：26.150 %


In [90]:
# 年龄和存活率的关系
# 先看一下年龄列的统计信息，再看区间分类
print("======Age列的简单描述=====")
print(data["Age"].describe())

count    714.000000
mean      29.699113
std       14.526507
min        0.416700
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64


In [91]:
# 0 - 18 分为一类  18 - 50 分为一类，50往上分为一类
age_0_to_18 = (data["Age"] > 0) & (data["Age"] <= 18)
age_18_to_50 = (data["Age"] > 18) & (data["Age"] <= 50)
age_gt_50 = (data["Age"] > 50)
age_range1_rate = np.sum(age_0_to_18) / total_passengers
age_range2_rate = np.sum(age_18_to_50) / total_passengers
age_range3_rate = np.sum(age_gt_50) / total_passengers
print(f"年龄为 0 - 18岁的存活率为：{age_range1_rate * 100:.3f} %")
print(f"年龄为 18 - 50岁的存活率为：{age_range2_rate * 100:.3f} %")
print(f"年龄为 50 + 岁的存活率为：{age_range3_rate * 100:.3f} %")

年龄为 0 - 18岁的存活率为：15.600 %
年龄为 18 - 50岁的存活率为：57.351 %
年龄为 50 + 岁的存活率为：7.183 %


## 随行人是 兄弟或配偶

In [94]:
# 根据乘客随行人员是兄弟姐妹或配偶的人数进行分类
# 看一下兄弟姐妹列人数列的描述
print("======SibSp列的简单描述=====")
print(data["SibSp"].describe())
print("======SibSp列的频次统计=====")
print(data["SibSp"].value_counts())

count    891.000000
mean       0.523008
std        1.102743
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max        8.000000
Name: SibSp, dtype: float64
SibSp
0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: count, dtype: int64


In [96]:
# 根据随行人数<=1和大于1分类
sib_person_leq_1 = data["SibSp"] <= 1
sib_person_gt_1 = data["SibSp"] > 1
sib_leq1_rate = np.sum(sib_person_leq_1) / total_passengers
sib_gt1_rate = np.sum(sib_person_gt_1) / total_passengers
print(f"随行人数小于等于1的人的存活率为：{sib_leq1_rate * 100:.3f} %")
print(f"随行人数大于1的人的存活率为：{sib_gt1_rate * 100:.3f} %")

随行人数小于等于1的人的存活率为：91.695 %
随行人数大于1的人的存活率为：8.305 %


## 随行人是 父母或子女

In [97]:
# 看一下父母子女人数列的描述
print("======Parch列的简单描述=====")
print(data["Parch"].describe())
print("======Parch列的频次统计=====")
print(data["Parch"].value_counts())

count    891.000000
mean       0.381594
std        0.806057
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        6.000000
Name: Parch, dtype: float64
Parch
0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: count, dtype: int64


In [99]:
# 根据随行人数<=1和大于1分类
par_person_leq_1 = data["Parch"] <= 1
par_person_gt_1 = data["Parch"] > 1
par_leq1_rate = np.sum(par_person_leq_1) / total_passengers
par_gt1_rate = np.sum(par_person_gt_1) / total_passengers
print(f"随行人数小于等于1的人的存活率为：{par_leq1_rate * 100:.3f} %")
print(f"随行人数大于1的人的存活率为：{par_gt1_rate * 100:.3f} %")

随行人数小于等于1的人的存活率为：89.338 %
随行人数大于1的人的存活率为：10.662 %
