In [1]:
import pandas as pd

In [54]:
# Read the Titanic training and test sets from CSV files under data/titanic/.
df_train: pd.DataFrame = pd.read_csv('data/titanic/train.csv')
df_test: pd.DataFrame = pd.read_csv('data/titanic/test.csv')
# Disabled: combine both sets into a single frame.
# df_all: pd.DataFrame = pd.concat([df_train, df_test], ignore_index=True, sort=True)

In [55]:
# Report the (rows, columns) shape of each loaded frame.
for label, frame in (("Train Size:", df_train), ("Test Size:", df_test)):
    print(label, frame.shape)

In [56]:
# Preview the first rows of the training set.
df_train.head()

In [57]:
# Summary statistics of the training set.
df_train.describe()

In [58]:
# Column dtypes and non-null counts; used to spot columns with missing values.
df_train.info()

In [59]:
# Impute missing Age values with the column mean.
mean_age = df_train['Age'].mean()
df_train['Age'] = df_train['Age'].fillna(mean_age)

In [60]:
# Handle missing Cabin values: a cabin number cannot be meaningfully imputed,
# so missing entries are flagged with the sentinel string 'Unknown'.
#
# The filled column is assigned back rather than calling fillna(inplace=True)
# on the df_train['Cabin'] selection. That chained in-place call acts on an
# intermediate object: pandas 2.x emits a FutureWarning for it, and under
# Copy-on-Write the fill never reaches df_train.
df_train['Cabin'] = df_train['Cabin'].fillna('Unknown')

In [61]:
# Handle missing values in Embarked (port of embarkation).
embarked_counts = df_train['Embarked'].value_counts()
print(embarked_counts)
# Only 2 entries are missing, so fill them with the mode, 'S'.
df_train['Embarked'] = df_train['Embarked'].fillna(value='S')

In [62]:
# Preview the training set again after the missing-value fills.
df_train.head()

In [63]:
# Re-check the non-null counts after the missing-value fills.
df_train.info()

In [73]:
# Discard the columns that are not used as features: Name, PassengerId, Ticket, Cabin.
data = df_train.drop(columns=['Name', 'PassengerId', 'Ticket', 'Cabin'])

# One-hot encode the two categorical columns (Sex and Embarked) with get_dummies.
data_dummy = pd.get_dummies(data[['Sex', 'Embarked']])
print(data_dummy.head())

# Append the encoded columns to the remaining columns, then remove the original
# (pre-encoding) Sex and Embarked columns.
data = pd.concat([data, data_dummy], axis=1).drop(columns=['Sex', 'Embarked'])
print(data.head())

In [74]:
from sklearn.model_selection import train_test_split

# Separate the label column (Survived) from the feature columns.
y = data['Survived']
x = data.drop(columns='Survived')

# Split into training and test sets: test_size=0.3 holds out 30% of the rows,
# and the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0
)

In [76]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier (liblinear solver) on the training split.
model = LogisticRegression(solver='liblinear')
model.fit(X=x_train, y=y_train)

In [78]:
# Print the model's accuracy on the training split, then on the test split.
for caption, features, target in (
    ("训练集acc准确率:", x_train, y_train),
    ("测试集acc准确率:", x_test, y_test),
):
    print(caption, model.score(features, target))

In [80]:
from sklearn.metrics import accuracy_score

# model.score(x_test, y_test) is equivalent to the two steps below:
# predict labels for the test features, then score them against the true labels.
y_predict = model.predict(x_test)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_predict)
print("测试集acc准确率:", test_accuracy)

In [81]:
from sklearn.metrics import classification_report

# Show the model's performance metrics on the test split.
report = classification_report(y_true=y_test, y_pred=y_predict)
print(report)