In [97]:
# 使用的库
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [98]:
# Load the training and test sets from the Titanic data directory.
train_data, test_data = [
    pd.read_csv(csv_path, encoding='utf-8', iterator=False)
    for csv_path in ('../data/titanic/train.csv', '../data/titanic/test.csv')
]

In [99]:
# Overview of the training data: column dtypes and non-null counts.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [100]:
# 缺失值统计
def get_missing_data_info(data):
    """
    Summarise missing values per column.

    Parameters
    ----------
    data : Series or DataFrame
        Data to inspect. A Series is treated as a single-column frame, so the
        result has one row labelled with the Series name.

    Returns
    -------
    DataFrame
        Indexed by column name, with ``Total`` (number of missing values) and
        ``Percent`` (fraction of rows that are missing, between 0 and 1; NaN
        when ``data`` has no rows), sorted by ``Total`` in descending order.
    """
    if isinstance(data, pd.Series):
        # Series.isnull().sum() is a scalar and cannot be sorted; lift the
        # Series to a one-column frame so both input kinds share one code path.
        data = data.to_frame()
    # Count once and derive the ratio from the already-sorted counts, so both
    # columns are guaranteed to share the same row order.
    total = data.isnull().sum().sort_values(ascending=False)
    percent = total / len(data)
    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    

In [101]:
# Missing-value summary for the training set; show the ten columns with the most gaps.
missing_data = get_missing_data_info(train_data)
missing_data.head(10)


Unnamed: 0,Total,Percent
Cabin,687,0.771044
Age,177,0.198653
Embarked,2,0.002245
Fare,0,0.0
Ticket,0,0.0
Parch,0,0.0
SibSp,0,0.0
Sex,0,0.0
Name,0,0.0
Pclass,0,0.0


In [102]:
# Missing-value strategy
# Cabin    : more than 70% missing, so the column is dropped.
# Name     : could be mined further (e.g. family name as a wealth signal); dropped for now.
# Ticket   : could be mined further (e.g. window-side position); dropped for now.
# Age      : missing values are filled with the mean.
# Embarked : missing values are filled with the most frequent value.

# Drop Cabin, Name and Ticket. Rebinding (instead of inplace=True) together with
# errors='ignore' keeps this cell re-runnable: a second execution no longer
# raises KeyError because the columns are already gone.
delete_columns = ['Cabin','Name','Ticket']
train_data = train_data.drop(columns=delete_columns, errors='ignore')
test_data = test_data.drop(columns=delete_columns, errors='ignore')

# Fill missing values with sklearn's SimpleImputer. Each imputer is fitted on
# the training set only and then applied unchanged to the test set.
# NOTE(review): Fare has no gaps in the training set (see the info() output);
# it is presumably listed here to cover gaps in the test set - confirm.
mean_columns = ['Age','Fare']
impute_mean = SimpleImputer(missing_values=np.nan,strategy="mean")
train_data[mean_columns] = impute_mean.fit_transform(train_data[mean_columns])
test_data[mean_columns] = impute_mean.transform(test_data[mean_columns])

most_frequent_columns = ['Embarked']
impute_most_frequent = SimpleImputer(missing_values=np.nan,strategy='most_frequent')
train_data[most_frequent_columns] = impute_most_frequent.fit_transform(train_data[most_frequent_columns])
test_data[most_frequent_columns] = impute_most_frequent.transform(test_data[most_frequent_columns])

In [103]:
# Standardise the continuous features with StandardScaler.
scaler_columns = ['Age','Fare']

# Learn the scaling parameters from the training set only, then apply the same
# fitted scaler to both splits.
std_scaler = StandardScaler().fit(train_data[scaler_columns])
train_data[scaler_columns] = std_scaler.transform(train_data[scaler_columns])
test_data[scaler_columns] = std_scaler.transform(test_data[scaler_columns])

In [104]:
# TODO: 异常值处理

In [105]:
# One-hot encode the categorical features.
# drop_first=True drops the first level of each feature (it is implied by the
# remaining indicator columns) and reproduces the columns shown in the saved
# preview of train_data: Pclass_2, Pclass_3, Sex_male, Embarked_Q, Embarked_S.
# Without it, a fresh Restart & Run All would also emit Pclass_1, Sex_female
# and Embarked_C, which the saved output does not contain.
# NOTE(review): train and test are encoded independently, so their indicator
# columns only line up if every level occurs in both splits - confirm.
ont_hot_columns = ['Pclass','Sex','Embarked']
train_data = pd.get_dummies(train_data,columns=ont_hot_columns,drop_first=True)
test_data = pd.get_dummies(test_data,columns=ont_hot_columns,drop_first=True)

In [107]:
# Preview the first ten rows of the training data.

train_data.head(10)


Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Fare,Pclass_2,Pclass_3,Sex_male,Embarked_Q,Embarked_S
0,1,0,-0.592481,1,0,-0.502445,0,1,1,0,1
1,2,1,0.638789,1,0,0.786845,0,0,0,0,0
2,3,1,-0.284663,0,0,-0.488854,0,1,0,0,1
3,4,1,0.407926,1,0,0.42073,0,0,0,0,1
4,5,0,0.407926,0,0,-0.486337,0,1,1,0,1
5,6,0,0.0,0,0,-0.478116,0,1,1,1,0
6,7,0,1.870059,0,0,0.395814,0,0,1,0,1
7,8,0,-2.131568,3,1,-0.224083,0,1,1,0,1
8,9,1,-0.207709,0,2,-0.424256,0,1,0,0,1
9,10,1,-1.208115,1,0,-0.042956,1,0,0,0,0
