In [85]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
pd.set_option('display.max_columns', 60)

# 理解数据
这是一个分类任务，特征包含离散特征和连续特征，数据如下：[Kaggle地址](https://www.kaggle.com/c/titanic/data)。目标是根据数据特征预测一个人是否能在泰坦尼克的沉没事故中存活下来。接下来解释下数据的格式：

```
survival        目标列，是否存活，1代表存活 (0 = No; 1 = Yes)  
pclass          乘坐的舱位级别 (1 = 1st; 2 = 2nd; 3 = 3rd)  
name            姓名 
sex             性别  
age             年龄  
sibsp           兄弟姐妹的数量（乘客中）  
parch           父母的数量（乘客中）  
ticket          票号  
fare            票价  
cabin           客舱  
embarked        登船的港口  
                (C = Cherbourg; Q = Queenstown; S = Southampton)
```

# 导入数据


In [86]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
IDtest = test['PassengerId']
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [87]:
train.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [88]:
train_len = len(train)
# reset_index重设置索引
dataset = pd.concat(objs=[train, test], axis=0).reset_index(drop=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


In [89]:
# 查看数据
dataset.tail()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
1304,,,S,8.05,"Spector, Mr. Woolf",0,1305,3,male,0,,A.5. 3236
1305,39.0,C105,C,108.9,"Oliva y Ocana, Dona. Fermina",0,1306,1,female,0,,PC 17758
1306,38.5,,S,7.25,"Saether, Mr. Simon Sivertsen",0,1307,3,male,0,,SOTON/O.Q. 3101262
1307,,,S,8.05,"Ware, Mr. Frederick",0,1308,3,male,0,,359309
1308,,,C,22.3583,"Peter, Master. Michael J",1,1309,3,male,1,,2668


In [90]:
dataset.dtypes

Age            float64
Cabin           object
Embarked        object
Fare           float64
Name            object
Parch            int64
PassengerId      int64
Pclass           int64
Sex             object
SibSp            int64
Survived       float64
Ticket          object
dtype: object

In [91]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
Age            1046 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
Fare           1308 non-null float64
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       891 non-null float64
Ticket         1309 non-null object
dtypes: float64(3), int64(4), object(5)
memory usage: 122.8+ KB


In [92]:
dataset.isnull().sum()

Age             263
Cabin          1014
Embarked          2
Fare              1
Name              0
Parch             0
PassengerId       0
Pclass            0
Sex               0
SibSp             0
Survived        418
Ticket            0
dtype: int64

# 特征分析

In [93]:
#Pclass
train.groupby('Pclass')['Survived'].count()

Pclass
1    216
2    184
3    491
Name: Survived, dtype: int64

In [94]:
train.groupby('Pclass')['Survived'].sum()/train.groupby('Pclass')['Survived'].count()

Pclass
1    0.629630
2    0.472826
3    0.242363
Name: Survived, dtype: float64

In [95]:
# fit是对整个数据集做fit
enc_pclass = preprocessing.OneHotEncoder()
enc_pclass.fit(dataset[['Pclass']])

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


OneHotEncoder(categorical_features=None, categories=None,
       dtype=<class 'numpy.float64'>, handle_unknown='error',
       n_values=None, sparse=True)

In [96]:
pclass_feature =  pd.DataFrame(enc_pclass.transform(dataset[['Pclass']]).toarray(), columns={'Pclass'+str(i) for i in range(len(dataset['Pclass'].unique()))}, dtype=int)

In [97]:
pclass_feature.head()

Unnamed: 0,Pclass2,Pclass0,Pclass1
0,0,0,1
1,1,0,0
2,0,0,1
3,1,0,0
4,0,0,1


In [98]:
# Name 挖掘出性别、年龄、家族等
# 对少的数据可视为同一类others 
dataset[['Name']].head()

Unnamed: 0,Name
0,"Braund, Mr. Owen Harris"
1,"Cumings, Mrs. John Bradley (Florence Briggs Th..."
2,"Heikkinen, Miss. Laina"
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)"
4,"Allen, Mr. William Henry"


In [99]:
dataset_title = [i.split(",")[1].split(".")[0].strip() for i in dataset["Name"]] #提取姓名
dataset["Title"] = pd.Series(dataset_title)
dataset["Title"].head()

0      Mr
1     Mrs
2    Miss
3     Mrs
4      Mr
Name: Title, dtype: object

In [100]:
dataset.groupby(["Title"])['Name'].count() # 查看数据分布

Title
Capt              1
Col               4
Don               1
Dona              1
Dr                8
Jonkheer          1
Lady              1
Major             2
Master           61
Miss            260
Mlle              2
Mme               1
Mr              757
Mrs             197
Ms                2
Rev               8
Sir               1
the Countess      1
Name: Name, dtype: int64

In [101]:
dataset["Title"] = dataset["Title"].replace(
    ['Lady', 'the Countess','Countess','Capt', 
     'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
dataset["Title"] = dataset["Title"].map({"Master":0, "Miss":1, "Ms" : 1 , "Mme":1, "Mlle":1, "Mrs":1, "Mr":2, "Rare":3})
dataset["Title"] = dataset["Title"].astype(int)
# dataset['Title'].unique()
dataset.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Title
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171,2
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599,1
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282,1
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803,1
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450,2


In [41]:
enc_name = preprocessing.OneHotEncoder() # 引入onehot编码类
enc_name.fit(dataset[['Title']])
Name_feature = pd.DataFrame(enc_name.transform(dataset[['Title']]).toarray(),
                              columns=['Title'+str(i) for i in range(len(dataset['Title'].unique()))],dtype=int)
Name_feature.head()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Unnamed: 0,Title0,Title1,Title2,Title3
0,0,0,1,0
1,0,1,0,0
2,0,1,0,0
3,0,1,0,0
4,0,0,1,0
