In [12]:
import numpy as np
import pandas as pd

In [13]:
# Read the Titanic passenger table (path is relative to the notebook) and
# preview the first five rows.
df = pd.read_csv(filepath_or_buffer="../data/titanic_dataset.csv")
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [14]:
# Number of non-missing Age entries.
df.loc[:, "Age"].count()

332

In [15]:
# Summary statistics of Age (count, mean, std, min, quartiles, max).
df.loc[:, "Age"].describe()

count    332.000000
mean      30.272590
std       14.181209
min        0.170000
25%       21.000000
50%       27.000000
75%       39.000000
max       76.000000
Name: Age, dtype: float64

In [16]:
# Frequency of each distinct Age value, most common first.
df.loc[:, "Age"].value_counts()

21.0    17
24.0    17
22.0    16
30.0    15
18.0    13
        ..
76.0     1
28.5     1
22.5     1
62.0     1
38.5     1
Name: Age, Length: 79, dtype: int64

In [17]:
# How many passengers travelled in each class.
df.loc[:, "Pclass"].value_counts()

3    218
1    107
2     93
Name: Pclass, dtype: int64

In [18]:
# The distinct class labels, ordered by how often they occur.
df["Pclass"].value_counts().index.to_list()

[3, 1, 2]

根据 Pclass 的值分组, 并用 aggfunc 参数指定的聚合函数汇总每组的 Age

pivot_table()相当于简化版的分组聚合函数

In [19]:
# Oldest recorded Age within each passenger class.
# The aggregation is given by name ("max") instead of as np.max: pandas 2.1+
# emits a FutureWarning for NumPy callables here and recommends the string
# form to keep the current behaviour. The resulting table is the same.
df_pivot_table = df.pivot_table(values="Age",
                                index="Pclass",
                                aggfunc="max")
df_pivot_table

Unnamed: 0_level_0,Age
Pclass,Unnamed: 1_level_1
1,76.0
2,63.0
3,60.5


In [20]:
# Mean and sum of Age and Survived for each passenger class.
# Aggregations are named by string rather than passed as np.mean / np.sum:
# pandas 2.1+ emits a FutureWarning for NumPy callables here and recommends
# the string form. The column labels ("mean", "sum") are unchanged.
df_pivot_table = df.pivot_table(values=["Age", "Survived"],
                                index="Pclass",
                                aggfunc=["mean", "sum"])
df_pivot_table

Unnamed: 0_level_0,mean,mean,sum,sum
Unnamed: 0_level_1,Age,Survived,Age,Survived
Pclass,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,40.918367,0.46729,4010.0,50
2,28.7775,0.322581,2532.42,30
3,24.027945,0.330275,3508.08,72


# 分组聚合

In [21]:
# Per-class summary: mean / max / min of Age and the sum of the Survived flag.
# Aggregations are named by string rather than passed as NumPy callables:
# pandas 2.1+ emits a FutureWarning for np.mean / np.max / np.min / np.sum in
# agg() and recommends the string form. With strings the Age columns are
# always labelled "max" / "min" (older NumPy callables produced "amax" /
# "amin"); the computed values are the same.
df[["Age", "Survived", "Pclass"]].groupby(["Pclass"]).agg(
    {
        'Age': ["mean", "max", "min"],
        'Survived': ["sum"]
    })

Unnamed: 0_level_0,Age,Age,Age,Survived
Unnamed: 0_level_1,mean,amax,amin,sum
Pclass,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,40.918367,76.0,6.0,50
2,28.7775,63.0,0.92,30
3,24.027945,60.5,0.17,72


# 自定义函数

In [22]:
# Module-level call counter read and advanced by getDiffRowValue.
i = 0


def getDiffRowValue(series):
    """Return ``series[i]`` for the current global counter ``i``, then bump ``i``.

    Each call therefore looks one position further than the previous call.
    The counter is only advanced after the lookup succeeds.
    """
    global i
    current, picked = i, series[i]
    i = current + 1
    return picked


# Column-wise apply: getDiffRowValue is called once per column, so each
# successive column contributes the value one row further down. to_frame()
# turns the resulting Series into a one-column DataFrame for display.
df.apply(getDiffRowValue, axis=0).to_frame()

Unnamed: 0,0
PassengerId,892
Survived,1
Pclass,2
Name,"Wirz, Mr. Albert"
Sex,female
Age,14.0
SibSp,0
Parch,1
Ticket,2657
Fare,24.15


**案例1: 离散化处理**

In [32]:
def agege18(age):
    """Discretize an age into an adult flag.

    Parameters
    ----------
    age : scalar
        Age in years; may be missing (NaN / None / pd.NA).

    Returns
    -------
    1 if ``age >= 18``, 0 if ``age < 18``, and NaN when the age is missing.
    """
    # A missing age compares False against 18, so without this guard every
    # passenger with an unknown age would be labelled 0 (under 18). Keep the
    # value missing instead; a column built from this holds NaN in those rows.
    if pd.isna(age):
        return np.nan
    return 1 if age >= 18 else 0


# Run agege18 on every Age value, store the result as a new 'agege18' column
# on df, then preview the first five rows.
df['agege18'] = df['Age'].apply(func=agege18)
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,agege18
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,1
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,1
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,1
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,1
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1


In [27]:
def formatConversion(x):
    """Translate a numeric passenger class into its text label.

    1 -> 'Upper', 2 -> 'Midder', 3 -> 'Loser'. Any other value yields None.
    """
    # Codes are checked in the same order, with the same == comparison, as a
    # plain if-chain would use.
    for code, label in ((1, 'Upper'), (2, 'Midder'), (3, 'Loser')):
        if x == code:
            return label
    return None


# Call apply on the Series df['Pclass'] so formatConversion receives one
# scalar per row. On the one-column DataFrame df[['Pclass']], apply would
# instead pass the whole column to the function as a single Series.
df['Pclass'] = df['Pclass'].apply(formatConversion)
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,Loser,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,Loser,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,0,Midder,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,Loser,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,Loser,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,0,Loser,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,Upper,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,0,Loser,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,0,Loser,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


# 数据映射

In [30]:
# Dictionary-based mapping from the text class labels back to integer codes.
# Values that are not keys of the mapping come out as NaN.
label_to_class = {'Upper': 1, 'Midder': 2, 'Loser': 3}
df['Pclass'] = df['Pclass'].map(label_to_class)
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,0,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,0,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,0,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S
