# 数据载入及初步观察

## 载入数据

数据集下载 https://www.kaggle.com/c/titanic/overview  

### 任务一：导入numpy和pandas

In [2]:
import numpy as np
import pandas as pd

### 任务二：载入数据
(1) 使用相对路径载入数据  
(2) 使用绝对路径载入数据

In [3]:
df = pd.read_csv('titanic/train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
df = pd.read_csv('C:\\Users\\water\\pynb\\data-analysis\\titanic\\train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [9]:
df = pd.read_table('titanic/train.csv', sep=',')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### 任务三：每1000行为一个数据模块，逐块读取

In [10]:
# 减少内存占用
chunks = pd.read_csv('titanic/train.csv', chunksize=1000)

### 任务四：将表头改成中文，索引改为乘客ID
```
PassengerId => 乘客ID
Survived => 是否幸存
Pclass => 乘客等级(1/2/3等舱位)
Name => 乘客姓名
Sex => 性别
Age => 年龄
SibSp => 堂兄弟/妹个数
Parch => 父母与小孩个数
Ticket => 船票信息
Fare => 票价
Cabin => 客舱
Embarked => 登船港口
```

In [14]:
df = pd.read_csv(
    'titanic/train.csv',
    names=['乘客ID', '是否幸存', '乘客等级(1/2/3等舱位)', '乘客姓名', '性别', '年龄', '兄弟姐妹个数', '父母子女个数', '船票信息', '票价', '客舱', '登船港口'],
    index_col='乘客ID',
    header=0
)
df.head()

Unnamed: 0_level_0,是否幸存,乘客等级(1/2/3等舱位),乘客姓名,性别,年龄,兄弟姐妹个数,父母子女个数,船票信息,票价,客舱,登船港口
乘客ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## 初步观察

导入数据后，你可能要对数据的整体结构和样例进行概览，比如说，数据大小、有多少列，各列都是什么格式的，是否包含null等

### 任务一：查看数据的基本信息

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   是否幸存            891 non-null    int64  
 1   乘客等级(1/2/3等舱位)  891 non-null    int64  
 2   乘客姓名            891 non-null    object 
 3   性别              891 non-null    object 
 4   年龄              714 non-null    float64
 5   兄弟姐妹个数          891 non-null    int64  
 6   父母子女个数          891 non-null    int64  
 7   船票信息            891 non-null    object 
 8   票价              891 non-null    float64
 9   客舱              204 non-null    object 
 10  登船港口            889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


### 任务二：观察表格前10行的数据和后15行的数据

In [16]:
df.head(10)

Unnamed: 0_level_0,是否幸存,乘客等级(1/2/3等舱位),乘客姓名,性别,年龄,兄弟姐妹个数,父母子女个数,船票信息,票价,客舱,登船港口
乘客ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [17]:
df.tail(15)

Unnamed: 0_level_0,是否幸存,乘客等级(1/2/3等舱位),乘客姓名,性别,年龄,兄弟姐妹个数,父母子女个数,船票信息,票价,客舱,登船港口
乘客ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
877,0,3,"Gustafsson, Mr. Alfred Ossian",male,20.0,0,0,7534,9.8458,,S
878,0,3,"Petroff, Mr. Nedelio",male,19.0,0,0,349212,7.8958,,S
879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S
880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,0,1,11767,83.1583,C50,C
881,1,2,"Shelley, Mrs. William (Imanita Parrish Hall)",female,25.0,0,1,230433,26.0,,S
882,0,3,"Markun, Mr. Johann",male,33.0,0,0,349257,7.8958,,S
883,0,3,"Dahlberg, Miss. Gerda Ulrika",female,22.0,0,0,7552,10.5167,,S
884,0,2,"Banfield, Mr. Frederick James",male,28.0,0,0,C.A./SOTON 34068,10.5,,S
885,0,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.05,,S
886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.125,,Q


### 任务三：判断数据是否为空，为空的地方返回True，其余地方返回False

In [19]:
df.isnull().head()

Unnamed: 0_level_0,是否幸存,乘客等级(1/2/3等舱位),乘客姓名,性别,年龄,兄弟姐妹个数,父母子女个数,船票信息,票价,客舱,登船港口
乘客ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,False,False,False,False,False,False,False,False,False,True,False
2,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,True,False
4,False,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,True,False


## 保存数据

### 任务一：将你加载并做出改变的数据，在工作目录下保存为一个新文件train_chinese.csv

In [20]:
df.to_csv('titanic/train_chinese.csv')

# pandas基础

## 知道你的数据叫什么
我们学习pandas的基础操作，那么上一节通过pandas加载之后的数据，其数据类型是什么呢？

### 任务一：pandas中有两个数据类型DataFrame和Series，通过查找简单了解他们。然后自己写一个关于这两个数据类型的小例子[开放题]

In [25]:
dfd = [[1, 'One'], [2, 'Two'], [3, 'Three']]
df = pd.DataFrame(dfd, columns=['number', 'english'])
df.head()

Unnamed: 0,number,english
0,1,One
1,2,Two
2,3,Three


In [26]:
sd = {'One': 1, 'Two': 2, 'Three': 3}
s = pd.Series(sd)
s.head()

One      1
Two      2
Three    3
dtype: int64

### 任务二：根据上节课的方法载入"train.csv"文件

In [55]:
df = pd.read_csv('titanic/train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### 任务三：查看DataFrame数据的每列的项

In [31]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

### 任务四：查看"cabin"这列的所有项 [有多种方法]

In [34]:
df.Cabin.head()

0     NaN
1     C85
2     NaN
3    C123
4     NaN
Name: Cabin, dtype: object

In [35]:
df['Cabin'].head()

0     NaN
1     C85
2     NaN
3    C123
4     NaN
Name: Cabin, dtype: object

### 任务五：加载文件"test_1.csv"，然后对比"train.csv"，看看有哪些多出的列，然后将多出的列删除

In [51]:
dft = pd.read_csv('titanic/test_1.csv')
dft.head()

Unnamed: 0.1,Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,a
0,0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,100
1,1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,100
2,2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,100
3,3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,100
4,4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,100


In [42]:
del dft['a']
dft.head()

Unnamed: 0.1,Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### 任务六： 将['PassengerId','Name','Age','Ticket']这几个列元素隐藏，只观察其他几个列元素

In [53]:
df.drop(['PassengerId', 'Name', 'Age', 'Ticket'], axis='columns').head()

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,1,0,7.25,,S
1,1,1,female,1,0,71.2833,C85,C
2,1,3,female,0,0,7.925,,S
3,1,1,female,1,0,53.1,C123,S
4,0,3,male,0,0,8.05,,S


In [54]:
df.drop(['PassengerId', 'Name', 'Age', 'Ticket'], axis='columns', inplace=True)
df.head()

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,1,0,7.25,,S
1,1,1,female,1,0,71.2833,C85,C
2,1,3,female,0,0,7.925,,S
3,1,1,female,1,0,53.1,C123,S
4,0,3,male,0,0,8.05,,S


## 筛选的逻辑

表格数据中，最重要的一个功能就是要具有可筛选的能力，选出我所需要的信息，丢弃无用的信息。  
下面我们还是用实战来学习pandas这个功能。

### 任务一： 我们以"Age"为筛选条件，显示年龄在10岁以下的乘客信息。

In [56]:
df[df['Age'] < 10].head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7,G6,S
16,17,0,3,"Rice, Master. Eugene",male,2.0,4,1,382652,29.125,,Q
24,25,0,3,"Palsson, Miss. Torborg Danira",female,8.0,3,1,349909,21.075,,S
43,44,1,2,"Laroche, Miss. Simonne Marie Anne Andree",female,3.0,1,2,SC/Paris 2123,41.5792,,C


### 任务二： 以"Age"为条件，将年龄在10岁以上和50岁以下的乘客信息显示出来，并将这个数据命名为midage

In [59]:
midage = df[(df['Age'] > 10) & (df['Age'] < 50)]
midage.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### 任务三：将midage的数据中第100行的"Pclass"和"Sex"的数据显示出来

In [64]:
midage.reset_index(drop=True)
midage.loc[[100], ['Pclass', 'Sex']]

Unnamed: 0,Pclass,Sex
100,3,female


### 任务四：使用loc方法将midage的数据中第100，105，108行的"Pclass"，"Name"和"Sex"的数据显示出来

In [68]:
midage.loc[[100, 105, 108], ['Pclass', 'Name', 'Sex']]

Unnamed: 0,Pclass,Name,Sex
100,3,"Petranec, Miss. Matilda",female
105,3,"Mionoff, Mr. Stoytcho",male
108,3,"Rekic, Mr. Tido",male


### 任务五：使用iloc方法将midage的数据中第100，105，108行的"Pclass"，"Name"和"Sex"的数据显示出来

In [73]:
midage.iloc[[100, 105, 108], [2, 3, 4]]

Unnamed: 0,Pclass,Name,Sex
149,2,"Byles, Rev. Thomas Roussel Davids",male
160,3,"Cribb, Mr. John Hatfield",male
163,3,"Calic, Mr. Jovo",male


# 探索性数据分析

开始之前，导入numpy、pandas包和数据

In [75]:
# ignore: 最前面已导入
txt = pd.read_csv('titanic/train_chinese.csv')
txt.head()

Unnamed: 0,乘客ID,是否幸存,乘客等级(1/2/3等舱位),乘客姓名,性别,年龄,兄弟姐妹个数,父母子女个数,船票信息,票价,客舱,登船港口
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## 了解你的数据吗？

教材《Python for Data Analysis》第五章

### 任务一：利用Pandas对示例数据进行排序，要求升序

In [80]:
txt.sort_values(by=['票价', '乘客ID'])

Unnamed: 0,乘客ID,是否幸存,乘客等级(1/2/3等舱位),乘客姓名,性别,年龄,兄弟姐妹个数,父母子女个数,船票信息,票价,客舱,登船港口
179,180,0,3,"Leonard, Mr. Lionel",male,36.0,0,0,LINE,0.0000,,S
263,264,0,1,"Harrison, Mr. William",male,40.0,0,0,112059,0.0000,B94,S
271,272,1,3,"Tornquist, Mr. William Henry",male,25.0,0,0,LINE,0.0000,,S
277,278,0,2,"Parkes, Mr. Francis ""Frank""",male,,0,0,239853,0.0000,,S
302,303,0,3,"Johnson, Mr. William Cahoone Jr",male,19.0,0,0,LINE,0.0000,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
341,342,1,1,"Fortune, Miss. Alice Elizabeth",female,24.0,3,2,19950,263.0000,C23 C25 C27,S
438,439,0,1,"Fortune, Mr. Mark",male,64.0,1,4,19950,263.0000,C23 C25 C27,S
258,259,1,1,"Ward, Miss. Anna",female,35.0,0,0,PC 17755,512.3292,,C
679,680,1,1,"Cardeza, Mr. Thomas Drake Martinez",male,36.0,0,1,PC 17755,512.3292,B51 B53 B55,C


### 任务二：对泰坦尼克号数据（trian.csv）按票价和年龄两列进行综合排序（降序排列），从数据中你能发现什么

In [86]:
txt.sort_values(by=['票价', '年龄'], ascending=False)
# 票价高存活率高

Unnamed: 0,乘客ID,是否幸存,乘客等级(1/2/3等舱位),乘客姓名,性别,年龄,兄弟姐妹个数,父母子女个数,船票信息,票价,客舱,登船港口
679,680,1,1,"Cardeza, Mr. Thomas Drake Martinez",male,36.0,0,1,PC 17755,512.3292,B51 B53 B55,C
258,259,1,1,"Ward, Miss. Anna",female,35.0,0,0,PC 17755,512.3292,,C
737,738,1,1,"Lesurer, Mr. Gustave J",male,35.0,0,0,PC 17755,512.3292,B101,C
438,439,0,1,"Fortune, Mr. Mark",male,64.0,1,4,19950,263.0000,C23 C25 C27,S
341,342,1,1,"Fortune, Miss. Alice Elizabeth",female,24.0,3,2,19950,263.0000,C23 C25 C27,S
...,...,...,...,...,...,...,...,...,...,...,...,...
481,482,0,2,"Frost, Mr. Anthony Wood ""Archie""",male,,0,0,239854,0.0000,,S
633,634,0,1,"Parr, Mr. William Henry Marsh",male,,0,0,112052,0.0000,,S
674,675,0,2,"Watson, Mr. Ennis Hastings",male,,0,0,239856,0.0000,,S
732,733,0,2,"Knight, Mr. Robert J",male,,0,0,239855,0.0000,,S


### 任务三：利用Pandas进行算术计算，计算两个DataFrame数据相加结果

In [89]:
txt + txt

Unnamed: 0,乘客ID,是否幸存,乘客等级(1/2/3等舱位),乘客姓名,性别,年龄,兄弟姐妹个数,父母子女个数,船票信息,票价,客舱,登船港口
0,2,0,6,"Braund, Mr. Owen HarrisBraund, Mr. Owen Harris",malemale,44.0,2,0,A/5 21171A/5 21171,14.5000,,SS
1,4,2,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",femalefemale,76.0,2,0,PC 17599PC 17599,142.5666,C85C85,CC
2,6,2,6,"Heikkinen, Miss. LainaHeikkinen, Miss. Laina",femalefemale,52.0,0,0,STON/O2. 3101282STON/O2. 3101282,15.8500,,SS
3,8,2,2,"Futrelle, Mrs. Jacques Heath (Lily May Peel)Fu...",femalefemale,70.0,2,0,113803113803,106.2000,C123C123,SS
4,10,0,6,"Allen, Mr. William HenryAllen, Mr. William Henry",malemale,70.0,0,0,373450373450,16.1000,,SS
...,...,...,...,...,...,...,...,...,...,...,...,...
886,1774,0,4,"Montvila, Rev. JuozasMontvila, Rev. Juozas",malemale,54.0,0,0,211536211536,26.0000,,SS
887,1776,2,2,"Graham, Miss. Margaret EdithGraham, Miss. Marg...",femalefemale,38.0,0,0,112053112053,60.0000,B42B42,SS
888,1778,0,6,"Johnston, Miss. Catherine Helen ""Carrie""Johnst...",femalefemale,,2,4,W./C. 6607W./C. 6607,46.9000,,SS
889,1780,2,2,"Behr, Mr. Karl HowellBehr, Mr. Karl Howell",malemale,52.0,0,0,111369111369,60.0000,C148C148,CC


### 任务四：通过泰坦尼克号数据如何计算出在船上最大的家族有多少人？

In [92]:
max(txt['兄弟姐妹个数'] + txt['父母子女个数']) + 1
# 乘客本人

11

### 任务五：学会使用Pandas describe()函数查看数据基本统计信息

In [93]:
txt.describe()

Unnamed: 0,乘客ID,是否幸存,乘客等级(1/2/3等舱位),年龄,兄弟姐妹个数,父母子女个数,票价
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


### 任务六：分别看看泰坦尼克号数据集中 票价、父母子女 这列数据的基本统计数据，你能发现什么？

In [95]:
txt['票价'].describe()
# 总票数是891张，75%的票价低于31，平均票价是32多，最高票价是512多，说明票价悬殊，且高价票是极少数

count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: 票价, dtype: float64

In [97]:
txt['父母子女个数'].describe()
# 父母子女最多的有6个，但平均数只有0.38，至少75%的乘客没有带父母子女，说明带父母子女的是极少数

count    891.000000
mean       0.381594
std        0.806057
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        6.000000
Name: 父母子女个数, dtype: float64