## 1.1如何导入数据

In [2]:
# Import pandas into the notebook under its conventional alias `pd`.
import pandas as pd

In [3]:
# Load the tips CSV dataset from the local data directory,
# then display the loaded table as the cell output.
tips = pd.read_csv('./data/tips.csv')
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


## 1.2查看导入的数据类型

In [4]:
# Check the type of the object returned by read_csv (a DataFrame).
type(tips)

pandas.core.frame.DataFrame

In [5]:
# The columns of a TSV file are separated by tab characters, so pass sep='\t';
# when sep is omitted, read_csv splits on commas (',').
china = pd.read_csv('./data/china.tsv',sep='\t')
china

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,China,Asia,1952,44.0,556263527,400.448611
1,China,Asia,1957,50.54896,637408000,575.987001
2,China,Asia,1962,44.50136,665770000,487.674018
3,China,Asia,1967,58.38112,754550000,612.705693
4,China,Asia,1972,63.11888,862030000,676.900092
5,China,Asia,1977,63.96736,943455000,741.23747
6,China,Asia,1982,65.525,1000281000,962.421381
7,China,Asia,1987,67.274,1084035000,1378.904018
8,China,Asia,1992,68.69,1164970000,1655.784158
9,China,Asia,1997,70.426,1230075000,2289.234136


In [6]:
# The TSV file is loaded as a DataFrame as well.
type(china)

pandas.core.frame.DataFrame

## 2 DataFrame的行列标签和行列位置编号

In [7]:
# Get the row labels (the index) of the DataFrame.
china.index

RangeIndex(start=0, stop=12, step=1)

In [8]:
# Get the column labels of the DataFrame.
china.columns

Index(['country', 'continent', 'year', 'lifeExp', 'pop', 'gdpPercap'], dtype='object')

In [9]:
# set_index does not modify the original DataFrame; it returns a new
# DataFrame whose row labels are the values of the 'year' column.
china_df = china.set_index('year')
china_df

Unnamed: 0_level_0,country,continent,lifeExp,pop,gdpPercap
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1952,China,Asia,44.0,556263527,400.448611
1957,China,Asia,50.54896,637408000,575.987001
1962,China,Asia,44.50136,665770000,487.674018
1967,China,Asia,58.38112,754550000,612.705693
1972,China,Asia,63.11888,862030000,676.900092
1977,China,Asia,63.96736,943455000,741.23747
1982,China,Asia,65.525,1000281000,962.421381
1987,China,Asia,67.274,1084035000,1378.904018
1992,China,Asia,68.69,1164970000,1655.784158
1997,China,Asia,70.426,1230075000,2289.234136


In [10]:
# Inspect the row labels: the index is now named 'year' and holds the year values.
china_df.index

Int64Index([1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, 2002,
            2007],
           dtype='int64', name='year')

## 3 DataFrame行位置编号和列位置编号

### 3.1 loc获取指定行列的数据（标签指定）

In [11]:
# Select the rows labelled 1952, 1962 and 1972 and the columns
# country, pop and gdpPercap, using loc[[row labels], [column labels]].
china_df.loc[[1952,1962,1972],['country','pop','gdpPercap']]

Unnamed: 0_level_0,country,pop,gdpPercap
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1952,China,556263527,400.448611
1962,China,665770000,487.674018
1972,China,862030000,676.900092


In [12]:
# Select all columns of the rows labelled 1952, 1962 and 1972
# (the column selector is omitted).
china_df.loc[[1952,1962,1972]]

Unnamed: 0_level_0,country,continent,lifeExp,pop,gdpPercap
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1952,China,Asia,44.0,556263527,400.448611
1962,China,Asia,44.50136,665770000,487.674018
1972,China,Asia,63.11888,862030000,676.900092


In [13]:
# Select the country, pop and gdpPercap columns for all rows
# (':' as the row selector means every row).
china_df.loc[:,['country','pop','gdpPercap']]

Unnamed: 0_level_0,country,pop,gdpPercap
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1952,China,556263527,400.448611
1957,China,637408000,575.987001
1962,China,665770000,487.674018
1967,China,754550000,612.705693
1972,China,862030000,676.900092
1977,China,943455000,741.23747
1982,China,1000281000,962.421381
1987,China,1084035000,1378.904018
1992,China,1164970000,1655.784158
1997,China,1230075000,2289.234136


In [14]:
# Select all columns of the row labelled 1957, passing the label as a bare scalar.
china_df1 = china_df.loc[1957]
china_df1

country           China
continent          Asia
lifeExp        50.54896
pop           637408000
gdpPercap    575.987001
Name: 1957, dtype: object

In [15]:
# Because a single scalar row label was used, the result is a Series, not a DataFrame.
type(china_df1)

pandas.core.series.Series

In [16]:
# Select all columns of the row labelled 1957, this time wrapping the label in a list.
china_df2 = china_df.loc[[1957]]
china_df2

Unnamed: 0_level_0,country,continent,lifeExp,pop,gdpPercap
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1957,China,Asia,50.54896,637408000,575.987001


In [17]:
# With a list of row labels the result is a DataFrame,
# whether the list holds one label or several.
type(china_df2)

pandas.core.frame.DataFrame

In [18]:
# Select the lifeExp column of the row labelled 1957
# (list of row labels, scalar column label).
a = china_df.loc[[1957],'lifeExp']
a

year
1957    50.54896
Name: lifeExp, dtype: float64

In [19]:
# A list row selector with a scalar column label yields a Series indexed by year.
type(a)

pandas.core.series.Series

### 3.2 示例5的另外三种实现方式

In [20]:
# Select the lifeExp column of the row labelled 1957
# (scalar row label, list of column labels).
b = china_df.loc[1957,['lifeExp']]
b

lifeExp    50.54896
Name: 1957, dtype: object

In [21]:
# A scalar row label with a list of column labels also yields a Series,
# this time indexed by column label.
type(b)

pandas.core.series.Series

In [22]:
# Scalar row label and scalar column label: returns the single cell value.
c = china_df.loc[1957,'lifeExp']
c

50.54896

In [23]:
# The single cell value is a NumPy scalar rather than a pandas container.
type(c)

numpy.float64

In [24]:
# Lists for both the row and the column selector: the result stays a table.
d = china_df.loc[[1957],['lifeExp']]
d

Unnamed: 0_level_0,lifeExp
year,Unnamed: 1_level_1
1957,50.54896


In [25]:
# With list selectors on both axes the result is a DataFrame.
type(d)

pandas.core.frame.DataFrame

## 4 iloc函数获取指定行列的数据（位置编号指定）

In [26]:
# Select the rows at positions 0, 2 and 4 and the columns at positions 0, 1 and 2.
china_df.iloc[[0,2,4],[0,1,2]]

Unnamed: 0_level_0,country,continent,lifeExp
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1952,China,Asia,44.0
1962,China,Asia,44.50136
1972,China,Asia,63.11888


In [27]:
# Select all columns of the rows at positions 0, 2 and 4.
china_df.iloc[[0,2,4]]

Unnamed: 0_level_0,country,continent,lifeExp,pop,gdpPercap
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1952,China,Asia,44.0,556263527,400.448611
1962,China,Asia,44.50136,665770000,487.674018
1972,China,Asia,63.11888,862030000,676.900092


In [28]:
# Select the columns at positions 0, 1 and 2 for all rows.
china_df.iloc[:,[0,1,2]]

Unnamed: 0_level_0,country,continent,lifeExp
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1952,China,Asia,44.0
1957,China,Asia,50.54896
1962,China,Asia,44.50136
1967,China,Asia,58.38112
1972,China,Asia,63.11888
1977,China,Asia,63.96736
1982,China,Asia,65.525
1987,China,Asia,67.274
1992,China,Asia,68.69
1997,China,Asia,70.426


In [29]:
# Select all columns of the row at position 1; the list selector keeps a DataFrame.
china_df.iloc[[1]]

Unnamed: 0_level_0,country,continent,lifeExp,pop,gdpPercap
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1957,China,Asia,50.54896,637408000,575.987001


In [30]:
# Same row selected with a scalar position: the result is a Series named after the row label.
china_df.iloc[1]

country           China
continent          Asia
lifeExp        50.54896
pop           637408000
gdpPercap    575.987001
Name: 1957, dtype: object

In [31]:
# Select the value at row position 1, column position 2
# (list of row positions, scalar column position -> Series indexed by year).
china_df.iloc[[1],2]

year
1957    50.54896
Name: lifeExp, dtype: float64

In [32]:
# Scalar row position with a list of column positions -> Series indexed by column label.
china_df.iloc[1,[2]]

lifeExp    50.54896
Name: 1957, dtype: object

In [33]:
# Confirm that the scalar-row / list-column selection is a Series.
type(china_df.iloc[1,[2]])

pandas.core.series.Series

In [34]:
# Scalar positions on both axes return the single cell value.
china_df.iloc[1,2]

50.54896

In [35]:
# The single cell value is a NumPy scalar.
type(china_df.iloc[1,2])

numpy.float64

In [36]:
# List positions on both axes keep the result as a one-cell DataFrame.
china_df.iloc[[1],[2]] 

Unnamed: 0_level_0,lifeExp
year,Unnamed: 1_level_1
1957,50.54896


## 5 loc与iloc的切片操作

In [37]:
# Label-based slicing with loc: both endpoints are included,
# so rows 1952..1962 and columns country..lifeExp are all returned.
china_df.loc[1952:1962,'country':'lifeExp']

Unnamed: 0_level_0,country,continent,lifeExp
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1952,China,Asia,44.0
1957,China,Asia,50.54896
1962,China,Asia,44.50136


In [38]:
# Position-based slicing with iloc: the stop position is excluded,
# so 0:3 returns positions 0, 1 and 2 on each axis.
china_df.iloc[0:3,0:3]

Unnamed: 0_level_0,country,continent,lifeExp
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1952,China,Asia,44.0
1957,China,Asia,50.54896
1962,China,Asia,44.50136


## 6 []语法获取指定行列的数据

In [39]:
# Select the country, pop and gdpPercap columns for all rows
# by passing a list of column labels to [].
china_df[['country','pop','gdpPercap']]

Unnamed: 0_level_0,country,pop,gdpPercap
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1952,China,556263527,400.448611
1957,China,637408000,575.987001
1962,China,665770000,487.674018
1967,China,754550000,612.705693
1972,China,862030000,676.900092
1977,China,943455000,741.23747
1982,China,1000281000,962.421381
1987,China,1084035000,1378.904018
1992,China,1164970000,1655.784158
1997,China,1230075000,2289.234136


In [40]:
# Select a single column for all rows by passing its label to [] as a scalar.
data1 = china_df['pop']
data1

year
1952     556263527
1957     637408000
1962     665770000
1967     754550000
1972     862030000
1977     943455000
1982    1000281000
1987    1084035000
1992    1164970000
1997    1230075000
2002    1280400000
2007    1318683096
Name: pop, dtype: int64

In [41]:
# A scalar column label inside [] yields a Series.
type(data1)

pandas.core.series.Series

In [42]:
# Wrapping the single column label in a list selects the same column as a table.
china_df[['pop']]

Unnamed: 0_level_0,pop
year,Unnamed: 1_level_1
1952,556263527
1957,637408000
1962,665770000
1967,754550000
1972,862030000
1977,943455000
1982,1000281000
1987,1084035000
1992,1164970000
1997,1230075000


In [43]:
# A list of column labels inside [] yields a DataFrame, even for one column.
type(china_df[['pop']])

pandas.core.frame.DataFrame

In [44]:
# Get the first three rows: a slice inside [] selects rows by position.
china_df[0:3]

Unnamed: 0_level_0,country,continent,lifeExp,pop,gdpPercap
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1952,China,Asia,44.0,556263527,400.448611
1957,China,Asia,50.54896,637408000,575.987001
1962,China,Asia,44.50136,665770000,487.674018


In [45]:
# Starting from the first row, take every second row, three rows in total.
# The slice syntax is [start:stop:step].
china_df[0:5:2]

Unnamed: 0_level_0,country,continent,lifeExp,pop,gdpPercap
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1952,China,Asia,44.0,556263527,400.448611
1962,China,Asia,44.50136,665770000,487.674018
1972,China,Asia,63.11888,862030000,676.900092
