In [2]:
import numpy as np
import pandas as pd

## 索引器概念

### 表的列索引

In [3]:
# Load the CSV file, keeping only the six listed columns.
# Column-indexing syntax demonstrated in the following cells:
#   df[column_name] selects that column of the DataFrame and returns a Series.
df = pd.read_csv('data/learn_pandas.csv',
                usecols=['School','Grade','Name','Gender','Weight','Transfer'])


In [4]:
# Preview the first five rows of the loaded frame.
df.head()

Unnamed: 0,School,Grade,Name,Gender,Weight,Transfer
0,A,Freshman,Gaopeng Yang,Female,46.0,N
1,B,Freshman,Changqiang You,Male,70.0,N
2,A,Senior,Mei Sun,Male,89.0,N
3,C,Sophomore,Xiaojuan Sun,Female,41.0,N
4,C,Sophomore,Gaojuan You,Male,74.0,N


In [5]:
# Selecting a single column with df[column_name] returns a Series.
df['Name'].head()

0      Gaopeng Yang
1    Changqiang You
2           Mei Sun
3      Xiaojuan Sun
4       Gaojuan You
Name: Name, dtype: object

In [6]:
# Select several columns at once.
# Syntax: df[[list of column names]]
# Returns: a DataFrame whose columns follow the order given in the list.
df[['Gender','Name']].head()

Unnamed: 0,Gender,Name
0,Female,Gaopeng Yang
1,Male,Changqiang You
2,Male,Mei Sun
3,Female,Xiaojuan Sun
4,Male,Gaojuan You


In [7]:
# Attribute access df.Name returns the same column as df['Name'].
# NOTE(review): attribute access presumably only works when the column name is
# a valid Python identifier that does not clash with an existing DataFrame
# attribute or method — df['col'] is the general form.
df.Name.head()

0      Gaopeng Yang
1    Changqiang You
2           Mei Sun
3      Xiaojuan Sun
4       Gaojuan You
Name: Name, dtype: object

In [10]:
# The same attribute-style column access, applied to the Gender column.
df.Gender.head()

0    Female
1      Male
2      Male
3    Female
4      Male
Name: Gender, dtype: object

### 序列的行索引

In [15]:
# 1. Series indexed by strings. The labels are not unique: 'a' appears four times.
# Syntax: s[label]
s = pd.Series([1,2,3,4,5,6],
             index=['a','b','a','a','a','c'])
s

a    1
b    2
a    3
a    4
a    5
c    6
dtype: int64

In [16]:
# A label that occurs several times returns a Series holding every matching entry.
s['a']

a    1
a    3
a    4
a    5
dtype: int64

In [17]:
# A label that occurs exactly once returns the scalar value itself.
s['b']

np.int64(2)

In [18]:
# Syntax: s[[list of labels]]
# The entries come back in the order in which the labels are listed.
s[['c','b']]

c    6
b    2
dtype: int64

In [19]:
# Goal: take the elements lying between two labels, using a slice.
# With a label slice both the start label and the stop label are included.
# The step of -2 walks backwards from 'c' to 'b', taking every second entry.
s['c':'b':-2]

c    6
a    4
b    2
dtype: int64

In [20]:
# 2. Series indexed by integers. The labels are unsorted and the label 1 is repeated.
# Syntax: s[integer]
s = pd.Series(['a','b','c','d','e','f'],
             index=[1,3,1,2,5,4])

In [21]:
# The integer is matched against the index labels, not against positions:
# both entries labelled 1 are returned.
s[1]

1    a
1    c
dtype: object

In [22]:
# Syntax: s[[list of integers]]
# The integers are looked up as index labels (labels 2 and 3 here).
s[[2,3]]

2    d
3    b
dtype: object

In [26]:
# Slicing with integers.
# Unlike the scalar and list lookups, an integer slice is applied to positions
# and excludes the stop: positions 1 and 3 are returned (labels 3 and 2).
s[1:5:2]
# s[1:-1:2] selects the same positions, written with a negative stop.

3    b
2    d
dtype: object

In [27]:
# 注意：
# 1、索引的设置不建议使用纯浮点数以及混合类型（字符串、整数、布尔、浮点数）
# 2、如果设置的话，在操作数据框时可能会报错或者达到非预期结果。

### 索引器

In [28]:
# pandas索引器有三类：
#     1、loc索引器
#     2、iloc索引器
#     3、[]

In [29]:
# 1、loc索引器
# loc[*,*],第一个*表示的行的选择，第二个*表示的是列的选择
# loc[*],*可以表示五类合法对象：单个元素、元素的列表、元素的切片、布尔列表、函数

In [30]:
# Display the frame; the rendered output is truncated in the middle.
df

Unnamed: 0,School,Grade,Name,Gender,Weight,Transfer
0,A,Freshman,Gaopeng Yang,Female,46.0,N
1,B,Freshman,Changqiang You,Male,70.0,N
2,A,Senior,Mei Sun,Male,89.0,N
3,C,Sophomore,Xiaojuan Sun,Female,41.0,N
4,C,Sophomore,Gaojuan You,Male,74.0,N
...,...,...,...,...,...,...
195,C,Junior,Xiaojuan Sun,Female,46.0,N
196,D,Senior,Li Zhao,Female,50.0,N
197,A,Senior,Chengqiang Chu,Female,45.0,N
198,A,Senior,Chengmei Shen,Male,71.0,N


In [31]:
# set_index() returns a new DataFrame that uses the 'Name' column as its index;
# the result is bound to df_demo and previewed.
df_demo = df.set_index('Name')
df_demo.head()

Unnamed: 0_level_0,School,Grade,Gender,Weight,Transfer
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Gaopeng Yang,A,Freshman,Female,46.0,N
Changqiang You,B,Freshman,Male,70.0,N
Mei Sun,A,Senior,Male,89.0,N
Xiaojuan Sun,C,Sophomore,Female,41.0,N
Gaojuan You,C,Sophomore,Male,74.0,N


In [35]:
# 1. Single label: selects one row.
# A label that occurs once in the index comes back as a Series.
df_demo.loc['Gaopeng Yang']

School             A
Grade       Freshman
Gender        Female
Weight          46.0
Transfer           N
Name: Gaopeng Yang, dtype: object

In [36]:
# A label that occurs more than once comes back as a DataFrame with all matching rows.
df_demo.loc['Qiang Chu']

Unnamed: 0_level_0,School,Grade,Gender,Weight,Transfer
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Qiang Chu,A,Freshman,Female,52.0,N
Qiang Chu,A,Senior,Female,50.0,N


In [38]:
# 2. List of labels: selects the rows for every label in the list.
df_demo.loc[['Qiang Chu','Peng Wang']]

Unnamed: 0_level_0,School,Grade,Gender,Weight,Transfer
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Qiang Chu,A,Freshman,Female,52.0,N
Qiang Chu,A,Senior,Female,50.0,N
Peng Wang,D,Junior,Male,65.0,N
Peng Wang,D,Senior,Male,73.0,N


In [40]:
# Select rows and columns together: loc[row_labels, column_labels].
df_demo.loc[['Qiang Chu'],['School']]

Unnamed: 0_level_0,School
Name,Unnamed: 1_level_1
Qiang Chu,A
Qiang Chu,A


In [42]:
# 3. Label slice: both the start label and the stop label are included.
df_demo.loc['Gaopeng Yang':'Gaojuan You']

Unnamed: 0_level_0,School,Grade,Gender,Weight,Transfer
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Gaopeng Yang,A,Freshman,Female,46.0,N
Changqiang You,B,Freshman,Male,70.0,N
Mei Sun,A,Senior,Male,89.0,N
Xiaojuan Sun,C,Sophomore,Female,41.0,N
Gaojuan You,C,Sophomore,Male,74.0,N


In [44]:
# 4. Boolean mask: keep the rows for which the condition Weight > 50 is True.
df_demo.loc[df_demo.Weight>50].head()

Unnamed: 0_level_0,School,Grade,Gender,Weight,Transfer
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Changqiang You,B,Freshman,Male,70.0,N
Mei Sun,A,Senior,Male,89.0,N
Gaojuan You,C,Sophomore,Male,74.0,N
Xiaoli Qian,D,Freshman,Female,51.0,N
Qiang Chu,A,Freshman,Female,52.0,N


In [45]:
# 5. Callable: loc passes the frame to the function and uses the returned mask.
df_demo.loc[lambda frame: frame.Weight > 50].head()

Unnamed: 0_level_0,School,Grade,Gender,Weight,Transfer
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Changqiang You,B,Freshman,Male,70.0,N
Mei Sun,A,Senior,Male,89.0,N
Gaojuan You,C,Sophomore,Male,74.0,N
Xiaoli Qian,D,Freshman,Female,51.0,N
Qiang Chu,A,Freshman,Female,52.0,N


### iloc索引器

In [46]:
# iloc和loc完全类似，只不过是针对位置进行筛选，在相应的*号位置处一共也有五类合法对象：
# 整数、整数列表、整数的切片、布尔列表、函数
# 语法：iloc[*,*]
# iloc[*]

In [48]:
# Preview the Name-indexed frame used in the iloc examples.
df_demo.head()

Unnamed: 0_level_0,School,Grade,Gender,Weight,Transfer
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Gaopeng Yang,A,Freshman,Female,46.0,N
Changqiang You,B,Freshman,Male,70.0,N
Mei Sun,A,Senior,Male,89.0,N
Xiaojuan Sun,C,Sophomore,Female,41.0,N
Gaojuan You,C,Sophomore,Male,74.0,N


In [49]:
# 1. Single integer position.
df_demo.iloc[10]  # position 10, i.e. the eleventh row

School             A
Grade       Freshman
Gender          Male
Weight          74.0
Transfer           N
Name: Xiaopeng Zhou, dtype: object

In [50]:
# Second row, second column (row position 1, column position 1): returns a scalar.
df_demo.iloc[1,1]

'Freshman'

In [51]:
# First two rows and first two columns, given as lists of positions.
df_demo.iloc[[0,1],[0,1]]

Unnamed: 0_level_0,School,Grade
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Gaopeng Yang,A,Freshman
Changqiang You,B,Freshman


In [52]:
# 2. List or array of integer positions.
# List of positions:
df_demo.iloc[[0,1]]

Unnamed: 0_level_0,School,Grade,Gender,Weight,Transfer
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Gaopeng Yang,A,Freshman,Female,46.0,N
Changqiang You,B,Freshman,Male,70.0,N


In [53]:
# NumPy array of positions:
df_demo.iloc[np.array([0,2,4])]

Unnamed: 0_level_0,School,Grade,Gender,Weight,Transfer
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Gaopeng Yang,A,Freshman,Female,46.0,N
Mei Sun,A,Senior,Male,89.0,N
Gaojuan You,C,Sophomore,Male,74.0,N


In [55]:
# 3. Slice of integer positions; the stop position is excluded.
df_demo.iloc[1:4]

Unnamed: 0_level_0,School,Grade,Gender,Weight,Transfer
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Changqiang You,B,Freshman,Male,70.0,N
Mei Sun,A,Senior,Male,89.0,N
Xiaojuan Sun,C,Sophomore,Female,41.0,N


In [56]:
# iloc[row_slice, column_slice]
# row_slice: start and stop row positions (the stop position is excluded).
# column_slice: start and stop column positions (the stop position is excluded).
# Note from the original author: iloc[*] accepts a list or an array but not a
# Series — presumably this refers to boolean masks, which the cells further
# down convert with np.array() / list() before passing them to iloc.
df_demo.iloc[1:4,2:4]


Unnamed: 0_level_0,Gender,Weight
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Changqiang You,Male,70.0
Mei Sun,Male,89.0
Xiaojuan Sun,Female,41.0


In [59]:
# 4. Boolean mask: the boolean Series is converted to a NumPy array
# before being handed to iloc.
df_demo.iloc[np.array(df_demo.Weight>50)].head()

Unnamed: 0_level_0,School,Grade,Gender,Weight,Transfer
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Changqiang You,B,Freshman,Male,70.0,N
Mei Sun,A,Senior,Male,89.0,N
Gaojuan You,C,Sophomore,Male,74.0,N
Xiaoli Qian,D,Freshman,Female,51.0,N
Qiang Chu,A,Freshman,Female,52.0,N


In [60]:
# The same mask, converted to a plain Python list instead of an array.
df_demo.iloc[list(df_demo.Weight>50)].head()

Unnamed: 0_level_0,School,Grade,Gender,Weight,Transfer
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Changqiang You,B,Freshman,Male,70.0,N
Mei Sun,A,Senior,Male,89.0,N
Gaojuan You,C,Sophomore,Male,74.0,N
Xiaoli Qian,D,Freshman,Female,51.0,N
Qiang Chu,A,Freshman,Female,52.0,N


In [65]:
# s = pd.Series([60])
# s

0    60
dtype: int64

In [68]:
# df_demo.iloc[s>50].head()

In [71]:
# 5. Callable: the function's return value (here a list of positions;
# an array works the same way) is used as the row selector.
df_demo.iloc[lambda _frame: [0, 2]]

Unnamed: 0_level_0,School,Grade,Gender,Weight,Transfer
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Gaopeng Yang,A,Freshman,Female,46.0,N
Mei Sun,A,Senior,Male,89.0,N


In [75]:
# The callable may also return a slice object of positions.
df_demo.iloc[lambda _frame: slice(1, 4)]

Unnamed: 0_level_0,School,Grade,Gender,Weight,Transfer
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Changqiang You,B,Freshman,Male,70.0,N
Mei Sun,A,Senior,Male,89.0,N
Xiaojuan Sun,C,Sophomore,Female,41.0,N


### 切片的操作[]

In [76]:
# Slicing a DataFrame with [] selects rows by position (stop excluded);
# the result matches df_demo.iloc[1:4].
df_demo[1:4]

Unnamed: 0_level_0,School,Grade,Gender,Weight,Transfer
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Changqiang You,B,Freshman,Male,70.0,N
Mei Sun,A,Senior,Male,89.0,N
Xiaojuan Sun,C,Sophomore,Female,41.0,N


### query方法

In [77]:
# Display the original frame (default integer index) used by the query() examples.
df

Unnamed: 0,School,Grade,Name,Gender,Weight,Transfer
0,A,Freshman,Gaopeng Yang,Female,46.0,N
1,B,Freshman,Changqiang You,Male,70.0,N
2,A,Senior,Mei Sun,Male,89.0,N
3,C,Sophomore,Xiaojuan Sun,Female,41.0,N
4,C,Sophomore,Gaojuan You,Male,74.0,N
...,...,...,...,...,...,...
195,C,Junior,Xiaojuan Sun,Female,46.0,N
196,D,Senior,Li Zhao,Female,50.0,N
197,A,Senior,Chengqiang Chu,Female,45.0,N
198,A,Senior,Chengmei Shen,Male,71.0,N


In [78]:
# query() filters rows with a string expression:
# keep the rows whose Weight is above the mean of the Weight column.
df.query('Weight > Weight.mean()').head()

Unnamed: 0,School,Grade,Name,Gender,Weight,Transfer
1,B,Freshman,Changqiang You,Male,70.0,N
2,A,Senior,Mei Sun,Male,89.0,N
4,C,Sophomore,Gaojuan You,Male,74.0,N
10,A,Freshman,Xiaopeng Zhou,Male,74.0,N
14,D,Senior,Xiaomei Zhou,Female,57.0,N


In [79]:
# query方法中引用带空格的列名
# 对于带空格的列名，需要使用反引号，即 `col name` 的方式进行引用

In [80]:
# Select the male students who are neither freshmen nor sophomores.
# Logical operators usable inside query(): or, and, in, not in
df.query('(Grade not in ["Freshman","Sophomore"]) and (Gender=="Male")').head()

Unnamed: 0,School,Grade,Name,Gender,Weight,Transfer
2,A,Senior,Mei Sun,Male,89.0,N
16,D,Junior,Xiaoqiang Qin,Male,68.0,N
17,D,Junior,Peng Wang,Male,65.0,N
18,D,Senior,Xiaofeng Sun,Male,71.0,N
21,A,Senior,Xiaopeng Shen,Male,62.0,


In [81]:
# Membership tests inside query(): `in` and `not in`.
# Comparing a column with a list using == is equivalent to `in`,
# and using != is equivalent to `not in`.
# Select all juniors and seniors.
df.query('Grade==["Senior","Junior"]').head()

Unnamed: 0,School,Grade,Name,Gender,Weight,Transfer
2,A,Senior,Mei Sun,Male,89.0,N
7,D,Junior,Gaoqiang Qian,Female,50.0,N
9,B,Junior,Juan Xu,Female,,N
11,D,Junior,Xiaoquan Lv,Female,43.0,N
12,A,Senior,Peng You,Female,48.0,


In [82]:
# Select the rows whose Weight lies between 70 and 80
# (the output shows that the lower bound 70 itself is included).
# Python variables are referenced inside the query string with the @ prefix.
a = 70
b = 80
df.query('Weight.between(@a,@b)').head()

Unnamed: 0,School,Grade,Name,Gender,Weight,Transfer
1,B,Freshman,Changqiang You,Male,70.0,N
4,C,Sophomore,Gaojuan You,Male,74.0,N
10,A,Freshman,Xiaopeng Zhou,Male,74.0,N
18,D,Senior,Xiaofeng Sun,Male,71.0,N
35,B,Freshman,Gaoli Zhao,Male,78.0,N


### 随机抽样

In [86]:
# sample(): random sampling.
# Build a small demo frame whose 'value' column is later used as sampling weights.
df_sample = pd.DataFrame({'id':list('abcde'),'value':[1,2,3,4,80]})
df_sample.value

0     1
1     2
2     3
3     4
4    80
Name: value, dtype: int64

In [94]:
# Draw 3 rows with replacement, using the 'value' column as sampling weights:
# row 'e' carries 80 of the total weight of 90, so it dominates the draws.
# The weights must come from df_sample itself (the frame defined in this notebook).
df_sample.sample(3,replace=True,weights=df_sample.value)

Unnamed: 0,id,value
4,e,80
4,e,80
4,e,80
