# Pandas练习题

Pandas 是基于 NumPy 的一种数据处理工具，该工具为了解决数据分析任务而创建。Pandas 纳入了大量库和一些标准的数据模型，提供了高效地操作大型数据集所需的函数和方法。 这些练习着重DataFrame和Series对象的基本操作，包括数据的索引、分组、统计和清洗。

## 基本操作
### 1.导入 Pandas 库并简写为 pd，并输出版本号

In [85]:
import pandas as pd
pd.__version__

'1.2.4'

### 2. 从列表创建 Series

In [86]:
# Build a Series from a plain list; without an explicit index the labels start at 0.
arr = list(range(5))
df = pd.Series(data=arr)
df

0    0
1    1
2    2
3    3
4    4
dtype: int64

### 3. 从字典创建 Series

In [87]:
# The dictionary keys become the Series index and the values become the data.
d = dict(zip('abcde', range(1, 6)))
df = pd.Series(data=d)
df

a    1
b    2
c    3
d    4
e    5
dtype: int64

### 4. 从 NumPy 数组创建 DataFrame

In [88]:
import numpy as np

# Six consecutive timestamps starting from the current moment form the row index.
dates = pd.date_range(start='today', periods=6)
# Column labels, one per column of the random matrix.
columns = list('ABCD')
# 6x4 matrix of standard-normal random values.
num_arr = np.random.randn(6, 4)
df = pd.DataFrame(data=num_arr, index=dates, columns=columns)
df

Unnamed: 0,A,B,C,D
2022-09-05 10:43:51.011690,-1.141681,-1.753285,0.158898,0.792121
2022-09-06 10:43:51.011690,-1.603811,0.275445,-0.358538,0.382201
2022-09-07 10:43:51.011690,-0.496317,0.064608,-0.235369,-0.543569
2022-09-08 10:43:51.011690,-0.176868,-0.178488,0.34282,2.182515
2022-09-09 10:43:51.011690,-0.541293,-0.696388,-0.769961,-0.187213
2022-09-10 10:43:51.011690,-1.502881,0.953043,-0.237399,1.919471


### 5. 从CSV中创建 DataFrame，分隔符为“;”，编码格式为gbk

示例（假设文件名为 data.csv）：`df = pd.read_csv('data.csv', sep=';', encoding='gbk')`

### 6. 从字典对象创建DataFrame，并设置索引

In [89]:
import numpy as np

# One list per column; np.nan marks a missing age.
data = {
    'animal': ['cat', 'cat', 'snake', 'dog', 'dog',
               'cat', 'snake', 'cat', 'dog', 'dog'],
    'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
    'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
    'priority': ['yes', 'yes', 'no', 'yes', 'no',
                 'no', 'no', 'yes', 'no', 'no'],
}

# Row labels 'a' through 'j' are used instead of the default integer index.
labels = list('abcdefghij')
df = pd.DataFrame(data=data, index=labels)
df

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
f,cat,2.0,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


### 7. 显示df的基础信息，包括行的数量；列名；每一列值的数量、类型

In [90]:
df.info()
# Related but not equivalent: describe() reports summary statistics
# (count, mean, std, quartiles) rather than row count, column names and dtypes.
# df.describe()

<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, a to j
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   animal    10 non-null     object 
 1   age       8 non-null      float64
 2   visits    10 non-null     int64  
 3   priority  10 non-null     object 
dtypes: float64(1), int64(1), object(2)
memory usage: 400.0+ bytes


### 8. 展示df的前3行

In [91]:
# head(n) returns the first n rows.
df.head(3)
# Alternative: positional slice
# df.iloc[:3]

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no


### 9. 取出df的animal和age列

In [92]:
# Indexing with a list of labels selects those columns for every row.
df[['animal', 'age']]
# Alternative: label-based selection with an explicit "all rows" slice
# df.loc[:, ['animal', 'age']]

Unnamed: 0,animal,age
a,cat,2.5
b,cat,3.0
c,snake,0.5
d,dog,
e,dog,5.0
f,cat,2.0
g,snake,4.5
h,cat,
i,dog,7.0
j,dog,3.0


### 10. 取出索引为[3, 4, 8]行的animal和age列

In [93]:
# Narrow to the two columns first, then pick the rows whose labels sit at positions 3, 4 and 8.
df[['animal', 'age']].loc[df.index[[3, 4, 8]]]

Unnamed: 0,animal,age
d,dog,
e,dog,5.0
i,dog,7.0


### 11. 取出age值大于3的行

In [94]:
# Boolean mask: keep rows whose age is strictly greater than 3 (missing ages compare as False).
df.loc[df['age'].gt(3)]

Unnamed: 0,animal,age,visits,priority
e,dog,5.0,2,no
g,snake,4.5,1,no
i,dog,7.0,2,no


### 12. 取出age值缺失的行

In [95]:
# Keep only the rows where age is missing (NaN).
df.loc[df['age'].isna()]

Unnamed: 0,animal,age,visits,priority
d,dog,,3,yes
h,cat,,1,yes


### 13.取出age在2,4间的行（不含）

In [96]:
# Strictly between 2 and 4: both bounds are excluded, so the upper test must be "<".
df[(df['age'] > 2) & (df['age'] < 4)]
# Alternative: Series.between includes both endpoints by default, so the open
# interval has to be requested explicitly (pandas >= 1.3; on older versions
# pass inclusive=False instead).
# df[df['age'].between(2, 4, inclusive='neither')]

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
j,dog,3.0,1,no


### 14. f行的age改为1.5

In [97]:
# Scalar setter: overwrite the age stored at row 'f'.
df.at['f', 'age'] = 1.5

### 15. 计算visits的总和

In [98]:
# Total number of visits across all rows.
df.loc[:, 'visits'].sum()

19

### 16. 计算每个不同种类animal的age的平均数

In [99]:
# Group the age column by the animal column and average each group (NaN ages are skipped).
df['age'].groupby(df['animal']).mean()

animal
cat      2.333333
dog      5.000000
snake    2.500000
Name: age, dtype: float64

### 17. 在df中插入新行k，然后删除该行

In [100]:
# Insert: key the new row by column name so every value lands in the intended
# column regardless of the frame's column order; a positional list that does
# not match that order would put strings into numeric columns and turn them
# into object dtype.
df.loc['k'] = pd.Series({
    'animal': 'dog',
    'age': 5.5,
    'visits': 2,
    'priority': 'no',
})
# Delete: drop returns a new frame without row 'k'.
df = df.drop('k')
df

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
f,cat,1.5,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


### 18. 计算df中每个种类animal的数量

In [101]:
# Number of rows for each distinct animal, most frequent first.
df.loc[:, 'animal'].value_counts()

dog      4
cat      4
snake    2
Name: animal, dtype: int64

### 19. 先按age降序排列，后按visits升序排列

In [102]:
# Primary key: age, descending. Tie-breaker: visits, ascending.
df.sort_values(
    ['age', 'visits'],
    ascending=[False, True],
)

Unnamed: 0,animal,age,visits,priority
i,dog,7.0,2,no
e,dog,5.0,2,no
g,snake,4.5,1,no
j,dog,3.0,1,no
b,cat,3.0,3,yes
a,cat,2.5,1,yes
f,cat,1.5,3,no
c,snake,0.5,2,no
h,cat,,1,yes
d,dog,,3,yes


### 20. 将priority列中的yes, no替换为布尔值True, False

In [25]:
# Translate the 'yes'/'no' strings to booleans through a lookup table.
df['priority'] = df['priority'].map(dict(yes=True, no=False))
df

Unnamed: 0,age,animal,priority,visits
a,2.5,cat,True,1
b,3.0,cat,True,3
c,0.5,snake,False,2
d,,dog,True,3
e,5.0,dog,False,2
f,1.5,cat,False,3
g,4.5,snake,False,1
h,,cat,True,1
i,7.0,dog,False,2
j,3.0,dog,False,1


### 21. 将animal列中的snake替换为python

In [26]:
# Rename every 'snake' entry to 'python'; other values are left untouched.
df['animal'] = df['animal'].replace({'snake': 'python'})
df

Unnamed: 0,age,animal,priority,visits
a,2.5,cat,True,1
b,3.0,cat,True,3
c,0.5,python,False,2
d,,dog,True,3
e,5.0,dog,False,2
f,1.5,cat,False,3
g,4.5,python,False,1
h,,cat,True,1
i,7.0,dog,False,2
j,3.0,dog,False,1


### 22. 对每种animal的每种不同数量visits，计算平均age，即，返回一个表格，行是animal种类，列是visits数量，表格值是对应动物种类和访问次数下的平均年龄

In [27]:
# Cross-tabulate the mean age: one row per animal, one column per visit count.
pd.pivot_table(df, index='animal', columns='visits', values='age', aggfunc='mean')

visits,1,2,3
animal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cat,2.5,,2.25
dog,3.0,6.0,
python,4.5,0.5,


## 进阶操作
### 23. 有一列整数列A的DataFrame，删除数值重复的行

In [103]:
df = pd.DataFrame({'A': [1, 2, 2, 3, 4, 5, 5, 5, 6, 7, 7]})
print(df)
# Keep a row only when its value differs from the value in the row directly above it.
df1 = df[df['A'].ne(df['A'].shift())]
# Alternative: drop_duplicates removes every repeated value, not only adjacent
# repeats; the two approaches agree here because equal values are contiguous.
# df1 = df.drop_duplicates(subset='A')
print(df1)

    A
0   1
1   2
2   2
3   3
4   4
5   5
6   5
7   5
8   6
9   7
10  7
   A
0  1
1  2
3  3
4  4
5  5
8  6
9  7


### 24. 一个全数值DataFrame，每个数字减去该行的平均数

In [104]:
df = pd.DataFrame(np.random.random(size=(5, 3)))
print(df)
# Average across the columns of each row, then subtract that average row by row.
df1 = df.sub(df.mean(axis='columns'), axis='index')
print(df1)

          0         1         2
0  0.171671  0.246876  0.304267
1  0.291528  0.830821  0.977031
2  0.994551  0.794925  0.164927
3  0.487579  0.464208  0.142728
4  0.175886  0.014870  0.556099
          0         1         2
0 -0.069267  0.005938  0.063329
1 -0.408265  0.131028  0.277237
2  0.343083  0.143457 -0.486540
3  0.122741  0.099370 -0.222111
4 -0.073066 -0.234081  0.307147


### 25. 一个有5列的DataFrame，求哪一列的和最小

In [105]:
df = pd.DataFrame(np.random.random(size=(5, 5)), columns=list('abcde'))
print(df)
# Sum each column, then report the label of the smallest total.
df.sum(axis=0).idxmin()

          a         b         c         d         e
0  0.269002  0.137164  0.792372  0.690850  0.333632
1  0.442140  0.930907  0.703778  0.921814  0.413199
2  0.902949  0.452713  0.256924  0.127903  0.399467
3  0.648946  0.944327  0.910797  0.685240  0.967999
4  0.012510  0.463984  0.031464  0.605540  0.195207


'a'

### 26. 创建一个函数，大于0.5返回True，否则返回False，并将它应用于数据集中列标签为2的列（即第3列），将布尔值返回给名为panduan的新列

In [109]:
def majvv(x):
    """Return True when x is strictly greater than 0.5, otherwise False."""
    return bool(x > 0.5)
df = pd.DataFrame(np.random.random(size=(5, 3)))
print(df)
# Run majvv on every value of the column labelled 2 and store the verdicts in 'panduan'.
df['panduan'] = df[2].map(majvv)
print(df)

          0         1         2
0  0.738596  0.022693  0.187638
1  0.489241  0.402224  0.373597
2  0.347196  0.265622  0.683297
3  0.409641  0.220607  0.976054
4  0.207122  0.145030  0.727297
          0         1         2  panduan
0  0.738596  0.022693  0.187638    False
1  0.489241  0.402224  0.373597    False
2  0.347196  0.265622  0.683297     True
3  0.409641  0.220607  0.976054     True
4  0.207122  0.145030  0.727297     True
