# 04 pandas 基本技巧

数据查看、转置 / 添加、修改、删除值 / 对齐 / 排序

## 基本内容

In [1]:
# 导入numpy、pandas模块
import numpy as np
import pandas as pd  

In [2]:
# 数据查看、转置

df = pd.DataFrame(np.random.rand(16).reshape(8,2)*100,
                   columns = ['a','b'])
print(df.head(2))
print(df.tail())
# .head()查看头部数据
# .tail()查看尾部数据
# 默认查看5条

print(df.T)
# .T 转置

           a          b
0  23.667816  85.546456
1  69.206791  68.451338
           a          b
3  36.175649  11.454312
4  77.892695  19.067904
5   9.313125  17.977894
6  88.748446  48.234666
7  28.820127  24.617214
           0          1          2          3          4          5  \
a  23.667816  69.206791  70.318925  36.175649  77.892695   9.313125   
b  85.546456  68.451338  24.131138  11.454312  19.067904  17.977894   

           6          7  
a  88.748446  28.820127  
b  48.234666  24.617214  


In [3]:
# 添加与修改

df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                   columns = ['a','b','c','d'])
print(df)

df['e'] = 10
df.loc[4] = 20
print(df)
# 新增列/行并赋值

df['e'] = 20
df[['a','c']] = 100
print(df)
# 索引后直接修改值

           a          b          c          d
0  11.883015  28.503518  23.384110  35.610315
1  58.052459  78.688371  24.398117  83.478582
2  45.085453  31.056277  30.149883  94.072398
3   5.862394  71.899844  60.744144  25.910074
           a          b          c          d   e
0  11.883015  28.503518  23.384110  35.610315  10
1  58.052459  78.688371  24.398117  83.478582  10
2  45.085453  31.056277  30.149883  94.072398  10
3   5.862394  71.899844  60.744144  25.910074  10
4  20.000000  20.000000  20.000000  20.000000  20
     a          b    c          d   e
0  100  28.503518  100  35.610315  20
1  100  78.688371  100  83.478582  20
2  100  31.056277  100  94.072398  20
3  100  71.899844  100  25.910074  20
4  100  20.000000  100  20.000000  20


In [4]:
# 删除  del / drop()

df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                   columns = ['a','b','c','d'])
print(df)

del df['a']
print(df)
print('-----')
# del语句 - 删除列

print(df.drop(0))
print(df.drop([1,2]))
print(df)
print('-----')
# drop()删除行，inplace=False → 删除后生成新的数据，不改变原数据

print(df.drop(['d'], axis = 1))
print(df)
# drop()删除列，需要加上axis = 1，inplace=False → 删除后生成新的数据，不改变原数据

           a          b          c          d
0  20.103831  46.402110  68.629604  21.079644
1   4.348753  69.922850  19.530231  96.098871
2   3.528685   1.452172   2.227252  98.676276
3  40.982567  84.836507   5.242610   1.060756
           b          c          d
0  46.402110  68.629604  21.079644
1  69.922850  19.530231  96.098871
2   1.452172   2.227252  98.676276
3  84.836507   5.242610   1.060756
-----
           b          c          d
1  69.922850  19.530231  96.098871
2   1.452172   2.227252  98.676276
3  84.836507   5.242610   1.060756
           b          c          d
0  46.402110  68.629604  21.079644
3  84.836507   5.242610   1.060756
           b          c          d
0  46.402110  68.629604  21.079644
1  69.922850  19.530231  96.098871
2   1.452172   2.227252  98.676276
3  84.836507   5.242610   1.060756
-----
           b          c
0  46.402110  68.629604
1  69.922850  19.530231
2   1.452172   2.227252
3  84.836507   5.242610
           b          c          d
0  46.40

In [5]:
# 对齐

df1 = pd.DataFrame(np.random.randn(10, 4), columns=['A', 'B', 'C', 'D'])
df2 = pd.DataFrame(np.random.randn(7, 3), columns=['A', 'B', 'C'])
print(df1 + df2)
# DataFrame对象之间的数据自动按照列和索引（行标签）对齐

          A         B         C   D
0  1.899736  1.222988  0.419859 NaN
1  0.103848  0.201543 -0.037905 NaN
2 -0.116430 -2.182946  1.283301 NaN
3 -0.369550 -1.204594 -0.331058 NaN
4  0.122192  0.661019  1.268409 NaN
5  3.011971  0.055914  0.083011 NaN
6 -0.710937  0.623166 -0.591240 NaN
7       NaN       NaN       NaN NaN
8       NaN       NaN       NaN NaN
9       NaN       NaN       NaN NaN


In [6]:
# 排序1 - 按值排序 .sort_values
# 同样适用于Series

df1 = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                   columns = ['a','b','c','d'])
print(df1)
print(df1.sort_values(['a'], ascending = True))  # 升序
print(df1.sort_values(['a'], ascending = False))  # 降序
print('------')
# ascending参数：设置升序降序，默认升序
# 单列排序

df2 = pd.DataFrame({'a':[1,1,1,1,2,2,2,2],
                  'b':list(range(8)),
                  'c':list(range(8,0,-1))})
print(df2)
print(df2.sort_values(['a','c']))
# 多列排序，按列顺序排序
# 注意inplace参数

           a          b          c          d
0  21.016090  54.057577  69.943746  56.493138
1  15.455375  12.932406  79.751035  62.098635
2  36.132841  70.221348   1.169797  66.808895
3  91.335756  78.605851   5.284213  11.318597
           a          b          c          d
1  15.455375  12.932406  79.751035  62.098635
0  21.016090  54.057577  69.943746  56.493138
2  36.132841  70.221348   1.169797  66.808895
3  91.335756  78.605851   5.284213  11.318597
           a          b          c          d
3  91.335756  78.605851   5.284213  11.318597
2  36.132841  70.221348   1.169797  66.808895
0  21.016090  54.057577  69.943746  56.493138
1  15.455375  12.932406  79.751035  62.098635
------
   a  b  c
0  1  0  8
1  1  1  7
2  1  2  6
3  1  3  5
4  2  4  4
5  2  5  3
6  2  6  2
7  2  7  1
   a  b  c
3  1  3  5
2  1  2  6
1  1  1  7
0  1  0  8
7  2  7  1
6  2  6  2
5  2  5  3
4  2  4  4


In [7]:
# 排序2 - 索引排序 .sort_index

df1 = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                  index = [5,4,3,2],
                   columns = ['a','b','c','d'])
df2 = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                  index = ['h','s','x','g'],
                   columns = ['a','b','c','d'])
print(df1)
print(df1.sort_index())
print(df2)
print(df2.sort_index())
# 按照index排序
# 默认 ascending=True, inplace=False

           a          b          c          d
5   7.922366  98.715986  83.989986  35.385060
4   8.900585  68.580206  25.971262  87.485385
3  17.662359  19.903188  94.735508  90.181491
2  33.961367  27.635949  48.913265  19.116330
           a          b          c          d
2  33.961367  27.635949  48.913265  19.116330
3  17.662359  19.903188  94.735508  90.181491
4   8.900585  68.580206  25.971262  87.485385
5   7.922366  98.715986  83.989986  35.385060
           a          b          c          d
h  80.830840  45.999325  64.287530  15.223440
s  57.453558  49.725231  59.111570  46.513960
x  51.405235  15.136040  83.181867  79.350228
g  41.715948  75.492101   2.086401  56.425668
           a          b          c          d
g  41.715948  75.492101   2.086401  56.425668
h  80.830840  45.999325  64.287530  15.223440
s  57.453558  49.725231  59.111570  46.513960
x  51.405235  15.136040  83.181867  79.350228


## 作业

### 作业1：创建一个3*3，值在0-100区间随机值的Dataframe（如图），分别按照index和第二列值大小，降序排序


In [8]:
df1 = pd.DataFrame(np.random.rand(3,3)*100, columns=['v1','v2','v3'], index=['a','b','c'])
print(df1)
print(df1.sort_index())
print(df1.sort_values(['v2'], ascending = False))

          v1         v2         v3
a  83.479834  61.203842  23.261490
b  35.696331  18.713394  10.902280
c  16.897430  11.357244  41.301348
          v1         v2         v3
a  83.479834  61.203842  23.261490
b  35.696331  18.713394  10.902280
c  16.897430  11.357244  41.301348
          v1         v2         v3
a  83.479834  61.203842  23.261490
b  35.696331  18.713394  10.902280
c  16.897430  11.357244  41.301348


### 作业2：创建一个5*2，值在0-100区间随机值的Dataframe（如图）df1，通过修改得到df2


In [9]:
df1 = pd.DataFrame(np.random.rand(5,2)*100, index=['a','b','c','d','e'], columns=['v1','v2'])
df1 = df1.T
df1['b'] = 100
del df1['e']
df1

Unnamed: 0,a,b,c,d
v1,74.384132,100,25.563289,6.690713
v2,7.548339,100,56.105458,4.250353


### 作业3：如图创建Series，并按照要求修改得到结果

In [10]:
s = pd.Series(np.arange(10),index=('a','b','c','d','e','f','g','h','i','j'))
s.loc[['a','e','f']] = 100
s

a    100
b      1
c      2
d      3
e    100
f    100
g      6
h      7
i      8
j      9
dtype: int32

### 作业4：已有s1，s2（值为0-10的随机数），请求出s1+s2的值
* s1 = pd.Series(np.random.rand(5)*10,index = list('abcde'))
* s2 = pd.Series(np.random.rand(5)*10,index = list('cdefg'))

In [11]:
s1 = pd.Series(np.random.rand(5)*10,index = list('abcde'))
s2 = pd.Series(np.random.rand(5)*10,index = list('cdefg'))
s1+s2

a          NaN
b          NaN
c     7.372460
d    10.877289
e    19.445449
f          NaN
g          NaN
dtype: float64