# 05 pandas 数值计算和统计基础

常用数学、统计方法

## 基本内容

In [1]:
# 导入numpy、pandas模块
import numpy as np
import pandas as pd  

In [2]:
# 基本参数：axis、skipna

import numpy as np
import pandas as pd

# Sample frame: two numeric columns that each contain one NaN, plus a mixed
# int/str column (key3) that pandas stores with dtype object.
df = pd.DataFrame({'key1': [4, 5, 3, np.nan, 2],
                   'key2': [1, 2, np.nan, 4, 5],
                   'key3': [1, 2, 3, 'j', 'k']},
                  index=['a', 'b', 'c', 'd', 'e'])
print(df)
print(df['key1'].dtype, df['key2'].dtype, df['key3'].dtype)
print('-----')

# np.nan marks a missing value; .mean() computes the average.
# numeric_only=True restricts the calculation to the numeric columns. It is
# passed explicitly because pandas >= 2.0 no longer drops the object column
# key3 silently and raises TypeError instead.
# A single column can be summarised on its own by indexing it first.
m1 = df.mean(numeric_only=True)
print(m1, type(m1))
print('单独统计一列:', df['key2'].mean())
print('-----')

# axis parameter: the default 0 aggregates down each column; axis=1
# aggregates across each row, giving one value per index label.
m2 = df.mean(axis=1, numeric_only=True)
print(m2)
print('-----')

# skipna parameter: whether NaN is ignored (default True). With skipna=False
# any column that contains a NaN yields NaN as its result.
m3 = df.mean(skipna=False, numeric_only=True)
print(m3)
print('-----')

   key1  key2 key3
a   4.0   1.0    1
b   5.0   2.0    2
c   3.0   NaN    3
d   NaN   4.0    j
e   2.0   5.0    k
float64 float64 object
-----
key1    3.5
key2    3.0
dtype: float64 <class 'pandas.core.series.Series'>
单独统计一列: 3.0
-----
a    2.5
b    3.5
c    3.0
d    4.0
e    3.5
dtype: float64
-----
key1   NaN
key2   NaN
dtype: float64
-----


In [3]:
# Core math/statistics methods, usable on both Series and DataFrame (part 1)

df = pd.DataFrame({
    'key1': np.arange(10),
    'key2': np.random.rand(10) * 10,
})
print(df)
print('-----')

# Each entry holds the arguments of one print() call: a computed statistic
# followed by the label that explains it.
report = [
    # count: number of non-NA values per column
    (df.count(), '→ count统计非Na值的数量\n'),
    # min over every column, max of key2 only
    (df.min(), '→ min统计最小值\n', df['key2'].max(), '→ max统计最大值\n'),
    # quantile: q selects the position (0.75 = upper quartile)
    (df.quantile(q=0.75), '→ quantile统计分位数，参数q确定位置\n'),
    (df.sum(), '→ sum求和\n'),
    (df.mean(), '→ mean求平均值\n'),
    # median is the 50% quantile
    (df.median(), '→ median求算数中位数，50%分位数\n'),
    # standard deviation and variance
    (df.std(), '\n', df.var(), '→ std,var分别求标准差，方差\n'),
    # sample skewness
    (df.skew(), '→ skew样本的偏度\n'),
    # sample kurtosis
    (df.kurt(), '→ kurt样本的峰度\n'),
]
for parts in report:
    print(*parts)

   key1      key2
0     0  1.750904
1     1  6.379359
2     2  7.494032
3     3  5.988620
4     4  2.956186
5     5  5.338486
6     6  2.911337
7     7  6.480888
8     8  2.710119
9     9  4.645389
-----
key1    10
key2    10
dtype: int64 → count统计非Na值的数量

key1    0.000000
key2    1.750904
dtype: float64 → min统计最小值
 7.49403185628855 → max统计最大值

key1    6.750000
key2    6.281674
Name: 0.75, dtype: float64 → quantile统计分位数，参数q确定位置

key1    45.000000
key2    46.655318
dtype: float64 → sum求和

key1    4.500000
key2    4.665532
dtype: float64 → mean求平均值

key1    4.500000
key2    4.991937
dtype: float64 → median求算数中位数，50%分位数

key1    3.027650
key2    1.965105
dtype: float64 
 key1    9.166667
key2    3.861636
dtype: float64 → std,var分别求标准差，方差

key1    0.000000
key2   -0.116454
dtype: float64 → skew样本的偏度

key1   -1.200000
key2   -1.535686
dtype: float64 → kurt样本的峰度



In [4]:
# Core math/statistics methods, usable on both Series and DataFrame (part 2)

# cumsum: running total of each source column, stored in a new '<name>_s' column
for col in ('key1', 'key2'):
    df[col + '_s'] = df[col].cumsum()
print(df, '→ cumsum样本的累计和\n')

# cumprod: running product of each source column, stored in '<name>_p'
for col in ('key1', 'key2'):
    df[col + '_p'] = df[col].cumprod()
print(df, '→ cumprod样本的累计积\n')

# cummax / cummin return frames of the same shape as df in which every column
# (key1 and key2 included) is replaced by its running maximum / minimum.
running_max = df.cummax()
running_min = df.cummin()
print(running_max, '\n', running_min, '→ cummax,cummin分别求累计最大值，累计最小值\n')

   key1      key2  key1_s     key2_s
0     0  1.750904       0   1.750904
1     1  6.379359       1   8.130263
2     2  7.494032       3  15.624294
3     3  5.988620       6  21.612914
4     4  2.956186      10  24.569100
5     5  5.338486      15  29.907585
6     6  2.911337      21  32.818922
7     7  6.480888      28  39.299810
8     8  2.710119      36  42.009929
9     9  4.645389      45  46.655318 → cumsum样本的累计和

   key1      key2  key1_s     key2_s  key1_p        key2_p
0     0  1.750904       0   1.750904       0  1.750904e+00
1     1  6.379359       1   8.130263       0  1.116964e+01
2     2  7.494032       3  15.624294       0  8.370567e+01
3     3  5.988620       6  21.612914       0  5.012814e+02
4     4  2.956186      10  24.569100       0  1.481881e+03
5     5  5.338486      15  29.907585       0  7.911000e+03
6     6  2.911337      21  32.818922       0  2.303159e+04
7     7  6.480888      28  39.299810       0  1.492651e+05
8     8  2.710119      36  42.009929       0  

In [5]:
# Unique values: .unique()

s = pd.Series([*'asdvasdcfgg'])
print(s)

# .unique() yields an array of the distinct values in order of first appearance;
# wrapping it in pd.Series turns it back into a Series.
sq = s.unique()
print(sq, type(sq))
print(pd.Series(sq))

# Sort the array of unique values in place, then show the reordered result.
sq.sort()
print(sq)

0     a
1     s
2     d
3     v
4     a
5     s
6     d
7     c
8     f
9     g
10    g
dtype: object
['a' 's' 'd' 'v' 'c' 'f' 'g'] <class 'numpy.ndarray'>
0    a
1    s
2    d
3    v
4    c
5    f
6    g
dtype: object
['a' 'c' 'd' 'f' 'g' 's' 'v']


In [6]:
# Value counts: .value_counts()

# Produces a new Series whose index is the distinct values of s and whose
# values are how many times each one occurs.
# sort parameter: order the result by count; defaults to True, disabled here.
# The top-level spelling is pd.value_counts(s, sort=False), which recent
# pandas releases deprecate in favour of the method form used below.
sc = s.value_counts(sort=False)
print(sc)

v    1
c    1
f    1
a    2
s    2
d    2
g    2
dtype: int64


In [7]:
# Membership test: .isin()

s = pd.Series(np.arange(10, 15))
df = pd.DataFrame({
    'key1': [*'asdcbvasd'],
    'key2': np.arange(4, 13),
})
for obj in (s, df):
    print(obj)
print('-----')

# The candidate values are passed as a list; the result is a boolean Series
# or DataFrame with the same shape as the object being tested.
series_hits = s.isin([5, 14])
frame_hits = df.isin(['a', 'bc', '10', 8])
print(series_hits)
print(frame_hits)

0    10
1    11
2    12
3    13
4    14
dtype: int32
  key1  key2
0    a     4
1    s     5
2    d     6
3    c     7
4    b     8
5    v     9
6    a    10
7    s    11
8    d    12
-----
0    False
1    False
2    False
3    False
4     True
dtype: bool
    key1   key2
0   True  False
1  False  False
2  False  False
3  False  False
4  False   True
5  False  False
6   True  False
7  False  False
8  False  False


### 作业

### 作业1：如图创建一个Dataframe（5*2，值为0-100的随机值），并分别计算key1和key2的均值、中位数、累积和

In [8]:
# Homework 1: 5x2 frame of random values in [0, 100), then mean, median and
# cumulative sum for each of the two columns.
df1 = pd.DataFrame(np.random.rand(5, 2) * 100, columns=['key1', 'key2'])

# Mean and median, one line of output per column
for col in ('key1', 'key2'):
    print(df1[col].mean(), df1[col].median())

# Running totals are appended as '<name>_cumsum' columns
for col in ('key1', 'key2'):
    df1[col + '_cumsum'] = df1[col].cumsum()
print(df1)

37.332902272755504 12.003375259968296
63.97580470718329 66.17841765812062
        key1       key2  key1_cumsum  key2_cumsum
0   9.005451  66.178418     9.005451    66.178418
1  12.003375  57.691092    21.008826   123.869509
2  63.256502  53.732934    84.265329   177.602443
3  10.418541  69.457312    94.683869   247.059755
4  91.980642  72.819268   186.664511   319.879024


### 作业2：写出一个输入元素直接生成数组的代码块，然后创建一个函数，该函数功能用于判断一个Series是否是唯一值数组，返回“是”或“不是”

In [9]:
# Read one line typed by the user and evaluate it as a Python expression;
# the prompt asks for the elements in list form, e.g. [1,3,4,6,7,8,3].
# NOTE(review): eval() executes whatever code is typed at the prompt. If only
# literal lists are expected, ast.literal_eval would be the safer choice —
# confirm no caller relies on non-literal expressions before switching.
ip = eval(input('请输入一组元素，以列表形式'))

请输入一组元素，以列表形式[1,3,4,6,7,8,3]


In [10]:
# Wrap the elements entered by the user in a Series so pandas methods apply.
s = pd.Series(data=ip)

In [11]:
def f(s):
    """Report whether every value in ``s`` occurs exactly once.

    Prints the same message as before and additionally returns the short
    answer, because the exercise asks for a function that returns it.

    Parameters
    ----------
    s : pandas.Series
        The values to check (any object exposing ``is_unique`` works).

    Returns
    -------
    str
        '是' when no value is repeated, otherwise '不是'.
    """
    # is_unique is pandas' built-in duplicate check; it replaces comparing
    # len(s.unique()) with len(s) and gives the same answer.
    if s.is_unique:
        print('该数据是唯一值数组！')
        return '是'
    print('该数组不是唯一值数组！')
    return '不是'

In [12]:
# Run the uniqueness check on the Series built from the user's input.
f(s)

该数组不是唯一值数组！
