In [1]:
from utils import pprint_series, pprint_dataframes, pprint_pds

## Descriptive Statistics

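Series 和 DataFrame 的描述性统计方法大多支持 `axis` 和 `skipna` 这两个参数。`level` 参数自 pandas 1.3 起弃用，并已在 pandas 2.0 中移除；需要按索引层级聚合时，请改用 `groupby(level=...)` 再调用相应方法，例如 `df.groupby(level=0).sum()`。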

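- `axis`：默认值为 `0`（等价于 `"index"`），可选值为 `0` / `"index"` 或 `1` / `"columns"`，即 `Literal[0, 1, "index", "columns"]`。`axis=0` 按列计算（每列得到一个结果），`axis=1` 按行计算（每行得到一个结果）。
- `skipna`：默认值为 `True`，即计算时跳过 NA 值。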

归约
|Function |Desc                    |
|---------|------------------------|
|count()  |非 NA 观测值的数量|
|sum()    |值的总和|
|mean()   |值的均值，等同于`sum()/count()`|
|prod()   |值的乘积；默认跳过 NA 值，若全部为 NA 则结果为`1.0`；可以通过`min_count=1`参数要求至少有一个非 NA 值，否则返回 NA|
|median() |值的中位数，等同于`quantile(0.5)`|
|min()    |最小值|
|max()    |最大值|

非归约（返回与输入同类型的对象）
|Function|Desc                                                |
|--------|----------------------------------------------------|
|mode()  |获取所选轴上的众数，即一组数中出现次数最多的值；众数可能有多个，因此结果的行数不一定与输入相同|
|abs()   |逐元素取绝对值，结果形状与输入相同，NA 保持为 NA|

In [5]:
import numpy as np
import pandas as pd

In [6]:
# Sample Series reused by the cells below: s1 holds only integers, while s2
# contains one missing value (np.nan) so that NA handling can be observed.
s1_values = [1, 3, 5, 7, 9]
s2_values = [2, 4, 6, np.nan, 10]
s1, s2 = pd.Series(s1_values), pd.Series(s2_values)

pprint_series(s1, s2)

Unnamed: 0,series1
0,1
1,3
2,5
3,7
4,9

Unnamed: 0,series2
0,2.0
1,4.0
2,6.0
3,
4,10.0


In [7]:
# Sample DataFrames reused by the cells below.
# df1 (columns A, B) mixes numbers with np.nan, including one all-NA row;
# df2 (columns A, B, C) has no missing values.
df1_rows = [
    [1, 3],
    [2, 4],
    [np.nan, np.nan],
    [7, np.nan],
    [np.nan, 9],
]
df2_rows = [
    [1, 3, 3],
    [1, 4, 4],
    [7, 4, 7],
    [7, 4, 4],
]
df1 = pd.DataFrame(df1_rows, columns=list("AB"))
df2 = pd.DataFrame(df2_rows, columns=list("ABC"))

pprint_dataframes(df1, df2)

Unnamed: 0,A,B
0,1.0,3.0
1,2.0,4.0
2,,
3,7.0,
4,,9.0

Unnamed: 0,A,B,C
0,1,3,3
1,1,4,4
2,7,4,7
3,7,4,4


In [69]:
# count: number of non-NA observations.
s1_count, s2_count = s1.count(), s2.count()
print(s1_count, s2_count)
print("-" * 100)

count_results = [
    df1.count(axis=0),          # one count per column
    df1.count(axis=1),          # one count per row
    df2.count(),                # default axis: one count per column
    df2.count(axis="columns"),  # string alias: one count per row
]
pprint_series(*count_results)

5 4
----------------------------------------------------------------------------------------------------


Unnamed: 0,series1
A,3
B,3

Unnamed: 0,series2
0,2
1,2
2,0
3,1
4,1

Unnamed: 0,series3
A,4
B,4
C,4

Unnamed: 0,series4
0,3
1,3
2,3
3,3


In [70]:
# sum: total of the values.
series_totals = [series.sum() for series in (s1, s2)]
print(*series_totals)
print("-" * 100)

# By default NA is skipped (the all-NA row of df1 sums to 0.0);
# with skipna=False any NA in a column/row makes that result NA.
pprint_series(
    df1.sum(),
    df1.sum(axis=1),
    df1.sum(skipna=False),
    df1.sum(axis=1, skipna=False),
)

25 22.0
----------------------------------------------------------------------------------------------------


Unnamed: 0,series1
A,10.0
B,16.0

Unnamed: 0,series2
0,4.0
1,6.0
2,0.0
3,7.0
4,9.0

Unnamed: 0,series3
A,
B,

Unnamed: 0,series4
0,4.0
1,6.0
2,
3,
4,


In [71]:
# mean: the built-in mean() is shown next to the manual sum() / count().
s1_manual_mean = s1.sum() / s1.count()
s2_manual_mean = s2.sum() / s2.count()
print(s1.mean(), s1_manual_mean, "|", s2.mean(), s2_manual_mean)
print("-" * 100)

# Column-wise: one mean per column.
column_sums = df1.sum()
column_counts = df1.count()
pprint_series(df1.mean(), column_sums / column_counts)

# Row-wise: one mean per row.
row_sums = df1.sum(axis="columns")
row_counts = df1.count(axis="columns")
pprint_series(df1.mean(axis="columns"), row_sums / row_counts)

5.0 5.0 | 5.5 5.5
----------------------------------------------------------------------------------------------------


Unnamed: 0,series1
A,3.333333
B,5.333333

Unnamed: 0,series2
A,3.333333
B,5.333333


Unnamed: 0,series1
0,2.0
1,3.0
2,
3,7.0
4,9.0

Unnamed: 0,series2
0,2.0
1,3.0
2,
3,7.0
4,9.0


In [83]:
# prod: product of the values.
s1_product, s2_product = s1.prod(), s2.prod()
print(s1_product, s2_product)
print("-" * 100)

# Show the input frames again so the products can be checked by eye.
pprint_dataframes(df1, df2)

column_products = df1.prod(axis=0)
row_products = df1.prod(axis=1)
# min_count=1 requires at least one non-NA value per row; otherwise the result is NA.
row_products_min_one = df1.prod(axis=1, min_count=1)
pprint_series(column_products, row_products, row_products_min_one)

945 480.0
----------------------------------------------------------------------------------------------------


Unnamed: 0,A,B
0,1.0,3.0
1,2.0,4.0
2,,
3,7.0,
4,,9.0

Unnamed: 0,A,B,C
0,1,3,3
1,1,4,4
2,7,4,7
3,7,4,4


Unnamed: 0,series1
A,14.0
B,108.0

Unnamed: 0,series2
0,3.0
1,8.0
2,1.0
3,7.0
4,9.0

Unnamed: 0,series3
0,3.0
1,8.0
2,
3,7.0
4,9.0


In [72]:
# median: shown side by side with quantile(0.5) for each Series.
median_and_quantile = []
for series in (s1, s2):
    median_and_quantile.append(series.median())
    median_and_quantile.append(series.quantile(0.5))
print(*median_and_quantile)
print("-" * 100)

column_medians = df1.median()
row_medians = df1.median(axis=1)
pprint_series(column_medians, row_medians)

5.0 5.0 5.0 5.0
----------------------------------------------------------------------------------------------------


Unnamed: 0,series1
A,2.0
B,4.0

Unnamed: 0,series2
0,2.0
1,3.0
2,
3,7.0
4,9.0


In [73]:
# min, max: smallest and largest value.
series_extremes = [extreme for series in (s1, s2) for extreme in (series.min(), series.max())]
print(*series_extremes)
print("-" * 100)

pprint_series(
    df1.min(),        # per column
    df1.min(axis=1),  # per row
    df1.max(),        # per column
    df1.max(axis=1),  # per row
)

1 9 2.0 10.0
----------------------------------------------------------------------------------------------------


Unnamed: 0,series1
A,1.0
B,3.0

Unnamed: 0,series2
0,1.0
1,2.0
2,
3,7.0
4,9.0

Unnamed: 0,series3
A,7.0
B,9.0

Unnamed: 0,series4
0,3.0
1,4.0
2,
3,7.0
4,9.0


In [75]:
# mode: the most frequent value(s) along the chosen axis.
s3 = pd.Series([1.0, 1.0, 2.0, 3.0])
s3_mode = s3.mode()
pprint_series(s3_mode)

# Column A of df2 has two modes (1 and 7), so the axis=0 result has two rows.
mode_per_column = df2.mode(axis=0)
mode_per_row = df2.mode(axis=1)
pprint_dataframes(df2, mode_per_column, mode_per_row)

Unnamed: 0,series1
0,1.0


Unnamed: 0,A,B,C
0,1,3,3
1,1,4,4
2,7,4,7
3,7,4,4

Unnamed: 0,A,B,C
0,1,4.0,4.0
1,7,,

Unnamed: 0,0
0,3
1,4
2,7
3,4


In [77]:
# abs: element-wise absolute value; NA entries stay NA.
df3_rows = [
    [1, 3, -3.0],
    [1, np.nan, 4],
    [-7.0, 4, 7],
    [np.nan, 4, np.nan],
]
df3 = pd.DataFrame(df3_rows, columns=list("ABC"))
df3_absolute = df3.abs()
pprint_dataframes(df3, df3_absolute)

Unnamed: 0,A,B,C
0,1.0,3.0,-3.0
1,1.0,,4.0
2,-7.0,4.0,7.0
3,,4.0,

Unnamed: 0,A,B,C
0,1.0,3.0,3.0
1,1.0,,4.0
2,7.0,4.0,7.0
3,,4.0,


In [19]:
# var, std
# Variance and standard deviation are first derived step by step and then
# compared with the built-in DataFrame.var() / DataFrame.std().
df4 = pd.DataFrame({
    "person_id": [0, 1, 2, 3],
    "age": [21, 25, 62, 43],
    "height": [1.61, 1.87, 1.49, 2.01],
}).set_index("person_id")

# Manual derivation: mean -> deviation from the mean -> squared deviation.
column_means = df4.mean()
deviations = df4 - column_means
# Use the `**` operator rather than np.pow: np.pow only exists in NumPy >= 2.0
# and raises AttributeError on older versions, while `**` works everywhere.
squared_deviations = deviations ** 2
# Sample variance divides by (n - 1); population variance divides by n.
sample_variance = squared_deviations.sum() / (df4.count() - 1)
population_variance = squared_deviations.mean()

pprint_pds(df4, column_means, deviations, squared_deviations, sample_variance, population_variance)

# Built-in variance: var() matches the sample variance above, var(ddof=0) the
# population variance; the square root of each is the standard deviation.
builtin_sample_variance = df4.var()
builtin_population_variance = df4.var(ddof=0)
pprint_pds(
    df4,
    builtin_sample_variance,
    builtin_population_variance,
    builtin_sample_variance ** 0.5,
    builtin_population_variance ** 0.5,
)

# Built-in standard deviation, for comparison with the square roots above.
pprint_pds(df4, df4.std(), df4.std(ddof=0))

Unnamed: 0_level_0,age,height
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,21,1.61
1,25,1.87
2,62,1.49
3,43,2.01

Unnamed: 0,SERIES2
age,37.75
height,1.745

Unnamed: 0_level_0,age,height
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,-16.75,-0.135
1,-12.75,0.125
2,24.25,-0.255
3,5.25,0.265

Unnamed: 0_level_0,age,height
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,280.5625,0.018225
1,162.5625,0.015625
2,588.0625,0.065025
3,27.5625,0.070225

Unnamed: 0,SERIES5
age,352.916667
height,0.056367

Unnamed: 0,SERIES6
age,264.6875
height,0.042275


Unnamed: 0_level_0,age,height
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,21,1.61
1,25,1.87
2,62,1.49
3,43,2.01

Unnamed: 0,SERIES2
age,352.916667
height,0.056367

Unnamed: 0,SERIES3
age,264.6875
height,0.042275

Unnamed: 0,SERIES4
age,18.786076
height,0.237417

Unnamed: 0,SERIES5
age,16.269219
height,0.205609


Unnamed: 0_level_0,age,height
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,21,1.61
1,25,1.87
2,62,1.49
3,43,2.01

Unnamed: 0,SERIES2
age,18.786076
height,0.237417

Unnamed: 0,SERIES3
age,16.269219
height,0.205609


Unnamed: 0,SERIES1
0,1
1,3
2,5
3,7
4,9

Unnamed: 0,SERIES2
0,2.0
1,4.0
2,6.0
3,
4,10.0

Unnamed: 0,A,B
0,1.0,3.0
1,2.0,4.0
2,,
3,7.0,
4,,9.0
