# stat.ipynb  数值描述性统计分析指标示例

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the movie ratings (rows are films, columns are reviewers) and display the table.
ratings = pd.read_json('../data/ratings.json')
ratings

Unnamed: 0,John Carson,Michelle Peterson,William Reynolds,Jillian Hobart,Melissa Jones,Alex Roberts,Michael Henry
Inception,2.5,3.0,2.5,,3,3.0,
Pulp Fiction,3.5,3.5,3.0,3.5,4,4.0,4.5
Anger Management,3.0,1.5,,3.0,2,,
Fracture,3.5,5.0,3.5,4.0,3,5.0,4.0
Serendipity,2.5,3.5,,2.5,2,3.5,1.0
Jerry Maguire,3.0,3.0,4.0,4.5,3,3.0,


# 均值

In [31]:
# Mean rating of the film 'Fracture': pandas method, then the NumPy function
fracture_scores = ratings.loc['Fracture']
fracture_scores.mean()
np.mean(fracture_scores)
# Missing ratings (NaN) are ignored when averaging a film's row
ratings.loc['Jerry Maguire'].mean()
# By default the frame is reduced vertically: one mean per reviewer column
ratings.mean()
# Same reduction with the axis spelled out (this is the displayed result)
ratings.mean(axis=0)

John Carson          3.000000
Michelle Peterson    3.250000
William Reynolds     3.250000
Jillian Hobart       3.500000
Melissa Jones        2.833333
Alex Roberts         3.700000
Michael Henry        3.166667
dtype: float64

# 加权均值

In [16]:
# Weighted mean of the ratings for the film 'Fracture'.
samples = ratings.loc['Fracture']
# One weight per reviewer column, in column order: the first reviewer
# counts six times as much as each of the others.
weights = np.array([6, 1, 1, 1, 1, 1, 1])
# np.average propagates NaN, so keep only the reviewers who actually rated
# the film and drop their weights along with them. For a fully rated film
# the mask is all True and the result is the plain weighted average.
rated = samples.notna().to_numpy()
print(np.average(samples[rated], weights=weights[rated]))
ratings

3.7916666666666665


Unnamed: 0,John Carson,Michelle Peterson,William Reynolds,Jillian Hobart,Melissa Jones,Alex Roberts,Michael Henry
Inception,2.5,3.0,2.5,,3,3.0,
Pulp Fiction,3.5,3.5,3.0,3.5,4,4.0,4.5
Anger Management,3.0,1.5,,3.0,2,,
Fracture,3.5,5.0,3.5,4.0,3,5.0,4.0
Serendipity,2.5,3.5,,2.5,2,3.5,1.0
Jerry Maguire,3.0,3.0,4.0,4.5,3,3.0,


# 最值

In [30]:
# Extremes of a single reviewer's ratings
john = ratings['John Carson']
np.max(john)
john.idxmax()  # label (film) holding the highest rating
john.idxmin()  # label (film) holding the lowest rating

# Whole frame: label of the maximum along each axis
ratings.idxmax()
ratings.idxmax(axis=1)  # per film: the reviewer who gave the top rating (displayed)


Inception           Michelle Peterson
Pulp Fiction            Michael Henry
Anger Management          John Carson
Fracture            Michelle Peterson
Serendipity         Michelle Peterson
Jerry Maguire          Jillian Hobart
dtype: object

In [33]:
# Element-wise minimum: for each film, the lower of the two reviewers' ratings
john, michelle = ratings['John Carson'], ratings['Michelle Peterson']
np.minimum(john, michelle)

Inception           2.5
Pulp Fiction        3.5
Anger Management    1.5
Fracture            3.5
Serendipity         2.5
Jerry Maguire       3.0
dtype: float64

# 中位数

In [39]:
# Median of one reviewer's ratings: pandas method, then the NumPy function
john = ratings['John Carson']
john.median()
np.median(john)
# Median over the whole frame: default axis first, then per film (axis=1, displayed)
ratings.median()
ratings.median(axis=1)

Inception           3.0
Pulp Fiction        3.5
Anger Management    2.5
Fracture            4.0
Serendipity         2.5
Jerry Maguire       3.0
dtype: float64

In [29]:
# Re-display the ratings table for reference.
ratings

Unnamed: 0,John Carson,Michelle Peterson,William Reynolds,Jillian Hobart,Melissa Jones,Alex Roberts,Michael Henry
Inception,2.5,3.0,2.5,,3,3.0,
Pulp Fiction,3.5,3.5,3.0,3.5,4,4.0,4.5
Anger Management,3.0,1.5,,3.0,2,,
Fracture,3.5,5.0,3.5,4.0,3,5.0,4.0
Serendipity,2.5,3.5,,2.5,2,3.5,1.0
Jerry Maguire,3.0,3.0,4.0,4.5,3,3.0,


# 频数与众数

In [49]:
# Frequency counts and mode of a small categorical sample
cars = pd.Series(np.array(['bmw', 'bmw', 'bz', 'audi', 'bz', 'bmw']))
cars.value_counts()
cars.mode()
# Load the telecom data
data = pd.read_excel('../data/电信用户流失数据/CustomerSurvival.xlsx')
churn_flag = data['流失用户']
churn_flag.value_counts()  # frequency of each value
churn_flag.mode()  # most frequent value (displayed)

0    1
dtype: int64

# 四分位数

In [53]:
# Mean (not displayed) and the min / quartiles / max of the '额外通话时长' column
extra_call = data['额外通话时长']
extra_call.mean()
extra_call.quantile([0.0, 0.25, 0.5, 0.75, 1.0])

0.00   -2828.333333
0.25    -126.666667
0.50      13.500000
0.75     338.658333
1.00    4314.000000
Name: 额外通话时长, dtype: float64

In [57]:
# Mean (not displayed) and the min / quartiles / max of the '额外流量' column
extra_traffic = data['额外流量']
extra_traffic.mean()
extra_traffic.quantile([0.0, 0.25, 0.5, 0.75, 1.0])

0.00   -2189.875986
0.25     -74.289824
0.50     -59.652734
0.75     -25.795045
1.00    2568.704293
Name: 额外流量, dtype: float64

In [45]:
# Preview the first rows of the telecom customer table.
data.head()

Unnamed: 0,ID,套餐金额,额外通话时长,额外流量,改变行为,服务合约,关联购买,集团用户,使用月数,流失用户
0,1,1,792.833333,-10.450067,0,0,0,0,25,0
1,2,1,121.666667,-21.141117,0,0,0,0,25,0
2,3,1,-30.0,-25.655273,0,0,0,0,2,1
3,4,1,241.5,-288.341254,0,1,0,1,25,0
4,5,1,1629.666667,-23.655505,0,0,0,1,25,0


# 标准差

In [64]:
# Standard deviation next to the mean for two films
for film in ('Pulp Fiction', 'Fracture'):
    film_scores = ratings.loc[film]
    print(film_scores.std(), film_scores.mean())

# Population vs. sample standard deviation with NumPy
pulp_scores = ratings.loc['Pulp Fiction']
print(np.std(pulp_scores))  # population
print(np.std(pulp_scores, ddof=1))  # sample

0.48795003647426655 3.7142857142857144
0.7637626158259734 4.0
0.4517539514526256
0.48795003647426655


In [67]:
# Standard deviation of each reviewer's ratings (axis=0 reduces down each column).
ratings.std(axis=0)

John Carson          0.447214
Michelle Peterson    1.129159
William Reynolds     0.645497
Jillian Hobart       0.790569
Melissa Jones        0.752773
Alex Roberts         0.836660
Michael Henry        1.892969
dtype: float64

In [69]:
# Summary statistics for both tables; only the last expression (the telecom data) is displayed.
ratings.describe()   # high-level descriptive statistics
data.describe()

Unnamed: 0,ID,套餐金额,额外通话时长,额外流量,改变行为,服务合约,关联购买,集团用户,使用月数,流失用户
count,4975.0,4975.0,4975.0,4975.0,4975.0,4975.0,4975.0,4975.0,4975.0,4975.0
mean,2488.0,1.057688,258.52003,-71.580403,0.021307,0.245226,0.047437,0.227337,14.774271,0.782714
std,1436.303125,0.258527,723.05719,275.557448,0.144419,0.430264,0.278143,0.419154,6.534273,0.412441
min,1.0,1.0,-2828.333333,-2189.875986,0.0,0.0,0.0,0.0,1.0,0.0
25%,1244.5,1.0,-126.666667,-74.289824,0.0,0.0,0.0,0.0,13.0,1.0
50%,2488.0,1.0,13.5,-59.652734,0.0,0.0,0.0,0.0,13.0,1.0
75%,3731.5,1.0,338.658333,-25.795045,0.0,0.0,0.0,0.0,19.0,1.0
max,4975.0,3.0,4314.0,2568.704293,1.0,1.0,2.0,1.0,25.0,1.0


# 协方差，相关矩阵，相关系数

In [77]:
# Covariance of two reviewers' ratings, computed by hand
a = ratings['John Carson']
b = ratings['Michelle Peterson']
# Centre each reviewer's ratings on that reviewer's own mean
a_mean, b_mean = a.mean(), b.mean()
a_dev, b_dev = a - a_mean, b - b_mean
# Covariance: the mean of the products of the paired deviations
cov_ab = np.mean(a_dev * b_dev)
cov_ab

0.16666666666666666

##### 计算得到的协方差的符号可以判断两组数据是正相关还是负相关，但其数值大小受数据量纲和离散程度的影响，无法单独衡量两组数据的相关程度。
##### 所以通常分别计算 a 与 b 的协方差 cov_ab、b 与 c 的协方差 cov_bc，然后进行比较，从而判断 b 与谁的相关性更强；这种比较只有在各组数据的尺度相近时才可靠，否则应使用下面的相关系数。

## 相关系数

In [83]:
# Correlation coefficient by hand: covariance divided by the product of the standard deviations
std_product = np.std(a) * np.std(b)
cor = cov_ab / std_product
print(cor)
# The same coefficient from NumPy: the full 2x2 matrix, then its off-diagonal entry
corr_ab = np.corrcoef(a, b)
print(corr_ab)
print(corr_ab[0, 1])
# Correlation matrix across three sets of ratings
c = ratings['Melissa Jones']
print(np.corrcoef((a, b, c)))

0.3960590171906697
[[1.         0.39605902]
 [0.39605902 1.        ]]
0.3960590171906697
[[1.         0.39605902 0.59408853]
 [0.39605902 1.         0.41176471]
 [0.59408853 0.41176471 1.        ]]


In [84]:
# Pairwise correlation matrix between all reviewers' ratings.
# Missing ratings are excluded pair by pair, so coefficients for reviewers with
# few films in common rest on very few points: William Reynolds and Michael Henry
# share only two rated films, which is why their coefficient is exactly -1.0.
ratings.corr()

Unnamed: 0,John Carson,Michelle Peterson,William Reynolds,Jillian Hobart,Melissa Jones,Alex Roberts,Michael Henry
John Carson,1.0,0.396059,0.40452,0.566947,0.594089,0.747018,0.991241
Michelle Peterson,0.396059,1.0,0.204598,0.31497,0.411765,0.963796,0.381246
William Reynolds,0.40452,0.204598,1.0,1.0,-0.258199,0.13484,-1.0
Jillian Hobart,0.566947,0.31497,1.0,1.0,0.566947,0.028571,0.893405
Melissa Jones,0.594089,0.411765,-0.258199,0.566947,1.0,0.211289,0.924473
Alex Roberts,0.747018,0.963796,0.13484,0.028571,0.211289,1.0,0.662849
Michael Henry,0.991241,0.381246,-1.0,0.893405,0.924473,0.662849,1.0


In [58]:
# Re-display the ratings table for reference.
ratings

Unnamed: 0,John Carson,Michelle Peterson,William Reynolds,Jillian Hobart,Melissa Jones,Alex Roberts,Michael Henry
Inception,2.5,3.0,2.5,,3,3.0,
Pulp Fiction,3.5,3.5,3.0,3.5,4,4.0,4.5
Anger Management,3.0,1.5,,3.0,2,,
Fracture,3.5,5.0,3.5,4.0,3,5.0,4.0
Serendipity,2.5,3.5,,2.5,2,3.5,1.0
Jerry Maguire,3.0,3.0,4.0,4.5,3,3.0,
