# demo02_math.ipynb   统计学指标

In [25]:
import numpy as np
import pandas as pd

In [26]:
# Load the movie-ratings table (one row per film, one column per reviewer)
# and echo it as the cell result.
ratings = pd.read_json(path_or_buf='../data/ratings.json')
ratings

Unnamed: 0,John Carson,Michelle Peterson,William Reynolds,Jillian Hobart,Melissa Jones,Alex Roberts,Michael Henry
Inception,2.5,3.0,2.5,,3,3.0,
Pulp Fiction,3.5,3.5,3.0,3.5,4,4.0,4.5
Anger Management,3.0,1.5,,3.0,2,,
Fracture,3.5,5.0,3.5,4.0,3,5.0,4.0
Serendipity,2.5,3.5,,2.5,2,3.5,1.0
Jerry Maguire,3.0,3.0,4.0,4.5,3,3.0,


In [27]:
# Ratings that each reviewer gave to "Fracture" (a single row of the table).
fracture = ratings.loc['Fracture']
# Only the last expression of a cell is echoed, so a bare `np.mean(fracture)`
# would be computed and thrown away. Check instead that the NumPy function and
# the Series method agree, then display the value once.
assert np.isclose(np.mean(fracture), fracture.mean())
fracture.mean()

4.0

In [28]:
# Average rating per film: reduce across the reviewer columns of every row.
# Missing ratings (NaN) are skipped by pandas' default skipna=True.
ratings.mean(axis='columns')

Inception           2.800000
Pulp Fiction        3.714286
Anger Management    2.375000
Fracture            4.000000
Serendipity         2.500000
Jerry Maguire       3.416667
dtype: float64

## 加权平均值

In [29]:
# One weight per reviewer, listed in the same order as the columns of `ratings`.
reviewer_weights = [2, 3, 3, 1, 1, 1, 1]
# Weighted mean = sum(weight * rating) / sum(weight).
# Unlike Series.mean(), np.average does not skip NaN; this is safe here because
# "Fracture" has a rating from every reviewer.
np.average(fracture, weights=reviewer_weights)

4.041666666666667

## 最值

In [32]:
# Highest rating each film received (row-wise maximum across reviewers).
ratings.max(axis='columns')

Inception           3.0
Pulp Fiction        4.5
Anger Management    3.0
Fracture            5.0
Serendipity         3.5
Jerry Maguire       4.5
dtype: float64

In [34]:
# Label (reviewer name) of the highest rating in each row; when several
# reviewers tie, the first matching column is reported.
ratings.idxmax(axis='columns')

Inception           Michelle Peterson
Pulp Fiction            Michael Henry
Anger Management          John Carson
Fracture            Michelle Peterson
Serendipity         Michelle Peterson
Jerry Maguire          Jillian Hobart
dtype: object

In [36]:
# np.maximum / np.minimum compare two arrays element by element
# (unlike np.max / np.min, which reduce a single array).
a = np.arange(1, 10)
b = a[::-1]  # the same values in reverse order
print(
    np.maximum(a, b),  # element-wise larger value
    np.minimum(a, b),  # element-wise smaller value
    sep='\n',
)

[9 8 7 6 5 6 7 8 9]
[1 2 3 4 5 4 3 2 1]


## 中位数

In [38]:
# Middle value of the "Fracture" ratings; NaN entries, if any, are ignored.
fracture.median(skipna=True)

4.0

In [41]:
# Median rating per film, taken across the reviewer columns of each row.
ratings.median(axis='columns')

Inception           3.0
Pulp Fiction        3.5
Anger Management    2.5
Fracture            4.0
Serendipity         2.5
Jerry Maguire       3.0
dtype: float64

## 频数与众数

In [66]:
# Load the telecom customer-churn survey used by the frequency / mode,
# quantile and describe() demos below.
data = pd.read_excel(
    '../data/电信用户流失数据/CustomerSurvival.xlsx')
# Frequency table and mode of the churn flag. Only the last expression of a
# cell is echoed, so keep the frequency table in a variable instead of
# computing it and discarding the result.
churn_counts = data['流失用户'].value_counts()
churn_mode = data['流失用户'].mode()
# The mode is, by definition, the most frequent value of the column.
assert churn_counts.idxmax() in churn_mode.values
churn_mode

0    1
dtype: int64

In [48]:
# Frequencies and modes of plan price (套餐金额), group-account flag (集团用户)
# and months in use (使用月数).
# Plan price is reported as a relative frequency (count / number of rows) ...
print(data['套餐金额'].value_counts() / len(data))
print(data['套餐金额'].mode())
# ... while the other two columns are reported as raw counts.
for column in ('集团用户', '使用月数'):
    print(data[column].value_counts())
    print(data[column].mode())

1    0.948543
2    0.045226
3    0.006231
Name: 套餐金额, dtype: float64
0    1
dtype: int64
0    3844
1    1131
Name: 集团用户, dtype: int64
0    0
dtype: int64
13    1973
25    1020
14     354
12     292
1      164
11     112
2       90
10      89
16      87
9       87
3       80
8       71
17      58
4       56
24      52
7       52
19      48
18      48
23      44
21      43
5       43
6       40
22      38
20      26
15       8
Name: 使用月数, dtype: int64
0    13
dtype: int64


## 四分位数

In [49]:
# Quartiles of the extra call duration column (额外通话时长):
# minimum, Q1, median, Q3 and maximum.
data['额外通话时长'].quantile(q=[0, 0.25, 0.5, 0.75, 1])

0.00   -2828.333333
0.25    -126.666667
0.50      13.500000
0.75     338.658333
1.00    4314.000000
Name: 额外通话时长, dtype: float64

In [50]:
# Quartiles of the extra data usage column (额外流量):
# minimum, Q1, median, Q3 and maximum.
data['额外流量'].quantile(q=[0, 0.25, 0.5, 0.75, 1])

0.00   -2189.875986
0.25     -74.289824
0.50     -59.652734
0.75     -25.795045
1.00    2568.704293
Name: 额外流量, dtype: float64

## 标准差

In [55]:
# Compare the NumPy and pandas defaults for the standard deviation.
vals = ratings['John Carson']
print(
    np.std(vals),          # np.std(): ddof=0 by default (population std)
    np.std(vals, ddof=1),  # ddof=1 gives the sample std
    vals.std(),            # Series.std() defaults to ddof=1, matching the line above
    sep='\n',
)

0.408248290463863
0.4472135954999579
0.4472135954999579


In [57]:
# Sample standard deviation (pandas default ddof=1) of each film's ratings,
# computed across the reviewer columns.
ratings.std(axis='columns')

Inception           0.273861
Pulp Fiction        0.487950
Anger Management    0.750000
Fracture            0.763763
Serendipity         0.948683
Jerry Maguire       0.664580
dtype: float64

## 数值统计函数

In [59]:
# Structural overview of the ratings table: per-column non-null counts and
# dtypes, which shows which reviewers have missing ratings.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6 entries, Inception to Jerry Maguire
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   John Carson        6 non-null      float64
 1   Michelle Peterson  6 non-null      float64
 2   William Reynolds   4 non-null      float64
 3   Jillian Hobart     5 non-null      float64
 4   Melissa Jones      6 non-null      int64  
 5   Alex Roberts       5 non-null      float64
 6   Michael Henry      3 non-null      float64
dtypes: float64(6), int64(1)
memory usage: 544.0+ bytes


In [63]:
# Load the Titanic training set under its own name. Rebinding `data` here
# would replace the telecom frame that later cells still rely on
# (`data.describe()` and `data['额外通话时长']`), so a top-to-bottom run would
# describe the wrong table and then fail with a KeyError.
titanic = pd.read_csv('../data/泰坦尼克号生存数据/train.csv')
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [64]:
# Summary statistics per reviewer: count, mean, std, min, quartiles and max.
ratings.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,John Carson,Michelle Peterson,William Reynolds,Jillian Hobart,Melissa Jones,Alex Roberts,Michael Henry
count,6.0,6.0,4.0,5.0,6.0,5.0,3.0
mean,3.0,3.25,3.25,3.5,2.833333,3.7,3.166667
std,0.447214,1.129159,0.645497,0.790569,0.752773,0.83666,1.892969
min,2.5,1.5,2.5,2.5,2.0,3.0,1.0
25%,2.625,3.0,2.875,3.0,2.25,3.0,2.5
50%,3.0,3.25,3.25,3.5,3.0,3.5,4.0
75%,3.375,3.5,3.625,4.0,3.0,4.0,4.25
max,3.5,5.0,4.0,4.5,4.0,5.0,4.5


In [67]:
# Summary statistics for every numeric column of `data`.
# Expects `data` to hold the telecom customer frame (CustomerSurvival.xlsx).
data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,ID,套餐金额,额外通话时长,额外流量,改变行为,服务合约,关联购买,集团用户,使用月数,流失用户
count,4975.0,4975.0,4975.0,4975.0,4975.0,4975.0,4975.0,4975.0,4975.0,4975.0
mean,2488.0,1.057688,258.52003,-71.580403,0.021307,0.245226,0.047437,0.227337,14.774271,0.782714
std,1436.303125,0.258527,723.05719,275.557448,0.144419,0.430264,0.278143,0.419154,6.534273,0.412441
min,1.0,1.0,-2828.333333,-2189.875986,0.0,0.0,0.0,0.0,1.0,0.0
25%,1244.5,1.0,-126.666667,-74.289824,0.0,0.0,0.0,0.0,13.0,1.0
50%,2488.0,1.0,13.5,-59.652734,0.0,0.0,0.0,0.0,13.0,1.0
75%,3731.5,1.0,338.658333,-25.795045,0.0,0.0,0.0,0.0,19.0,1.0
max,4975.0,3.0,4314.0,2568.704293,1.0,1.0,2.0,1.0,25.0,1.0


## 协方差

In [72]:
# Hand-computed (population) covariance between pairs of reviewers.
a, b, c = (
    ratings[reviewer]
    for reviewer in ('John Carson', 'Michelle Peterson', 'Melissa Jones')
)
# Mean rating of each reviewer.
ma, mb, mc = a.mean(), b.mean(), c.mean()
# Deviation of every rating from its reviewer's mean.
dev_a, dev_b, dev_c = a - ma, b - mb, c - mc
# Covariance = mean of the product of paired deviations (divides by n, not n - 1).
cov_ab = (dev_a * dev_b).mean()
cov_bc = (dev_b * dev_c).mean()
print(cov_ab, cov_bc)


0.16666666666666666 0.2916666666666667


In [73]:
# Correlation coefficient = covariance / (std_x * std_y).
# ddof=0 keeps the standard deviations on the same population basis as the
# covariances computed above.
coef_ab = cov_ab / (a.std(ddof=0) * b.std(ddof=0))
coef_bc = cov_bc / (b.std(ddof=0) * c.std(ddof=0))
print(coef_ab, coef_bc)

0.3960590171906697 0.4117647058823529


## 相关系数

In [76]:
# Correlation matrix from NumPy: each Series is treated as one variable,
# so entry [i, j] is the correlation between the i-th and j-th reviewer.
np.corrcoef([a, b, c])

array([[1.        , 0.39605902, 0.59408853],
       [0.39605902, 1.        , 0.41176471],
       [0.59408853, 0.41176471, 1.        ]])

In [77]:
# Correlation matrix from pandas (Pearson). Each pair of reviewers is compared
# only on the films both of them rated, so pairs with very few shared films
# can come out as exactly +1.0 or -1.0.
ratings.corr(method='pearson')

Unnamed: 0,John Carson,Michelle Peterson,William Reynolds,Jillian Hobart,Melissa Jones,Alex Roberts,Michael Henry
John Carson,1.0,0.396059,0.40452,0.566947,0.594089,0.747018,0.991241
Michelle Peterson,0.396059,1.0,0.204598,0.31497,0.411765,0.963796,0.381246
William Reynolds,0.40452,0.204598,1.0,1.0,-0.258199,0.13484,-1.0
Jillian Hobart,0.566947,0.31497,1.0,1.0,0.566947,0.028571,0.893405
Melissa Jones,0.594089,0.411765,-0.258199,0.566947,1.0,0.211289,0.924473
Alex Roberts,0.747018,0.963796,0.13484,0.028571,0.211289,1.0,0.662849
Michael Henry,0.991241,0.381246,-1.0,0.893405,0.924473,0.662849,1.0


In [79]:
# Correlation between extra call duration (额外通话时长) and extra data usage
# (额外流量). Expects `data` to hold the telecom customer frame.
np.corrcoef(x=data['额外通话时长'], y=data['额外流量'])

array([[1.        , 0.06908135],
       [0.06908135, 1.        ]])