In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
from sas7bdat import SAS7BDAT

In [3]:
data = {
    'mpg': [21.0, 21.0, 22.8, 21.4, 18.7, 18.1],
    'hp': [110, 110, 93, 110, 175, 105],
    'wt': [2.62, 2.88, 2.32, 3.21, 3.44, 3.46]
}

In [4]:
index = ['Mazada RX4', 'Mazada RX4 Wag', 'Datsun 710', 'Hornet 4 Drive', 'Hornet Sportabout', 'Valiant']

In [5]:
df = pd.DataFrame(data, index=index)
df

Unnamed: 0,mpg,hp,wt
Mazada RX4,21.0,110,2.62
Mazada RX4 Wag,21.0,110,2.88
Datsun 710,22.8,93,2.32
Hornet 4 Drive,21.4,110,3.21
Hornet Sportabout,18.7,175,3.44
Valiant,18.1,105,3.46


In [6]:
df.describe()

Unnamed: 0,mpg,hp,wt
count,6.0,6.0,6.0
mean,20.5,117.166667,2.988333
std,1.766352,29.088944,0.462533
min,18.1,93.0,2.32
25%,19.275,106.25,2.685
50%,21.0,110.0,3.045
75%,21.3,110.0,3.3825
max,22.8,175.0,3.46


In [7]:
# oandas 峰度和偏度函数
'''
偏度描述了数据分布的不对称程度。
正偏度表示数据分布向右偏斜，即数据右侧的尾部更长。
负偏度表示数据分布向左偏斜，即数据左侧的尾部更长。
偏度的绝对值越大，表示数据分布的偏斜程度越严重。
偏度为0表示数据分布相对对称，即左右两侧的尾部长度大致相等。
'''
df.skew() # 偏度

mpg   -0.354381
hp     2.150177
wt    -0.429910
dtype: float64

In [8]:
'''
峰度描述了数据分布的峰态，即数据分布的陡峭程度和尾部的厚度。
正峰度表示数据分布更加尖峭、集中，尾部更重。
负峰度表示数据分布更加平坦、散开，尾部更轻。
峰度为0表示数据分布与正态分布相似，称为正态分布（Mesokurtic）。
正峰度（Kurtosis > 0）表示高峰态（Leptokurtic）。
负峰度（Kurtosis < 0）表示低峰态（Platykurtic）。
'''
df.kurtosis() # 峰度

mpg   -1.040816
hp     5.049650
wt    -1.484599
dtype: float64

In [9]:
df.agg([np.mean, np.std])

  df.agg([np.mean, np.std])
  df.agg([np.mean, np.std])


Unnamed: 0,mpg,hp,wt
mean,20.5,117.166667,2.988333
std,1.766352,29.088944,0.462533


In [10]:
# 创建示例数据
data = pd.Series(np.random.normal(0, 1, 1000))  # 正态分布的随机数据
data

0     -0.514167
1     -0.416435
2      0.234829
3      0.258783
4      0.123001
         ...   
995    0.265797
996   -0.649117
997   -0.571829
998   -0.524246
999    0.355335
Length: 1000, dtype: float64

In [11]:
# 计算置信区间
confidence_level = 0.95 
mean = np.mean(data)
std_dev = np.std(data)
sample_size = len(data)
margin_error = stats.norm.ppf((1 + confidence_level) / 2) * std_dev / np.sqrt(sample_size)
lower_bound = mean - margin_error
upper_bound = mean + margin_error

print(f"Confidence Interval ({confidence_level * 100}%): [{lower_bound}, {upper_bound}]")

Confidence Interval (95.0%): [-0.07116661911334861, 0.05202959490717829]


In [13]:
df = pd.read_excel(r'data_set\data.xlsx')

In [14]:
df.columns

Index(['ID', 'Treatment', 'Sex', 'Age', 'Improved'], dtype='object')

In [15]:
# 频数统计与频数比例统计
# value_counts()统计字段中每个值的个数
# normalize参数，统计字段中每个值的占比
df.Improved.value_counts(normalize=True) * 100

Improved
Marked    66.666667
Some      33.333333
Name: proportion, dtype: float64

In [16]:
# 多维频数计算与频数比例统计
# 计算频数
freq_table = df[['Treatment', 'Improved']].value_counts()
freq_table

Treatment  Improved
Treated    Marked      21
Placebo    Marked       7
           Some         7
Treated    Some         7
Name: count, dtype: int64

In [17]:
freq_table / freq_table.sum()

Treatment  Improved
Treated    Marked      0.500000
Placebo    Marked      0.166667
           Some        0.166667
Treated    Some        0.166667
Name: count, dtype: float64

In [18]:
df[['Treatment', 'Improved']].value_counts(normalize=True)

Treatment  Improved
Treated    Marked      0.500000
Placebo    Marked      0.166667
           Some        0.166667
Treated    Some        0.166667
Name: proportion, dtype: float64

In [19]:
# 先算分母
m = df.Treatment.value_counts().to_frame().reset_index()

In [20]:
# 再算分子
n = df[['Treatment', 'Improved']].value_counts().to_frame().reset_index()

In [21]:
# 再统计频数占比
d = n.merge(m, on='Treatment', how='inner')
d.proption = d.count_x / d.count_y
d

  d.proption = d.count_x / d.count_y


Unnamed: 0,Treatment,Improved,count_x,count_y
0,Treated,Marked,21,41
1,Placebo,Marked,7,43
2,Placebo,Some,7,43
3,Treated,Some,7,41


In [22]:
# 计算多维数据的频数和占比用crosstab更快捷
# 根据normalize的不同参数做行列占比计算（all，index，columns）
pd.crosstab(df.Treatment, df.Improved, dropna=False, margins=True)

Improved,Marked,Some,NaN,All
Treatment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Placebo,7,7,29,43
Treated,21,7,13,41
All,28,14,0,84


In [23]:
pd.crosstab([df['Treatment'], df['Sex']], df['Improved'], rownames=['Treatment', 'Sex'], colnames=['Improved'], dropna=False, margins=True, normalize='index')

Unnamed: 0_level_0,Improved,Marked,Some,NaN
Treatment,Sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Placebo,Female,0.1875,0.21875,0.59375
Placebo,Male,0.090909,0.0,0.909091
Treated,Female,0.592593,0.185185,0.222222
Treated,Male,0.357143,0.142857,0.5
All,,0.666667,0.333333,0.0


In [24]:
pd.crosstab([df['Treatment'], df['Improved']], df['Sex'], rownames=['Treatment', 'Improved'], colnames=['Sex'], dropna=False, margins=True, normalize='index')*100

Unnamed: 0_level_0,Sex,Female,Male
Treatment,Improved,Unnamed: 2_level_1,Unnamed: 3_level_1
Placebo,Marked,85.714286,14.285714
Placebo,Some,100.0,0.0
Placebo,,65.517241,34.482759
Treated,Marked,76.190476,23.809524
Treated,Some,71.428571,28.571429
Treated,,46.153846,53.846154
All,,70.238095,29.761905


In [25]:
# 卡方独立性检验
# 用来确定两个分类变量之间是否存在关联关系的统计方法
# 比较实际观察到的频数与期望频数之间的差异，如果差异显著，则推断这两个变量之间存在关联
from scipy.stats import contingency

In [26]:
contingency_table = pd.crosstab(df['Treatment'], df['Improved'], dropna=False, margins=True)
contingency_table

Improved,Marked,Some,NaN,All
Treatment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Placebo,7,7,29,43
Treated,21,7,13,41
All,28,14,0,84


In [27]:
pd.crosstab(df['Treatment'], df['Improved'], dropna=False, margins=True)

Improved,Marked,Some,NaN,All
Treatment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Placebo,7,7,29,43
Treated,21,7,13,41
All,28,14,0,84


In [28]:
contingency.chi2_contingency(contingency_table.to_numpy())

Chi2ContingencyResult(statistic=52.86330119115145, pvalue=1.2500593700276932e-09, dof=6, expected_freq=array([[16.38095238,  8.19047619, 12.28571429, 49.14285714],
       [15.61904762,  7.80952381, 11.71428571, 46.85714286],
       [24.        , 12.        , 18.        , 72.        ]]))