# 第五章 SciPy概率分布与统计分析实践

In [None]:
"""
本代码为《Python心理学应用》一书第五章SciPy概率分布与统计分析实践中的全部代码
建议结合书本内容进行实操练习，以增进理解
魏楚光，weicg@psych.ac.cn 
8-Dec-2024
"""

### 代码1.1

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm

# Distribution parameters and sample size
mean, std = 0, 1  # mean and standard deviation of the normal distribution
size = 1000       # number of random values to draw

# Freeze the distribution once so sampling and both curves share one definition
dist = norm(loc=mean, scale=std)
data = dist.rvs(size=size)

# Theoretical PDF and CDF evaluated on an evenly spaced grid
x = np.linspace(-5, 5, 1000)
pdf, cdf = dist.pdf(x), dist.cdf(x)

# Histogram options common to both panels
hist_style = dict(bins=30, density=True, alpha=0.6, color='orange')

plt.figure(figsize=(12, 6))

# Left panel: density-normalised histogram of the sample with the theoretical PDF
plt.subplot(1, 2, 1)
plt.hist(data, label='Data Histogram', **hist_style)
plt.plot(x, pdf, color='blue', label='Theoretical PDF')
plt.title('Normal Distribution PDF vs Data')
plt.xlabel('x')
plt.ylabel('Probability Density')
plt.legend()

# Right panel: cumulative histogram (empirical CDF) with the theoretical CDF
plt.subplot(1, 2, 2)
plt.hist(data, cumulative=True, label='Empirical CDF', **hist_style)
plt.plot(x, cdf, color='green', label='Theoretical CDF')
plt.title('Normal Distribution CDF vs Data')
plt.xlabel('x')
plt.ylabel('Cumulative Probability')
plt.legend()

# Write the figure to disk at 300 dpi, then display it
plt.tight_layout()
plt.savefig('norm_distribution.tiff', dpi=300)
plt.show()

### 代码1.2

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import uniform

# Interval bounds and sample size
low, high = 0, 1  # lower and upper bound of the uniform distribution
size = 1000       # number of random values to draw

# scipy parameterises the uniform distribution as [loc, loc + scale]
dist = uniform(loc=low, scale=high - low)
data = dist.rvs(size=size)

# Theoretical PDF and CDF on a grid that extends past both bounds
x = np.linspace(-0.5, 1.5, 1000)
pdf, cdf = dist.pdf(x), dist.cdf(x)

# Histogram options common to both panels
hist_style = dict(bins=30, density=True, alpha=0.6, color='orange')

plt.figure(figsize=(12, 6))

# Left panel: density-normalised histogram of the sample with the theoretical PDF
plt.subplot(1, 2, 1)
plt.hist(data, label='Data Histogram', **hist_style)
plt.plot(x, pdf, color='blue', label='Theoretical PDF')
plt.title('Uniform Distribution PDF vs Data')
plt.xlabel('x')
plt.ylabel('Probability Density')
plt.legend()

# Right panel: cumulative histogram (empirical CDF) with the theoretical CDF
plt.subplot(1, 2, 2)
plt.hist(data, cumulative=True, label='Empirical CDF', **hist_style)
plt.plot(x, cdf, color='green', label='Theoretical CDF')
plt.title('Uniform Distribution CDF vs Data')
plt.xlabel('x')
plt.ylabel('Cumulative Probability')
plt.legend()

# Write the figure to disk at 300 dpi, then display it
plt.tight_layout()
plt.savefig('uniform_distribution.tiff', dpi=300)
plt.show()

### 代码1.3

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import binom

# Distribution parameters and sample size
n, p = 10, 0.5  # number of trials and success probability per trial
size = 1000     # number of random values to draw

# Freeze the distribution once so sampling and both curves share one definition
dist = binom(n=n, p=p)
data = dist.rvs(size=size)

# Theoretical PMF (probability mass function) and CDF at every outcome 0..n
x = np.arange(0, n + 1)
pmf, cdf = dist.pmf(x), dist.cdf(x)

# Unit-width bins centred on the integers, shared by both panels
hist_style = dict(bins=np.arange(-0.5, n + 1.5, 1), density=True, alpha=0.6, color='orange')

plt.figure(figsize=(12, 6))

# Left panel: relative frequency of each outcome with the theoretical PMF as stems
plt.subplot(1, 2, 1)
plt.hist(data, label='Data Histogram', **hist_style)
plt.stem(x, pmf, label='Theoretical PMF', basefmt=" ")
plt.title('Binomial Distribution PMF vs Data')
plt.xlabel('x')
plt.ylabel('Probability')
plt.legend()

# Right panel: cumulative histogram (empirical CDF) with the theoretical CDF
plt.subplot(1, 2, 2)
plt.hist(data, cumulative=True, label='Empirical CDF', **hist_style)
plt.plot(x, cdf, color='green', label='Theoretical CDF')
plt.title('Binomial Distribution CDF vs Data')
plt.xlabel('x')
plt.ylabel('Cumulative Probability')
plt.legend()

plt.tight_layout()
# Saving is disabled in this cell; uncomment the next line to write the figure:
# plt.savefig('binom_distribution.tiff', dpi=300)
plt.show()

### 代码1.4

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import poisson

# Distribution parameter and sample size
mu = 3       # average number of events per interval
size = 1000  # number of random values to draw

# Draw the Poisson sample
data = poisson.rvs(mu, size=size)

# Plot at least the outcomes 0..9, but extend the range whenever the sample
# contains larger counts: fixed bins ending at 9.5 would silently drop those
# draws from both histograms.
upper = max(10, int(data.max()) + 1)
x = np.arange(0, upper)                 # outcomes at which PMF/CDF are evaluated
bins = np.arange(-0.5, upper + 0.5, 1)  # unit-width bins centred on the integers

# Theoretical PMF and CDF
pmf = poisson.pmf(x, mu)
cdf = poisson.cdf(x, mu)

plt.figure(figsize=(12, 6))

# Left panel: relative frequency of each outcome with the theoretical PMF as stems
plt.subplot(1, 2, 1)
plt.hist(data, bins=bins, density=True, alpha=0.6, color='orange', label='Data Histogram')
plt.stem(x, pmf, label='Theoretical PMF', basefmt=" ")
plt.title('Poisson Distribution PMF vs Data')
plt.xlabel('x')
plt.ylabel('Probability')
plt.legend()

# Right panel: cumulative histogram (empirical CDF) with the theoretical CDF
plt.subplot(1, 2, 2)
plt.hist(data, bins=bins, density=True, cumulative=True, alpha=0.6, color='orange', label='Empirical CDF')
plt.plot(x, cdf, label='Theoretical CDF', color='green')
plt.title('Poisson Distribution CDF vs Data')
plt.xlabel('x')
plt.ylabel('Cumulative Probability')
plt.legend()

# Write the figure to disk at 300 dpi, then display it
plt.tight_layout()
plt.savefig('poisson_distribution.tiff', dpi=300)
plt.show()

### 代码2.1

In [None]:
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

# Random sample from a normal distribution with mean 50 and standard deviation 10
data = np.random.normal(loc=50, scale=10, size=1000)

# Mean and median of the sample
mean_val = np.mean(data)
median_val = np.median(data)

# Approximate mode for continuous data: midpoint of the fullest of 30 histogram bins
hist, bin_edges = np.histogram(data, bins=30)
mode_bin_index = np.argmax(hist)
mode_value = 0.5 * (bin_edges[mode_bin_index] + bin_edges[mode_bin_index + 1])

print(f"Mean (均值): {mean_val}")
print(f"Median (中位数): {median_val}")
print(f"Approximate Mode (近似众数): {mode_value}")

# Histogram of the sample with one dashed vertical line per measure
plt.hist(data, bins=30, alpha=0.7, color='blue', edgecolor='black')
markers = (
    ('Mean', mean_val, 'red'),
    ('Median', median_val, 'green'),
    ('Approx. Mode', mode_value, 'orange'),
)
for caption, position, colour in markers:
    plt.axvline(position, color=colour, linestyle='dashed', linewidth=2,
                label=f'{caption}: {position:.2f}')

plt.title('Distribution of Data with Central Tendency Measures')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.legend()
plt.show()

### 代码2.2

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

# Random sample from a normal distribution with mean 50 and standard deviation 10
data = np.random.normal(loc=50, scale=10, size=1000)

# Location measures, reused below for the coefficient of variation and the plot
mean_val = np.mean(data)
median_val = np.median(data)

# Variance and standard deviation (NumPy defaults, i.e. ddof=0)
variance = np.var(data)
print(f"Variance (方差): {variance:.2f}")

std_deviation = np.std(data)
print(f"Standard Deviation (标准差): {std_deviation:.2f}")

# Coefficient of variation: standard deviation relative to the mean
coefficient_of_variation = std_deviation / mean_val
print(f"Coefficient of Variation (变异系数): {coefficient_of_variation:.2f}")

# Skewness of the sample
skewness = stats.skew(data)
print(f"Skewness (偏度): {skewness:.2f}")

# Kurtosis of the sample (scipy's default reports excess kurtosis)
kurtosis = stats.kurtosis(data)
print(f"Kurtosis (峰度): {kurtosis:.2f}")

# Histogram of the sample
plt.figure(figsize=(10, 6))
plt.hist(data, bins=30, edgecolor='black', alpha=0.7, color='skyblue')

# Dashed reference lines: mean, median, and one standard deviation either side of the mean
dashed = dict(linestyle='dashed', linewidth=2)
plt.axvline(mean_val, color='red', label=f'Mean: {mean_val:.2f}', **dashed)
plt.axvline(median_val, color='green', label=f'Median: {median_val:.2f}', **dashed)
plt.axvline(mean_val + std_deviation, color='orange',
            label=f'Standard Deviation: {std_deviation:.2f}', **dashed)
plt.axvline(mean_val - std_deviation, color='orange', **dashed)

# Title, axis labels and legend
plt.title("Histogram of Data with Mean and Standard Deviation")
plt.xlabel("Value")
plt.ylabel("Frequency")
plt.legend()

plt.show()

### 代码2.3

In [None]:
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

# Random sample from a normal distribution with mean 50 and standard deviation 10
data = np.random.normal(loc=50, scale=10, size=1000)

# Shape statistics: skewness and kurtosis (scipy's default reports excess kurtosis)
skewness, kurtosis = stats.skew(data), stats.kurtosis(data)

# Report both values at full precision
for caption, value in (("Skewness (偏度)", skewness), ("Kurtosis (峰度)", kurtosis)):
    print(f"{caption}: {value}")

# Histogram of the sample
plt.hist(data, bins=30, edgecolor='black', alpha=0.7)
plt.title("Histogram of Normal Distribution")
plt.xlabel("Value")
plt.ylabel("Frequency")
plt.show()

### 代码3.1

In [None]:
import numpy as np
from scipy import stats

# Example sample of 15 observations
sample_data = np.array([52, 49, 51, 50, 53, 48, 55, 50, 49, 52, 57, 56, 54, 58, 59])

# Shapiro-Wilk test of normality decides which one-sample test is run below
shapiro_test = stats.shapiro(sample_data)
print(f"Shapiro-Wilk 正态性检验统计量: {shapiro_test.statistic:.3f}, p 值: {shapiro_test.pvalue:.3f}")

# Hypothesised population value the sample is compared against
population_mean = 50

if shapiro_test.pvalue > 0.05:
    # Normality is not rejected: use the parametric one-sample t-test on the mean
    print("数据服从正态分布，可以进行 t 检验。")

    t_statistic, p_value = stats.ttest_1samp(sample_data, population_mean)

    print(f"T-statistic: {t_statistic:.3f}")
    print(f"P-value: {p_value:.3f}")

    # Two-sided decision at the 0.05 level
    if p_value < 0.05:
        print("拒绝零假设：样本均值显著不同于总体均值")
    else:
        print("未拒绝零假设：样本均值与总体均值无显著差异")

else:
    # Normality is rejected: fall back to the Wilcoxon signed-rank test on the
    # differences from the hypothesised value
    print("数据可能不服从正态分布，建议使用非参数检验（如 Wilcoxon 符号秩检验）。")

    wilcoxon_test = stats.wilcoxon(sample_data - population_mean)
    print(f"Wilcoxon 检验统计量: {wilcoxon_test.statistic:.3f}, p 值: {wilcoxon_test.pvalue:.3f}")

    # The signed-rank test is a test of location (median), not of the mean,
    # so the conclusion is worded in terms of the median.
    if wilcoxon_test.pvalue < 0.05:
        print("拒绝零假设：样本中位数显著不同于总体中位数（非参数检验）")
    else:
        print("未拒绝零假设：样本中位数与总体中位数无显著差异（非参数检验）")

### 代码3.2

In [None]:
from scipy import stats

# Example sample of 20 observations
data = [22, 24, 21, 25, 23, 26, 20, 24, 23, 25, 24, 27, 22, 28, 29, 25, 24, 23, 26, 27]

# Hypothesised population mean
mu_0 = 23

# ttest_1samp returns the two-sided p-value
t_statistic, p_value = stats.ttest_1samp(data, mu_0)

# Right-tailed p-value (H1: sample mean > mu_0). Halving the two-sided p-value
# is only valid when t points in the hypothesised direction; for t <= 0 the
# right-tail probability is the complement, 1 - p/2.
p_value_one_sided = p_value / 2 if t_statistic > 0 else 1 - p_value / 2

# With the correct one-sided p-value, the sign of t needs no separate check
if p_value_one_sided < 0.05:
    print(f"拒绝零假设，样本均值显著大于总体均值，t-statistic: {t_statistic}, p-value: {p_value_one_sided}")
else:
    print(f"未拒绝零假设，样本均值不显著大于总体均值，t-statistic: {t_statistic}, p-value: {p_value_one_sided}")

### 代码3.3

In [None]:
import numpy as np
from scipy import stats

# Two independent samples, seeded for reproducibility
np.random.seed(42)
group1 = np.random.normal(loc=50, scale=10, size=30)  # mean 50, standard deviation 10
group2 = np.random.normal(loc=55, scale=10, size=30)  # mean 55, standard deviation 10

# Levene's test for homogeneity of variance
levene_test = stats.levene(group1, group2)
print("Levene检验结果（p值）：", levene_test.pvalue)

# Choose the t-test variant from the Levene result: pooled-variance Student's
# t-test when equal variances are not rejected, Welch's t-test otherwise
equal_variances = bool(levene_test.pvalue > 0.05)
if equal_variances:
    print("方差齐性假设成立，使用标准独立样本 t检验")
else:
    print("方差齐性假设不成立，使用 Welch's t检验")

# Run the selected test exactly once, so the Levene-based choice is the one reported
t_stat, p_value = stats.ttest_ind(group1, group2, equal_var=equal_variances)

print(f"t 统计量: {t_stat:.3f}, p 值: {p_value:.3f}")

### 代码3.4 

In [None]:
import numpy as np
from scipy import stats

# Two independent samples, seeded for reproducibility
np.random.seed(42)
group1 = np.random.normal(loc=50, scale=10, size=30)  # mean 50, standard deviation 10
group2 = np.random.normal(loc=55, scale=10, size=30)  # mean 55, standard deviation 10

# Pooled-variance independent-samples t-test; the returned p-value is two-sided
t_stat, p_two_tailed = stats.ttest_ind(group1, group2, equal_var=True)

# Left-tailed p-value (H1: group1 < group2): half the two-sided value when t is
# negative, otherwise its complement
half_p = p_two_tailed / 2
if t_stat < 0:
    p_one_tailed = half_p
else:
    p_one_tailed = 1 - half_p

print(f"t 统计量: {t_stat:.3f}")
print(f"双侧 p 值: {p_two_tailed:.3f}")
print(f"单侧 p 值（H1: group1 < group2）: {p_one_tailed:.3f}")

# Decision at significance level alpha = 0.05
alpha = 0.05
verdict = (
    "拒绝原假设，group1 的均值显著小于 group2。"
    if p_one_tailed < alpha
    else "未能拒绝原假设，group1 的均值未显著小于 group2。"
)
print(verdict)

### 代码3.5

In [None]:
import numpy as np
from scipy import stats

# Paired measurements of the same ten cases before and after an experiment
before = np.array([23, 21, 18, 25, 20, 17, 22, 24, 26, 23])
after = np.array([26, 24, 21, 28, 22, 20, 23, 25, 27, 26])

# Paired-samples (dependent) t-test, two-sided
paired_result = stats.ttest_rel(before, after)
t_stat, p_value = paired_result.statistic, paired_result.pvalue

print(f"T-statistic: {t_stat}")
print(f"P-value: {p_value}")

# Decision at the 0.05 level
print(
    "拒绝零假设：配对样本之间存在显著差异"
    if p_value < 0.05
    else "未拒绝零假设：配对样本之间无显著差异"
)

### 代码3.6

In [None]:
import numpy as np
from scipy import stats

# Simulated paired measurements of one variable before and after an intervention
before = np.array([100, 105, 110, 115, 120, 125, 130, 135, 140, 145,
                   150, 155, 160, 165, 170, 175, 180, 185, 190, 195])  # pre-intervention values
after = np.array([98, 107, 109, 113, 118, 126, 132, 138, 143, 147,
                  151, 157, 162, 167, 173, 178, 183, 188, 193, 199])   # post-intervention values

# Per-case change
differences = after - before

# A one-sample t-test of the differences against 0 is equivalent to a paired
# t-test; the returned p-value is two-sided
t_stat, p_value = stats.ttest_1samp(differences, 0)

# Right-tailed p-value (H1: mean difference > 0). Halving the two-sided value
# is only valid when t is positive; for t <= 0 the right-tail probability is
# the complement, 1 - p/2.
p_value_one_sided = p_value / 2 if t_stat > 0 else 1 - p_value / 2

print(f"T-statistic: {t_stat}")
print(f"P-value (one-sided): {p_value_one_sided}")

# Decision at the 0.05 level; the original wording for the t <= 0 case is kept
if p_value_one_sided < 0.05:
    print("拒绝零假设：实验后有显著增大")
elif t_stat > 0:
    print("未拒绝零假设：实验后没有显著增大")
else:
    print("不能拒绝零假设：实验后没有显著增大")

### 代码3.7

In [None]:
import numpy as np
from scipy import stats

# Observed and expected frequencies for three categories
observed = np.array([50, 30, 20])
expected = np.array([40, 40, 20])

# Chi-square goodness-of-fit test of the observed against the expected counts
chi2_stat, p_value = stats.chisquare(f_obs=observed, f_exp=expected)

print(f"Chi-square Statistic: {chi2_stat}")
print(f"P-value: {p_value}")

# Decision at the 0.05 level
differs = p_value < 0.05
print(
    "拒绝零假设：观察到的频率与预期的频率有显著差异。"
    if differs
    else "未拒绝零假设：观察到的频率与预期的频率没有显著差异。"
)

### 代码3.8

In [None]:
import numpy as np
from scipy import stats

# Scores for three independent groups of ten observations each
group1 = [23, 21, 18, 25, 30, 22, 28, 26, 24, 27]
group2 = [33, 31, 30, 32, 35, 34, 33, 31, 32, 30]
group3 = [41, 38, 35, 39, 42, 40, 41, 37, 43, 38]

# One-way ANOVA across the three groups
anova_result = stats.f_oneway(group1, group2, group3)
f_statistic, p_value = anova_result.statistic, anova_result.pvalue

print(f"F统计量: {f_statistic}")
print(f"P值: {p_value}")

# Decision at the 0.05 level
print(
    "拒绝零假设：不同组之间的均值存在显著差异"
    if p_value < 0.05
    else "未拒绝零假设：不同组之间的均值没有显著差异"
)

### 代码3.9

In [None]:
import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Scores for three independent groups of ten observations each
group1 = [23, 21, 18, 25, 30, 22, 28, 26, 24, 27]
group2 = [33, 31, 30, 32, 35, 34, 33, 31, 32, 30]
group3 = [41, 38, 35, 39, 42, 40, 41, 37, 43, 38]

# Long-format table: one row per observation, labelled with its group name
samples = {'group1': group1, 'group2': group2, 'group3': group3}
data = pd.DataFrame({
    'value': [score for scores in samples.values() for score in scores],
    'group': [label for label, scores in samples.items() for _ in scores],
})

# One-way ANOVA from an OLS model with group as the only factor
model = ols('value ~ group', data=data).fit()
anova_table = sm.stats.anova_lm(model, typ=1)
print(anova_table)

# Post-hoc pairwise comparisons between groups (Tukey HSD)
tukey = pairwise_tukeyhsd(endog=data['value'], groups=data['group'], alpha=0.05)
print(tukey)

### 代码3.10

In [None]:
import pandas as pd
from statsmodels.stats.anova import AnovaRM
from statsmodels.stats.multicomp import pairwise_tukeyhsd

from itertools import combinations
from scipy import stats

# Simulated data: five subjects, each measured at three time points
data = {
    'subject': [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5],
    'time': ['T1', 'T2', 'T3', 'T1', 'T2', 'T3', 'T1', 'T2', 'T3', 'T1', 'T2', 'T3', 'T1', 'T2', 'T3'],
    'score': [23, 25, 30, 30, 32, 34, 20, 22, 25, 25, 28, 30, 20, 21, 24]
}

# Long-format table: one row per subject and time point
df = pd.DataFrame(data)

# Repeated-measures ANOVA with time as the within-subject factor
aovrm = AnovaRM(df, 'score', 'subject', within=['time'])
result = aovrm.fit()
print(result)

# Post-hoc comparisons. Tukey HSD treats the groups as independent samples,
# which does not hold when the same subjects are measured at every time point.
# Paired t-tests on each pair of time points respect the within-subject
# pairing; Bonferroni correction controls for the number of comparisons.
wide = df.pivot(index='subject', columns='time', values='score')  # one row per subject
pairs = list(combinations(wide.columns, 2))

print("\nPaired t-test Post-Hoc Results (Bonferroni corrected):")
for first, second in pairs:
    t_stat, p_raw = stats.ttest_rel(wide[first], wide[second])
    p_adjusted = min(p_raw * len(pairs), 1.0)  # Bonferroni: scale by number of tests, cap at 1
    print(f"{first} vs {second}: t = {t_stat:.3f}, p = {p_raw:.4f}, "
          f"p (Bonferroni) = {p_adjusted:.4f}, reject H0: {p_adjusted < 0.05}")

### 代码3.11

In [None]:
import pandas as pd
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Simulated data with two factors, "treatment" and "gender"
# NOTE(review): the 9 scores are spread over 6 treatment x gender cells, so
# three cells (A_F, B_M, C_F) hold a single observation — the interaction
# term and the post-hoc comparisons rest on very little data.
data = {
    'treatment': ['A', 'A', 'A', 'B', 'B', 'B', 'C', 'C', 'C'],
    'gender': ['M', 'F', 'M', 'F', 'M', 'F', 'M', 'F', 'M'],
    'score': [23, 25, 30, 32, 34, 28, 35, 36, 40]
}
df = pd.DataFrame(data)

# OLS model with both main effects and their interaction, followed by the ANOVA table
model = ols('score ~ C(treatment) + C(gender) + C(treatment):C(gender)', data=df).fit()
anova_result = anova_lm(model)

print("ANOVA结果：")
print(anova_result)

# Post-hoc comparisons: label every row with its treatment_gender cell and
# run Tukey HSD across those cells
df['group'] = df['treatment'].str.cat(df['gender'], sep='_')
tukey_result = pairwise_tukeyhsd(df['score'], df['group'], alpha=0.05)

print("\nTukey HSD Post-Hoc Test Results:")
print(tukey_result)

### 代码4.1

In [None]:
import numpy as np
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import matplotlib

# Use SimHei so the Chinese title and axis labels render
matplotlib.rcParams.update({'font.sans-serif': ['SimHei']})

# Simulated reaction times in seconds, one per participant
reaction_times = np.array([0.8, 1.2, 0.9, 1.5, 1.1, 0.7, 1.3, 1.0, 1.4, 0.6])

# Simulated accuracy (0 to 1) for the same participants
accuracies = np.array([0.8, 0.90, 0.85, 0.93, 0.89, 0.62, 0.95, 0.90, 0.96, 0.65])

# Pearson correlation coefficient and its p-value
correlation, p_value = pearsonr(reaction_times, accuracies)

print(f"皮尔逊相关系数: {correlation:.3f}")
print(f"p 值: {p_value:.3f}")

# Significance decision at the 0.05 level
print(
    "反应时间和正确率之间的相关性是显著的。"
    if p_value < 0.05
    else "反应时间和正确率之间的相关性不显著。"
)

# Scatter plot of accuracy against reaction time
plt.scatter(reaction_times, accuracies)
plt.title('反应时间与正确率的相关性', fontsize=16)
plt.xlabel('反应时间 (秒)')
plt.ylabel('正确率')

# Write the figure to disk at 300 dpi, then display it
plt.savefig('corr.tiff', dpi=300)
plt.show()

### 代码4.2

In [None]:
import numpy as np
from scipy.stats import spearmanr
import matplotlib.pyplot as plt
import matplotlib

# Use SimHei so the Chinese title and axis labels render
matplotlib.rcParams.update({'font.sans-serif': ['SimHei']})

# Simulated exam grades, one per student
grades = np.array([85, 90, 78, 92, 88, 75, 80, 87, 91, 79])
# Simulated study time in hours for the same students
study_hours = np.array([10, 12, 8, 13, 11, 7, 9, 10.5, 12.5, 8.5])

# Spearman rank correlation coefficient and its p-value
correlation, p_value = spearmanr(grades, study_hours)

print(f"斯皮尔曼等级相关系数: {correlation:.3f}")
print(f"p 值: {p_value:.3f}")

# Significance decision at the 0.05 level
print(
    "成绩和学习时间之间的相关性是显著的。"
    if p_value < 0.05
    else "成绩和学习时间之间的相关性不显著。"
)

# Scatter plot of study time against grade
plt.scatter(grades, study_hours)
plt.xlabel('考试成绩')
plt.ylabel('学习时间 (小时)')
plt.title('考试成绩与学习时间的关系')

# Write the figure to disk at 300 dpi, then display it
plt.savefig('spearman.tiff', dpi=300)
plt.show()

### 代码5.1

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from matplotlib import font_manager

import os

# Preferred font file for the figure text; adjust the path for your system
font_path = "C:/Windows/Fonts/times.ttf"

# Use the font file when it exists; otherwise fall back to matplotlib's generic
# serif family so the cell still renders on systems without this file
if os.path.exists(font_path):
    prop = font_manager.FontProperties(fname=font_path)
else:
    prop = font_manager.FontProperties(family='serif')

# Simulated data: study time (hours) and exam score (points)
study_hours = np.array([2, 2, 5, 6, 10, 4, 6, 8, 9, 4])
exam_scores = np.array([45, 50, 65, 80, 95, 55, 70, 85, 90, 70])

# Simple linear regression of exam score on study time
slope, intercept, r_value, p_value, std_err = stats.linregress(study_hours, exam_scores)

# Data points with the fitted regression line
plt.scatter(study_hours, exam_scores, color='blue', label='Data points')
plt.plot(study_hours, slope * study_hours + intercept, color='red', label='Fitted line')
plt.title('Simple Linear Regression', fontsize=14, fontproperties=prop)
plt.xlabel('Study Hours', fontsize=12, fontproperties=prop)
plt.ylabel('Exam Scores', fontsize=12, fontproperties=prop)
plt.legend(prop=prop)

# Save as TIFF at 300 dpi under its own name; 'corr.tiff' is already written
# by the Pearson-correlation example and would be overwritten
plt.savefig('linear_regression.tiff', dpi=300)

# Display the figure
plt.show()


### 代码5.2

In [None]:
from sklearn.linear_model import LinearRegression
import numpy as np

# Simulated data: advertising spend, social-media influence and sales
# NOTE(review): social_media equals 0.1 * ad_spending + 4 for every row and
# sales equals ad_spending + 5, so the two predictors are perfectly collinear
# and the individual coefficients are not uniquely determined by this data.
ad_spending = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).reshape(-1, 1)  # advertising spend
social_media = np.array([5, 6, 7, 8, 9, 10, 11, 12, 13, 14]).reshape(-1, 1)  # social-media influence
sales = np.array([15, 25, 35, 45, 55, 65, 75, 85, 95, 105])  # sales

# Feature matrix: one column per predictor
X = np.hstack([ad_spending, social_media])

# Fit ordinary least squares (fit() returns the fitted estimator itself)
model = LinearRegression().fit(X, sales)

# Report the fitted parameters
print(f"Intercept: {model.intercept_}")
print(f"Coefficients: {model.coef_}")

### 代码5.3

In [None]:
from sklearn.linear_model import LogisticRegression
import numpy as np

# Simulated data: advertising spend and the resulting purchase decision
X = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).reshape(-1, 1)  # advertising spend, one column
y = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1, 1])  # purchase decision (0: no purchase, 1: purchase)

# Fit a logistic regression with default settings (fit() returns the fitted estimator itself)
logreg_model = LogisticRegression().fit(X, y)

# Report the fitted parameters
print(f"Intercept: {logreg_model.intercept_}")
print(f"Coefficients: {logreg_model.coef_}")