In [1]:
#导入所需的模块
import pandas as pd
import numpy as np
import pyDOE2 as doe
import statsmodels.api as sm

# Load the games dataset.
df = pd.read_csv("dataset/games.csv")

# Factors under study; every factor is handled as a two-level factor.
factors = ["mac", "linux", "price_final"]
levels = [2] * len(factors)  # two levels per factor

# Two-level full-factorial design (2^3 = 8 runs), coded as -1 / +1.
design = doe.ff2n(len(factors))
print(design)

[[-1. -1. -1.]
 [ 1. -1. -1.]
 [-1.  1. -1.]
 [ 1.  1. -1.]
 [-1. -1.  1.]
 [ 1. -1.  1.]
 [-1.  1.  1.]
 [ 1.  1.  1.]]


In [2]:
# Preview the first rows of the loaded dataset.
df.head()

Unnamed: 0,app_id,title,date_release,win,mac,linux,rating,positive_ratio,user_reviews,price_final,price_original,discount,steam_deck
0,10090,Call of Duty: World at War,2008-11-18,True,False,False,Very Positive,92,37039,19.99,19.99,0.0,True
1,13500,Prince of Persia: Warrior Within™,2008-11-21,True,False,False,Very Positive,84,2199,9.99,9.99,0.0,True
2,22364,BRINK: Agents of Change,2011-08-03,True,False,False,Positive,85,21,2.99,2.99,0.0,True
3,113020,Monaco: What's Yours Is Mine,2013-04-24,True,True,True,Very Positive,92,3722,14.99,14.99,0.0,True
4,226560,Escape Dead Island,2014-11-18,True,False,False,Mixed,61,873,14.99,14.99,0.0,True


In [3]:
# Recode the design levels from -1/+1 to 0/1.
# The mapping keys off the sign instead of using arithmetic, so the cell is
# idempotent: an arithmetic recoding such as `(design + 1) / 2` would turn an
# already-recoded 0/1 design into 0.5/1 when the cell is run a second time.
design = np.where(np.asarray(design) > 0, 1.0, 0.0)
print(design)

[[0. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 1. 0.]
 [0. 0. 1.]
 [1. 0. 1.]
 [0. 1. 1.]
 [1. 1. 1.]]


In [4]:
# Wrap the design array in a DataFrame whose columns carry the factor names.
design = pd.DataFrame(data=design, columns=factors)
print(design)

   mac  linux  price_final
0  0.0    0.0          0.0
1  1.0    0.0          0.0
2  0.0    1.0          0.0
3  1.0    1.0          0.0
4  0.0    0.0          1.0
5  1.0    0.0          1.0
6  0.0    1.0          1.0
7  1.0    1.0          1.0


In [5]:
# Run the experiment: for every design row, draw a random sample of games that
# match the row's factor levels and record the sample's mean positive_ratio.
PRICE_THRESHOLD = 30  # price_final level 0 means < 30, level 1 means >= 30
SAMPLE_SIZE = 20      # number of games drawn per design row
RANDOM_SEED = 42      # fixed seed so the drawn samples are reproducible

results = []           # mean positive_ratio per design row
sample_data_list = []  # sampled rows per design row
for i in range(len(design)):
    row = design.iloc[i]  # factor levels (0/1) for this run

    # Translate the coded price level into the actual price condition.
    if row["price_final"] == 0:
        price_mask = df["price_final"] < PRICE_THRESHOLD
    else:
        price_mask = df["price_final"] >= PRICE_THRESHOLD

    filtered_data = df[
        (df["mac"] == row["mac"]) & (df["linux"] == row["linux"]) & price_mask
    ]

    # Fail with a readable message instead of pandas' generic sampling error
    # when a factor combination does not have enough matching games.
    if len(filtered_data) < SAMPLE_SIZE:
        raise ValueError(
            f"Design row {i} ({row.to_dict()}) matches only "
            f"{len(filtered_data)} rows; {SAMPLE_SIZE} are required."
        )

    sample_data = filtered_data.sample(SAMPLE_SIZE, random_state=RANDOM_SEED)
    sample_data_list.append(sample_data)
    mean_positive_ratio = sample_data["positive_ratio"].mean()
    results.append(mean_positive_ratio)

# All sampled rows stacked into a single frame.
concat_df = pd.concat(sample_data_list, axis=0, join='outer')

In [6]:
# Show the mean positive_ratio recorded for each design row.
results

[72.1, 82.15, 79.5, 72.1, 77.65, 84.95, 83.85, 77.05]

In [7]:
# Reduce each sampled frame to its positive_ratio column.
# The isinstance guard keeps the cell idempotent: once sample_data_list holds
# Series, indexing them with "positive_ratio" again would raise a KeyError, so
# entries that are no longer DataFrames are passed through unchanged.
sample_data_list_temp = [
    sample["positive_ratio"] if isinstance(sample, pd.DataFrame) else sample
    for sample in sample_data_list
]
sample_data_list = sample_data_list_temp

In [8]:
# Display the per-run positive_ratio samples.
sample_data_list

[24513     70
 41917     61
 32338    100
 47508     83
 41673     29
 38391     36
 43221     71
 26992     75
 8378      95
 23910     51
 26433     48
 24065     83
 22961     93
 30222     53
 37101     57
 43110     91
 11818     93
 30491     86
 22021     79
 18228     88
 Name: positive_ratio, dtype: int64,
 36183     86
 16608     91
 2381      89
 39907     97
 27051     80
 13835     84
 1732      89
 20376     89
 20548     91
 17626     66
 27665     89
 27421     73
 22277     89
 23973     70
 18364     91
 8345      82
 33864    100
 21123     78
 30587     45
 39085     64
 Name: positive_ratio, dtype: int64,
 29622     71
 42195     66
 7625      88
 16182     61
 32203     75
 32968     78
 44254     72
 21502     65
 24829     92
 5439      96
 44235     66
 48490     90
 14708     97
 36663    100
 32881     69
 24489    100
 18260     90
 43198     68
 42171     83
 4349      63
 Name: positive_ratio, dtype: int64,
 45861     89
 36021     58
 19822     84
 47491 

In [9]:
# Take the first entry of sample_data_list for a spot check.
a = sample_data_list[0]

In [10]:
# Mean of the entry selected above.
a.mean()

72.1

In [32]:
# Sanity check: select the games priced below 50.
dfcheap = df.loc[df["price_final"] < 50]

In [33]:
# Games priced at 50 or more. `>=` complements the `< 50` filter used for the
# cheap group, so games priced exactly 50 are not dropped from the comparison.
dfexpensive = df[df["price_final"]>=50]

In [34]:
# Mean positive_ratio of the cheap group, then of the expensive group.
print(
    dfcheap["positive_ratio"].mean(),
    dfexpensive["positive_ratio"].mean(),
    sep="\n",
)

76.92442125315775
72.25108225108225
