## 1.数据预处理

## 1.1患者组

In [1]:
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Read the Excel file

df = pd.read_excel('./rawdata(EF+deo).xlsx', sheet_name='160病人')

print(df.columns)
# Specify the behavior variables
behavior_variables = [ 'Stroop_incongruent_rt',
       'Stroop_interference effect_rt', 'Nogo_acc',
       'Switch_cost', 'RM-1,750_acc', 'RM-750_acc', 'DSBT_Span']
# Extract the behavior variables data from the DataFrame
behavior_data = df[behavior_variables]

# Check for missing values in the behavior variables before imputation
missing_values_before = behavior_data.isna().sum()
print("Missing values before imputation:")
print(missing_values_before)

# Use IterativeImputer (MICE) for multiple imputation with adjusted parameters
imputer = IterativeImputer(random_state=42, 
                           max_iter=10,  # Increase the maximum number of iterations 
                           tol=1e-3,     # Adjust the tolerance for convergence
                           verbose=2)    # Print out progress during imputation
imputed_behavior_data = imputer.fit_transform(behavior_data)

# Convert the imputed data back to a DataFrame
behavior_df = pd.DataFrame(imputed_behavior_data, columns=behavior_variables)

# Add the imputed behavior data to the original DataFrame
for variable in behavior_variables:
    df[variable] = behavior_df[variable]

# Check for missing values in the updated DataFrame
missing_values_after = df[behavior_variables].isna().sum()
print("Missing values after imputation:")
print(missing_values_after)

# Save the updated DataFrame to a new Excel file  
df.to_excel('./table/预处理后的rawdata-160名患者.xlsx', index=False)

Index(['RPM', 'Age', 'Education_years', 'BMI', 'SES', 'Gender', 'Ethnic',
       'Residence', 'Only_child', 'Smoking_status', 'Alcohol_consumption',
       'Employed', 'Marital_status', 'Stroop_incongruent_rt',
       'Stroop_interference effect_rt', 'Nogo_acc', 'Switch_cost',
       'RM-1,750_acc', 'RM-750_acc', 'DSBT_Span'],
      dtype='object')
Missing values before imputation:
Stroop_incongruent_rt            0
Stroop_interference effect_rt    0
Nogo_acc                         0
Switch_cost                      0
RM-1,750_acc                     0
RM-750_acc                       0
DSBT_Span                        0
dtype: int64
[IterativeImputer] Completing matrix with shape (114, 7)
[IterativeImputer] Ending imputation round 1/10, elapsed time 0.00
[IterativeImputer] Change: 0.0, scaled tolerance: 1.7994943074003795 
[IterativeImputer] Early stopping criterion reached.
Missing values after imputation:
Stroop_incongruent_rt            0
Stroop_interference effect_rt    0
Nogo_ac

## 1.2对照组

In [2]:
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Read the Excel file
df = pd.read_excel('./rawdata(EF+deo).xlsx', sheet_name='167健康')

# Specify the behavior variables
behavior_variables =  [ 'Stroop_incongruent_rt',
       'Stroop_interference effect_rt', 'Nogo_acc',
       'Switch_cost', 'RM-1,750_acc', 'RM-750_acc', 'DSBT_Span']

# Extract the behavior variables data from the DataFrame
behavior_data = df[behavior_variables]


# Check for missing values in the behavior variables before imputation
missing_values_before = behavior_data.isna().sum()
print("Missing values before imputation:")
print(missing_values_before)

# Use IterativeImputer (MICE) for multiple imputation with adjusted parameters
imputer = IterativeImputer(random_state=42, 
                           max_iter=10,  # Increase the maximum number of iterations
                           tol=1e-3,     # Adjust the tolerance for convergence
                           verbose=2)    # Print out progress during imputation
imputed_behavior_data = imputer.fit_transform(behavior_data)

# Convert the imputed data back to a DataFrame
behavior_df = pd.DataFrame(imputed_behavior_data, columns=behavior_variables)

# Add the imputed behavior data to the original DataFrame
for variable in behavior_variables:
    df[variable] = behavior_df[variable]

# Check for missing values in the updated DataFrame
missing_values = df[behavior_variables].isna().sum()
print("Missing values after imputation:")
print(missing_values)

df.to_excel('./table/预处理后的rawdata-167健康.xlsx', index=False)

Missing values before imputation:
Stroop_incongruent_rt            0
Stroop_interference effect_rt    0
Nogo_acc                         0
Switch_cost                      0
RM-1,750_acc                     0
RM-750_acc                       0
DSBT_Span                        0
dtype: int64
[IterativeImputer] Completing matrix with shape (114, 7)
[IterativeImputer] Ending imputation round 1/10, elapsed time 0.00
[IterativeImputer] Change: 0.0, scaled tolerance: 7.793354957160343 
[IterativeImputer] Early stopping criterion reached.
Missing values after imputation:
Stroop_incongruent_rt            0
Stroop_interference effect_rt    0
Nogo_acc                         0
Switch_cost                      0
RM-1,750_acc                     0
RM-750_acc                       0
DSBT_Span                        0
dtype: int64


In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

# 读取Excel文件
patients_df = pd.read_excel('./table/预处理后的rawdata-160名患者.xlsx')
control_df = pd.read_excel('./table/预处理后的rawdata-167健康.xlsx')
# 定义变量
continuous_variables = ['Age','Education_years']
categorical_variables = ['Gender']
behavior_variables = ['Stroop_incongruent_rt', 'Stroop_interference effect_rt', 'Nogo_acc', 'Switch_cost', 'RM-1,750_acc', 'RM-750_acc', 'DSBT_Span']

# 对反应时变量取倒数
patients_df['Stroop_incongruent_rt'] = 1 / patients_df['Stroop_incongruent_rt']
patients_df['Switch_cost'] = 1 / patients_df['Switch_cost']
control_df['Stroop_incongruent_rt'] = 1 / control_df['Stroop_incongruent_rt']
control_df['Switch_cost'] = 1 / control_df['Switch_cost']

# One-hot encode categorical variables
def one_hot_encode(df, categorical_vars):
    for var in categorical_vars:
        dummies = pd.get_dummies(df[var], prefix=var, drop_first=True)
        df = pd.concat([df, dummies], axis=1)
        df.drop(var, axis=1, inplace=True)
    return df

patients_df = one_hot_encode(patients_df, categorical_variables)
control_df = one_hot_encode(control_df, categorical_variables)


# 标准化连续变量
scaler = StandardScaler()
control_df[continuous_variables] = scaler.fit_transform(control_df[continuous_variables])
patients_df[continuous_variables] = scaler.transform(patients_df[continuous_variables])

# 从对照组DataFrame中提取行为变量数据
control_data = control_df[behavior_variables]

# 使用对照组数据拟合StandardScaler
scaler = StandardScaler()
scaler.fit(control_data)

# 使用对照组的标准化参数标准化患者组数据
standardized_patient_data = scaler.transform(patients_df[behavior_variables])
standardized_patient_df = pd.DataFrame(standardized_patient_data, columns=behavior_variables)

# 从数据框中获取所有人口学变量（已独热编码和标准化）
demographic_variables = [col for col in control_df.columns if col not in behavior_variables]

# 使用对照组数据拟合回归模型校正人口学效应
corrected_patient_data = pd.DataFrame()

for variable in behavior_variables:
    model = LinearRegression()
    model.fit(control_df[demographic_variables], control_df[variable])
    correction = model.predict(patients_df[demographic_variables])
    corrected_values = standardized_patient_df[variable] - correction
    corrected_patient_data[variable] = corrected_values

# 将校正后并标准化后的数据保存为新的Excel文件
corrected_patient_df = pd.DataFrame(corrected_patient_data)
corrected_patient_df.to_excel('./table/校正并标准化后的患者行为变量数据.xlsx', index=False)

# 打印校正并标准化后的DataFrame
print(corrected_patient_df.head())

print("校正并标准化完成，数据已保存到'./table/校正并标准化后的患者行为变量数据.xlsx'.")

# 计算校正并标准化后的均值和标准差
corrected_patient_means = corrected_patient_df.mean().round(2)
corrected_patient_stds = corrected_patient_df.std().round(2)

# 创建一个DataFrame来存储结果，以“均值±标准差”格式显示
summary_df = pd.DataFrame({
    'Corrected and Standardized Patient': corrected_patient_means.astype(str) + ' ± ' + corrected_patient_stds.astype(str)
}, index=behavior_variables)

# 打印结果
print(summary_df)

# 保存结果为Excel文件
summary_df.to_excel('./table/校正并标准化后的患者行为变量均值和标准差.xlsx')

print("校正并标准化后的均值和标准差已保存到'./table/校正并标准化后的行为变量均值和标准差.xlsx'.")

   Stroop_incongruent_rt  Stroop_interference effect_rt  Nogo_acc  \
0               1.061265                      -8.630725 -0.356580   
1              -0.647744                      50.981462 -1.417566   
2              -0.686216                      22.200395 -0.662635   
3              -0.462113                      -5.064680  0.433945   
4              -0.305362                      82.170392 -1.291484   

   Switch_cost  RM-1,750_acc  RM-750_acc  DSBT_Span  
0     0.129795     -0.573291   -0.693876  -3.811639  
1     3.430318     -1.191951   -1.524353  -5.719753  
2     0.232100     -0.823193   -0.924130  -4.602046  
3     0.167289     -1.148885   -0.815222  -4.589600  
4     0.163447     -0.183362   -0.236175  -5.412409  
校正并标准化完成，数据已保存到'./table/校正并标准化后的患者行为变量数据.xlsx'.
                              Corrected and Standardized Patient
Stroop_incongruent_rt                                0.04 ± 0.99
Stroop_interference effect_rt                      23.77 ± 43.83
Nogo_acc          

# 数据分析全部完整

In [4]:
print('数据分析全部完成！')

数据分析全部完成！
