## Get final dataframe for excel file main page

In [9]:
import numpy as np
import os
import pandas as pd

NA_list = ['NA1', 'NA2']
for NA in NA_list:
    base_dir = f'./results_{NA}'
    results_df = pd.DataFrame(columns=[
        'auc_score', 'acc_score', 'fpr_score', 'precision_score',
        'auc_score_original', 'acc_score_original', 'fpr_score_original', 'precision_score_original',
        'Are NA1 NA2 Different?', "Cohen's d (NA1 vs NA2)", f"Cohen's d ({NA} vs Response)", f"Cohen's d ({NA} vs Problem)",
        "Cohen's d (Response vs Problem)", "Tukey_HSD"
    ])

    for subdir, dirs, files in os.walk(base_dir):
        results_found = False
        stats_found = False
        folder_name = os.path.basename(subdir)
        if folder_name not in results_df.index:
            results_df.loc[folder_name] = pd.Series({"auc_score": "NA", "acc_score": "NA", "fpr_score": "NA", "precision_score": "NA", 
                                                    "auc_score_original": "NA", "acc_score_original": "NA", "fpr_score_original": "NA", "precision_score_original": "NA",
                                                    'Are NA1 NA2 Different?': "NA", "Cohen's d (NA1 vs NA2)": "NA", f"Cohen's d ({NA} vs Response)": "NA", f"Cohen's d ({NA} vs Problem)": "NA",
        "Cohen's d (Response vs Problem)": "NA", "Tukey_HSD": "NA"})
        for file in files:
            try:

                if file.startswith('best_params'):
                    full_file_path = os.path.join(subdir, file)
                    with open(full_file_path, 'r') as f:
                        lines = f.readlines()
                        if len(lines) > 5:  # Ensure there are enough lines to process
                            try:
                                # Attempt to extract and process AUC scores
                                auc_scores = np.array([float(x.strip()) for x in lines[2].split(': ')[1].replace('[', '').replace(']', '').split(',')]) * 100
                                acc_scores = np.array([float(x.strip()) for x in lines[3].split(': ')[1].replace('[', '').replace(']', '').split(',')]) * 100
                                fpr_scores = np.array([float(x.strip()) for x in lines[4].split(': ')[1].replace('[', '').replace(']', '').split(',')]) * 100
                                precision_scores = np.array([float(x.strip()) for x in lines[5].split(': ')[1].replace('[', '').replace(']', '').split(',')]) * 100

                                # Calculate mean and standard deviation and update the DataFrame
                                results_df.loc[folder_name, 'auc_score'] = f"{np.mean(auc_scores):.2f} ± {np.std(auc_scores):.2f}"
                                results_df.loc[folder_name, 'acc_score'] = f"{np.mean(acc_scores):.2f} ± {np.std(acc_scores):.2f}"
                                results_df.loc[folder_name, 'fpr_score'] = f"{np.mean(fpr_scores):.2f} ± {np.std(fpr_scores):.2f}"
                                results_df.loc[folder_name, 'precision_score'] = f"{np.mean(precision_scores):.2f} ± {np.std(precision_scores):.2f}"
                                results_found = True
                            except Exception as e:
                                print(f"Error processing file {file} in {subdir}: {e}")
                        else:
                            print(f"Insufficient data in file {file} in {subdir}")
                            

                elif 'Original_results' in file:
                    full_file_path = os.path.join(subdir, file)
                    with open(full_file_path, 'r') as f:
                        lines = f.readlines()
                        if len(lines) > 4:  # Ensure there are enough lines to process
                            try:
                                auc_scores_original = np.array([float(x.strip()) for x in lines[1].strip().split(': ')[1].replace('[', '').replace(']', '').split(',')])*100
                                acc_scores_original = np.array([float(x.strip()) for x in lines[2].strip().split(': ')[1].replace('[', '').replace(']', '').split(',')])*100
                                fpr_scores_original = np.array([float(x.strip()) for x in lines[3].strip().split(': ')[1].replace('[', '').replace(']', '').split(',')])*100
                                precision_scores_original = np.array([float(x.strip()) for x in lines[4].strip().split(': ')[1].replace('[', '').replace(']', '').split(',')])*100
                                
                                results_df.loc[folder_name, 'auc_score_original'] = f"{np.mean(auc_scores_original):.2f} ± {np.std(auc_scores_original):.2f}"
                                results_df.loc[folder_name, 'acc_score_original'] = f"{np.mean(acc_scores_original):.2f} ± {np.std(acc_scores_original):.2f}"
                                results_df.loc[folder_name, 'fpr_score_original'] = f"{np.mean(fpr_scores_original):.2f} ± {np.std(fpr_scores_original):.2f}"
                                results_df.loc[folder_name, 'precision_score_original'] = f"{np.mean(precision_scores_original):.2f} ± {np.std(precision_scores_original):.2f}"
                                results_found = True
                            except Exception as e:
                                print(f"Error processing file {file} in {subdir}: {e}")
                                # results_df.loc[folder_name][['auc_score_original','acc_score_original','fpr_score_original','precision_score_original']] = ["NA"] * 4
                        else:
                            print(f"Insufficient data in file {file} in {subdir}")
                            # results_df.loc[folder_name] = ["NA"] * len(results_df.columns) 
                elif 'info_comparison' in file:
                    full_file_path = os.path.join(subdir, file)
                    info_df = pd.read_csv(full_file_path)
                    results_df.loc[folder_name, 'Are NA1 NA2 Different?'] = info_df['Are NA1 NA2 Different?'].iloc[0]
                    results_df.loc[folder_name, f"Cohen's d (NA1 vs NA2)"] = info_df["Cohen's d (NA1 vs NA2)"].iloc[0]
                    results_df.loc[folder_name, f"Cohen's d ({NA} vs Response)"] = info_df[f"Cohen's d ({NA} vs Response)"].iloc[0]
                    results_df.loc[folder_name, f"Cohen's d ({NA} vs Problem)"] = info_df[f"Cohen's d ({NA} vs Problem)"].iloc[0]
                    results_df.loc[folder_name, f"Cohen's d (Response vs Problem)"] = info_df[f"Cohen's d (Response vs Problem)"].iloc[0]
                    results_df.loc[folder_name, "Tukey_HSD"] = info_df["Tukey_HSD"].iloc[0]
                    stats_found = True
                else:
                    continue
            except:
                continue
    results_df.to_csv(f'./{NA}_11results.csv', encoding='utf-8-sig', index=True)


In [10]:
import pandas as pd
df_NA1= pd.read_csv("./NA1_11results.csv", encoding='utf-8') 
df_NA1 = df_NA1.iloc[1:,:]
df_NA2= pd.read_csv("./NA2_11results.csv", encoding='utf-8')
df_NA2 = df_NA2.iloc[1:,:]

In [11]:

def parse_mean_std(value):
    mean, std = value.split('±')
    return float(mean.strip()), float(std.strip())

criteria_list = ['auc', 'acc', 'fpr', 'precision'] 
results = {criteria: [] for criteria in criteria_list}

for index, row in df_NA1.iterrows():
    for criteria in criteria_list:
        mean, std = parse_mean_std(row[f'{criteria}_score'])
        original_mean, original_std = parse_mean_std(row[f'{criteria}_score_original'])
        abs_diff = abs(mean - original_mean)
        sum_std = std + original_std
        result = abs_diff - sum_std
  
        comparison_result = 'NE' if result > 0 else 'E'
        results[criteria].append(comparison_result)  # Store only the categorical result

for criteria in criteria_list:
    df_NA1[f'{criteria}_condition_met'] = results[criteria]
df_NA1.to_csv(f'./NA1_11results_condition.csv', encoding='utf-8-sig', index=False)


In [12]:
criteria_list = ['auc', 'acc', 'fpr', 'precision'] 
results = {criteria: [] for criteria in criteria_list}

for index, row in df_NA2.iterrows():
    for criteria in criteria_list:
        print('index:', index, row, criteria)
        mean, std = parse_mean_std(row[f'{criteria}_score'])
        original_mean, original_std = parse_mean_std(row[f'{criteria}_score_original'])
        abs_diff = abs(mean - original_mean)
        sum_std = std + original_std
        result = abs_diff - sum_std
  
        comparison_result = 'NE' if result > 0 else 'E'
        results[criteria].append(comparison_result)  # Store only the categorical result

for criteria in criteria_list:
    df_NA2[f'{criteria}_condition_met'] = results[criteria]
df_NA2.to_csv(f'./NA2_11results_condition.csv', encoding='utf-8-sig', index=False)


index: 1 Unnamed: 0                                                                        경제
auc_score                                                               87.46 ± 2.16
acc_score                                                               78.05 ± 3.08
fpr_score                                                               14.63 ± 6.19
precision_score                                                         76.07 ± 4.19
auc_score_original                                                      69.39 ± 4.19
acc_score_original                                                      64.88 ± 4.53
fpr_score_original                                                     44.88 ± 11.17
precision_score_original                                                70.40 ± 9.75
Are NA1 NA2 Different?                                                         False
Cohen's d (NA1 vs NA2)                                                      0.174876
Cohen's d (NA2 vs Response)                             

AttributeError: 'float' object has no attribute 'split'

## delete files from all folders

In [None]:
do not run this cell carelessly

In [None]:
# import os

# # Define base directories
# base_dirs = [
#     '/root/jw/EduCDM/examples/IRT/GD/results_NA1',
#     '/root/jw/EduCDM/examples/IRT/GD/results_NA2'
# ]

# for base_dir in base_dirs:
#     actual_folders = [folder for folder in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, folder))]
#     for subject in actual_folders:
#         subject_path = os.path.join(base_dir, subject)
#         for file_name in os.listdir(subject_path):
#             #if file.startswith('best_params_') and file.endswith('.txt'):
#             if file_name in ['NA2_info_comparison.csv', 'NA1_info_comparison.csv']:
#                 file_path = os.path.join(subject_path, file_name)
#                 # Delete the file
#                 os.remove(file_path)
#                 print(f"Deleted file: {file_path}")
