In [4]:
import os
import pandas as pd
import re

In [5]:
file_path = [i for i in os.listdir('.') if i.endswith('.md') and not i.startswith('8')]
file_path

['70b_prompt_chat_sft_x1xxx-BFI.md',
 '70b_prompt_chat_dpo_xx1xx-IPIP-NEO.md',
 '70b_prompt_chat_sft_x0xxx-BFI.md',
 '70b_prompt_chat_dpo_0xxxx-IPIP-NEO.md',
 'gpt-4o-mini_prompt_v1_xxx0x-BFI.md',
 'gpt-4o-mini_prompt_chat_xx1xx-IPIP-NEO.md',
 'gpt-4o-mini_prompt_v1_xx1xx-IPIP-NEO.md',
 'gpt-4o-mini_prompt_chat_0xxxx-IPIP-NEO.md',
 'gpt-4o-mini_prompt_v1_0xxxx-IPIP-NEO.md',
 '70b_prompt_chat_dpo_1xxxx-IPIP-NEO.md',
 'gpt-4o-mini_prompt_v1_xxxx0-IPIP-NEO.md',
 'gpt-4o-mini_prompt_chat_xxxx0-IPIP-NEO.md',
 'gpt-4o-mini_prompt_v1_1xxxx-IPIP-NEO.md',
 '70b_prompt_chat_dpo_xxxx0-IPIP-NEO.md',
 'gpt-4o-mini_prompt_v1_0xxxx-BFI.md',
 'gpt-4o-mini_prompt_chat_1xxxx-IPIP-NEO.md',
 '70b_prompt_chat_dpo_0xxxx-BFI.md',
 'gpt-4o-mini_prompt_chat_1xxxx-BFI.md',
 'gpt-4o-mini_prompt_chat_xx0xx-BFI.md',
 'gpt-4o-mini_prompt_chat_xxxx0-BFI.md',
 'gpt-4o-mini_prompt_chat_x0xxx-BFI.md',
 'gpt-4o-mini_prompt_chat_x1xxx-BFI.md',
 '70b_prompt_chat_sft_1xxxx-BFI.md',
 '70b_prompt_chat_dpo_x1xxx-IPIP-NEO.md',

In [19]:
def extract_personality_data(file_path):
    # Read first 10 lines of the file
    with open(file_path, 'r') as file:
        lines = [next(file) for _ in range(10)]
    
    # Initialize lists to store data
    traits = []
    llama_scores = []
    crowd_scores = []
    
    # Regular expression pattern to match scores
    pattern = r"(\d+\.\d+)\s*\$\\pm\$\s*(\d+\.\d+)"
    
    # Process each line
    for line in lines:
        if '|' in line:  # Skip lines without |
            columns = line.split('|')
            if len(columns) >= 4:  # Ensure line has enough columns
                trait = columns[1].strip()
                if trait not in ['Category', '---']:  # Skip header and separator rows
                    traits.append(trait)
                    
                    # Extract llama scores
                    llama_match = re.search(pattern, columns[2])
                    if llama_match:
                        mean, std = llama_match.groups()
                        llama_scores.append(f"{mean} $\pm$ {std}")
                    
                    # Extract crowd scores
                    crowd_match = re.search(pattern, columns[3])
                    if crowd_match:
                        mean, std = crowd_match.groups()
                        crowd_scores.append(f"{mean} ± {std}")
    # print(file_path)
    # print(traits[1:])
    # print(llama_scores)
    # # # Create DataFrame
    # df = pd.DataFrame({
    #     'Category': traits,
    #     'llama3_70b (n = 5)': llama_scores,
    #     # 'Crowd (n = 6076)': crowd_scores
    # })
    
    return traits[1:], llama_scores

def parse_filenames(filenames):
    data = []
    
    for filename in filenames:
        # Extract model
        model_match = re.match(r'(70b|gpt-4o-mini)_', filename)
        model = model_match.group(1) if model_match else None
        
        # Extract prompt pattern part
        if model == '70b':
            # 匹配 prompt_chat_sft_ 或 prompt_chat_dpo_ 加上后面的 x/0/1 序列
            prompt_match = re.search(r'(prompt_chat_(?:sft|dpo)_[x01]+)', filename)
        else:
            # 匹配 prompt_v1 加上后面的序列
            prompt_match = re.search(r'(prompt_(?:v1|chat)_[x01]+)', filename)
        prompt = prompt_match.group(1) if prompt_match else None
        
        # Extract training type
        training_type = 'sft' if 'sft' in filename else 'dpo' if 'dpo' in filename else None
        
        # Extract personality test type
        personality_test = 'BFI' if 'BFI' in filename else 'IPIP-NEO' if 'IPIP-NEO' in filename else None
        
        traits, scores = extract_personality_data(filename)
        
        base_info = {
            'model': model,
            'train': training_type,
            'mode_mode': prompt,
            'questionnaire': personality_test,
        }
        
        trait_scores = {}
        for i in range(len(traits)):
            trait_scores[traits[i]] = scores[i]
            
        combined_dict = base_info.copy()
        combined_dict.update(trait_scores)
        
        data.append(combined_dict)
    
    df = pd.DataFrame(data)
    return df

In [20]:
df = parse_filenames(file_path)
df
# 0 is high, 1 is low

Unnamed: 0,model,train,mode_mode,questionnaire,Extraversion,Agreeableness,Conscientiousness,Neuroticism,Openness
0,70b,sft,prompt_chat_sft_x1xxx,BFI,3.7 $\pm$ 0.2,3.1 $\pm$ 0.3,2.0 $\pm$ 0.0,2.8 $\pm$ 0.1,3.4 $\pm$ 0.1
1,70b,dpo,prompt_chat_dpo_xx1xx,IPIP-NEO,2.5 $\pm$ 0.1,3.9 $\pm$ 0.0,3.6 $\pm$ 0.0,2.4 $\pm$ 0.1,3.0 $\pm$ 0.0
2,70b,sft,prompt_chat_sft_x0xxx,BFI,3.5 $\pm$ 0.1,4.0 $\pm$ 0.1,3.9 $\pm$ 0.2,2.4 $\pm$ 0.2,3.8 $\pm$ 0.2
3,70b,dpo,prompt_chat_dpo_0xxxx,IPIP-NEO,4.0 $\pm$ 0.1,3.8 $\pm$ 0.1,3.5 $\pm$ 0.1,2.1 $\pm$ 0.1,3.6 $\pm$ 0.1
4,gpt-4o-mini,,prompt_v1_xxx0x,BFI,3.7 $\pm$ 0.1,4.4 $\pm$ 0.0,4.0 $\pm$ 0.1,2.7 $\pm$ 0.1,4.4 $\pm$ 0.1
...,...,...,...,...,...,...,...,...,...
57,gpt-4o-mini,,prompt_chat_xxx0x,BFI,3.7 $\pm$ 0.1,4.4 $\pm$ 0.0,3.9 $\pm$ 0.2,2.7 $\pm$ 0.1,4.4 $\pm$ 0.1
58,70b,dpo,prompt_chat_dpo_xxx1x,IPIP-NEO,3.3 $\pm$ 0.1,1.9 $\pm$ 0.1,2.6 $\pm$ 0.0,2.7 $\pm$ 0.1,2.7 $\pm$ 0.1
59,70b,sft,prompt_chat_sft_x1xxx,IPIP-NEO,3.5 $\pm$ 0.1,2.6 $\pm$ 0.2,2.0 $\pm$ 0.1,3.1 $\pm$ 0.1,3.1 $\pm$ 0.1
60,gpt-4o-mini,,prompt_chat_xxxx1,IPIP-NEO,3.6 $\pm$ 0.1,3.9 $\pm$ 0.0,3.7 $\pm$ 0.1,2.9 $\pm$ 0.1,3.8 $\pm$ 0.1


In [21]:
df = df.reindex(columns=['model', 'train', 'mode_mode', 'questionnaire', 'Openness', 'Conscientiousness', 'Extraversion', 'Agreeableness', 'Neuroticism'])
df.head()

Unnamed: 0,model,train,mode_mode,questionnaire,Openness,Conscientiousness,Extraversion,Agreeableness,Neuroticism
0,70b,sft,prompt_chat_sft_x1xxx,BFI,3.4 $\pm$ 0.1,2.0 $\pm$ 0.0,3.7 $\pm$ 0.2,3.1 $\pm$ 0.3,2.8 $\pm$ 0.1
1,70b,dpo,prompt_chat_dpo_xx1xx,IPIP-NEO,3.0 $\pm$ 0.0,3.6 $\pm$ 0.0,2.5 $\pm$ 0.1,3.9 $\pm$ 0.0,2.4 $\pm$ 0.1
2,70b,sft,prompt_chat_sft_x0xxx,BFI,3.8 $\pm$ 0.2,3.9 $\pm$ 0.2,3.5 $\pm$ 0.1,4.0 $\pm$ 0.1,2.4 $\pm$ 0.2
3,70b,dpo,prompt_chat_dpo_0xxxx,IPIP-NEO,3.6 $\pm$ 0.1,3.5 $\pm$ 0.1,4.0 $\pm$ 0.1,3.8 $\pm$ 0.1,2.1 $\pm$ 0.1
4,gpt-4o-mini,,prompt_v1_xxx0x,BFI,4.4 $\pm$ 0.1,4.0 $\pm$ 0.1,3.7 $\pm$ 0.1,4.4 $\pm$ 0.0,2.7 $\pm$ 0.1


In [35]:
def find_non_x_index(string):
    for i, char in enumerate(string):
        if char != 'x':
            return i
    raise ValueError("No character other than 'x' found in the string.")

labels = ['Openness', 'Conscientiousness', 'Extraversion', 'Agreeableness', 'Neuroticism']

data = []
for questionnaire in ['BFI', 'IPIP-NEO']:
    for model, model_abbr in [['LLaMA3-70B', '70b'], ['GPT-4o-Mini', 'gpt-4o-mini']]:
        for method, method_abbr in [['Prompt V1', 'prompt_v1_'], ["Prompt Chat", 'prompt_chat_'], ['SFT', 'prompt_chat_sft_'], ['DPO', 'prompt_chat_dpo_']]:
            if method_abbr == "prompt_chat_" and model_abbr == "llama3_8b":
                continue
            row = [questionnaire, model, method]
            if method != "Direct":
                row_avg_high, row_std_high = 0, 0
                row_avg_low, row_std_low = 0, 0
                for mode in ["0xxxx", '1xxxx', 'x0xxx', 'x1xxx', 'xx0xx', 'xx1xx', 'xxx0x', 'xxx1x', 'xxxx0', 'xxxx1']:
                    label = labels[find_non_x_index(mode)]
                    val = df.loc[((df['questionnaire'] == questionnaire) & (df['mode_mode'] == f'{method_abbr}{mode}')) & (df['model'] == model_abbr)][[label]]
                    if val is None or val.empty:
                        continue
                    else:
                        val = val.values.tolist()[0][0]
                    row.append(val)
                    val_avg, val_std = val.split(r" $\pm$ ")
                    if "0" in mode:
                        row_avg_high += float(val_avg)
                        row_std_high += float(val_std)
                    elif "1" in mode:
                        row_avg_low += float(val_avg)
                        row_std_low += float(val_std)
                row.append(fr"{row_avg_high/5:.1f} $\pm$ {row_std_high/5:.1f}")
                row.append(fr"{row_avg_low/5:.1f} $\pm$ {row_std_low/5:.1f}")
            data.append(row)

In [49]:
df_1 = pd.DataFrame(data)
df_1.columns = ['Questionnaire', 'Model', 'Method'] + [f"{level} {label}" for label in labels for level in ['High', 'Low']] + ["High Average", "Low Average"]

In [50]:
df_1.dropna()

Unnamed: 0,Questionnaire,Model,Method,High Openness,Low Openness,High Conscientiousness,Low Conscientiousness,High Extraversion,Low Extraversion,High Agreeableness,Low Agreeableness,High Neuroticism,Low Neuroticism,High Average,Low Average
4,BFI,GPT-4o-Mini,Prompt V1,4.5 $\pm$ 0.1,4.4 $\pm$ 0.0,4.0 $\pm$ 0.1,4.0 $\pm$ 0.1,3.7 $\pm$ 0.1,3.7 $\pm$ 0.1,4.4 $\pm$ 0.0,4.4 $\pm$ 0.1,2.8 $\pm$ 0.1,2.6 $\pm$ 0.1,3.9 $\pm$ 0.1,3.8 $\pm$ 0.1
5,BFI,GPT-4o-Mini,Prompt Chat,4.5 $\pm$ 0.1,4.5 $\pm$ 0.1,4.0 $\pm$ 0.1,4.0 $\pm$ 0.1,3.7 $\pm$ 0.1,3.6 $\pm$ 0.1,4.4 $\pm$ 0.0,4.4 $\pm$ 0.0,2.7 $\pm$ 0.1,2.8 $\pm$ 0.1,3.9 $\pm$ 0.1,3.9 $\pm$ 0.1
11,IPIP-NEO,LLaMA3-70B,DPO,3.6 $\pm$ 0.1,2.5 $\pm$ 0.1,3.9 $\pm$ 0.1,2.1 $\pm$ 0.1,4.1 $\pm$ 0.0,2.5 $\pm$ 0.1,4.4 $\pm$ 0.0,1.9 $\pm$ 0.1,3.1 $\pm$ 0.0,2.1 $\pm$ 0.1,3.8 $\pm$ 0.0,2.2 $\pm$ 0.1
12,IPIP-NEO,GPT-4o-Mini,Prompt V1,3.8 $\pm$ 0.1,3.8 $\pm$ 0.1,3.7 $\pm$ 0.0,3.7 $\pm$ 0.1,3.6 $\pm$ 0.0,3.6 $\pm$ 0.0,3.9 $\pm$ 0.1,3.9 $\pm$ 0.1,2.9 $\pm$ 0.1,2.9 $\pm$ 0.0,3.6 $\pm$ 0.1,3.6 $\pm$ 0.1
13,IPIP-NEO,GPT-4o-Mini,Prompt Chat,3.8 $\pm$ 0.1,3.8 $\pm$ 0.1,3.7 $\pm$ 0.1,3.7 $\pm$ 0.1,3.6 $\pm$ 0.0,3.7 $\pm$ 0.1,3.9 $\pm$ 0.1,3.9 $\pm$ 0.1,2.9 $\pm$ 0.0,2.9 $\pm$ 0.1,3.6 $\pm$ 0.1,3.6 $\pm$ 0.1


In [51]:
df_2 = df_1.dropna()
df_2.to_latex()

'\\begin{tabular}{llllllllllllllll}\n\\toprule\n & Questionnaire & Model & Method & High Openness & Low Openness & High Conscientiousness & Low Conscientiousness & High Extraversion & Low Extraversion & High Agreeableness & Low Agreeableness & High Neuroticism & Low Neuroticism & High Average & Low Average \\\\\n\\midrule\n4 & BFI & GPT-4o-Mini & Prompt V1 & 4.5 $\\pm$ 0.1 & 4.4 $\\pm$ 0.0 & 4.0 $\\pm$ 0.1 & 4.0 $\\pm$ 0.1 & 3.7 $\\pm$ 0.1 & 3.7 $\\pm$ 0.1 & 4.4 $\\pm$ 0.0 & 4.4 $\\pm$ 0.1 & 2.8 $\\pm$ 0.1 & 2.6 $\\pm$ 0.1 & 3.9 $\\pm$ 0.1 & 3.8 $\\pm$ 0.1 \\\\\n5 & BFI & GPT-4o-Mini & Prompt Chat & 4.5 $\\pm$ 0.1 & 4.5 $\\pm$ 0.1 & 4.0 $\\pm$ 0.1 & 4.0 $\\pm$ 0.1 & 3.7 $\\pm$ 0.1 & 3.6 $\\pm$ 0.1 & 4.4 $\\pm$ 0.0 & 4.4 $\\pm$ 0.0 & 2.7 $\\pm$ 0.1 & 2.8 $\\pm$ 0.1 & 3.9 $\\pm$ 0.1 & 3.9 $\\pm$ 0.1 \\\\\n11 & IPIP-NEO & LLaMA3-70B & DPO & 3.6 $\\pm$ 0.1 & 2.5 $\\pm$ 0.1 & 3.9 $\\pm$ 0.1 & 2.1 $\\pm$ 0.1 & 4.1 $\\pm$ 0.0 & 2.5 $\\pm$ 0.1 & 4.4 $\\pm$ 0.0 & 1.9 $\\pm$ 0.1 & 3.1 $\\pm$ 0.0

In [52]:
df_3 = df_2.loc[df_2['Questionnaire'] == 'BFI'].drop(columns=['Questionnaire'])
df_3 = df_3.set_index(['Model', 'Method'])
print(df_3.to_latex(multirow=True, index=True, header=False))

\begin{tabular}{llllllllllllll}
\toprule
Model & Method &  &  &  &  &  &  &  &  &  &  &  &  \\
\midrule
\multirow[t]{2}{*}{GPT-4o-Mini} & Prompt V1 & 4.5 $\pm$ 0.1 & 4.4 $\pm$ 0.0 & 4.0 $\pm$ 0.1 & 4.0 $\pm$ 0.1 & 3.7 $\pm$ 0.1 & 3.7 $\pm$ 0.1 & 4.4 $\pm$ 0.0 & 4.4 $\pm$ 0.1 & 2.8 $\pm$ 0.1 & 2.6 $\pm$ 0.1 & 3.9 $\pm$ 0.1 & 3.8 $\pm$ 0.1 \\
 & Prompt Chat & 4.5 $\pm$ 0.1 & 4.5 $\pm$ 0.1 & 4.0 $\pm$ 0.1 & 4.0 $\pm$ 0.1 & 3.7 $\pm$ 0.1 & 3.6 $\pm$ 0.1 & 4.4 $\pm$ 0.0 & 4.4 $\pm$ 0.0 & 2.7 $\pm$ 0.1 & 2.8 $\pm$ 0.1 & 3.9 $\pm$ 0.1 & 3.9 $\pm$ 0.1 \\
\cline{1-14}
\bottomrule
\end{tabular}



In [55]:
print(df_2.to_latex())

\begin{tabular}{llllllllllllllll}
\toprule
 & Questionnaire & Model & Method & High Openness & Low Openness & High Conscientiousness & Low Conscientiousness & High Extraversion & Low Extraversion & High Agreeableness & Low Agreeableness & High Neuroticism & Low Neuroticism & High Average & Low Average \\
\midrule
4 & BFI & GPT-4o-Mini & Prompt V1 & 4.5 $\pm$ 0.1 & 4.4 $\pm$ 0.0 & 4.0 $\pm$ 0.1 & 4.0 $\pm$ 0.1 & 3.7 $\pm$ 0.1 & 3.7 $\pm$ 0.1 & 4.4 $\pm$ 0.0 & 4.4 $\pm$ 0.1 & 2.8 $\pm$ 0.1 & 2.6 $\pm$ 0.1 & 3.9 $\pm$ 0.1 & 3.8 $\pm$ 0.1 \\
5 & BFI & GPT-4o-Mini & Prompt Chat & 4.5 $\pm$ 0.1 & 4.5 $\pm$ 0.1 & 4.0 $\pm$ 0.1 & 4.0 $\pm$ 0.1 & 3.7 $\pm$ 0.1 & 3.6 $\pm$ 0.1 & 4.4 $\pm$ 0.0 & 4.4 $\pm$ 0.0 & 2.7 $\pm$ 0.1 & 2.8 $\pm$ 0.1 & 3.9 $\pm$ 0.1 & 3.9 $\pm$ 0.1 \\
11 & IPIP-NEO & LLaMA3-70B & DPO & 3.6 $\pm$ 0.1 & 2.5 $\pm$ 0.1 & 3.9 $\pm$ 0.1 & 2.1 $\pm$ 0.1 & 4.1 $\pm$ 0.0 & 2.5 $\pm$ 0.1 & 4.4 $\pm$ 0.0 & 1.9 $\pm$ 0.1 & 3.1 $\pm$ 0.0 & 2.1 $\pm$ 0.1 & 3.8 $\pm$ 0.0 & 2.2 $\pm$ 0.1 