In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, label_binarize, LabelEncoder
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_curve, auc as sklearn_auc
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
import lightgbm as lgb
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import warnings

import statsmodels.api as sm
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report, mean_absolute_error, mean_squared_error, r2_score

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix


# Suppress warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)


random_state = 42


files_bfi2 = [
    "Human_GPT3.5_persona_BFI2.xlsx",
    "Human_GPT3.5_shape_BFI2.xlsx",
    "Human_GPT4_persona_BFI2.xlsx",
    "Human_GPT4_shape_BFI2.xlsx",
    'Human_LLaMA3_persona_BFI2.xlsx',
    "Human_LLaMA3_shape_BFI2.xlsx",
]

files_hexao= [
    "Human_GPT3.5_persona_HEXAO.xlsx",
    "Human_GPT3.5_shape_HEXAO.xlsx",
    "Human_GPT4_persona_HEXAO.xlsx",
    "Human_GPT4_shape_HEXAO.xlsx",
    "Human_LLaMA3_persona_HEXAO.xlsx",
    "Human_LLaMA3_shape_HEXAO.xlsx"
]

# Seperate

In [2]:
def clean(df, item_counts):
    columns_selected = [f"item{i}" for i in range(1, item_counts+1)]  # Ensure your DataFrame has these columns

    # clean errored data
    X = df[columns_selected]  # Features (34 columns)
    mask = X.apply(lambda x: x.between(1, 5)).all(axis=1)
    df = df[mask]

    # Calculate mean and standard deviation
    df['mean'] = df[columns_selected].mean(axis=1)
    df['std'] = df[columns_selected].std(axis=1)
    return df

In [3]:
def random_regression_with_kfold(df, k=5, drawing=False):
    X = df[['mean', 'std']]
    y = df["label"]

    print(X)
    print("------------")
    print(y)
    
    # Encode the target variable
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
    
    # Initialize the logistic regression model (multinomial for multiple categories)
    model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=200)
    
    # K-Fold Cross Validation
    kf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []
    auc_scores = []
    confusion_matrices = []
    mean_p_values = []
    std_p_values = []
    mae_scores = []
    mse_scores = []
    r2_scores = []
    rmse_scores = []
    mean_coeffs = []
    std_coeffs = []

    for train_index, test_index in kf.split(X, y_encoded):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y_encoded[train_index], y_encoded[test_index]
        
        # Train the model
        model.fit(X_train, y_train)
        
        # Predict on the test set
        y_pred = model.predict(X_test)
        
        # Evaluate the model
        accuracy_scores.append(accuracy_score(y_test, y_pred))
        precision_scores.append(precision_score(y_test, y_pred, average='macro'))
        recall_scores.append(recall_score(y_test, y_pred, average='macro'))
        f1_scores.append(f1_score(y_test, y_pred, average='macro'))
        mae_scores.append(mean_absolute_error(y_test, y_pred))
        mse_scores.append(mean_squared_error(y_test, y_pred))
        r2_scores.append(r2_score(y_test, y_pred))
        rmse_scores.append(np.sqrt(mean_squared_error(y_test, y_pred)))
        
        try:
            auc_scores.append(roc_auc_score(y_test, model.predict_proba(X_test), multi_class='ovr', average='macro'))
        except ValueError:
            auc_scores.append(None)
        
        confusion_matrices.append(confusion_matrix(y_test, y_pred))
        
        # Significance levels of each feature within each fold
        X_with_const = sm.add_constant(X_train)
        logit_model = sm.MNLogit(y_train, X_with_const)
        result = logit_model.fit()
        p_values = result.pvalues

        mean_coeffs.append(result.params.loc['mean', 0])
        std_coeffs.append(result.params.loc['std', 0])
        mean_p_values.append(p_values.loc['mean', 0])
        std_p_values.append(p_values.loc['std', 0])
    
    # Calculate mean and std scores across all folds
    mean_accuracy = np.mean(accuracy_scores)
    std_accuracy = np.std(accuracy_scores)
    mean_precision = np.mean(precision_scores)
    std_precision = np.std(precision_scores)
    mean_recall = np.mean(recall_scores)
    std_recall = np.std(recall_scores)
    mean_f1 = np.mean(f1_scores)
    std_f1 = np.std(f1_scores)
    mean_auc = np.mean([score for score in auc_scores if score is not None])
    std_auc = np.std([score for score in auc_scores if score is not None])
    mean_p_value_mean = np.mean(mean_p_values)
    std_p_value_mean = np.std(mean_p_values)
    mean_p_value_std = np.mean(std_p_values)
    std_p_value_std = np.std(std_p_values)
    mean_mae = np.mean(mae_scores)
    std_mae = np.std(mae_scores)
    mean_mse = np.mean(mse_scores)
    std_mse = np.std(mse_scores)
    mean_r2 = np.mean(r2_scores)
    std_r2 = np.std(r2_scores)
    mean_rmse = np.mean(rmse_scores)
    std_rmse = np.std(rmse_scores)
    mean_coeff_mean = np.mean(mean_coeffs)
    std_coeff_mean = np.std(mean_coeffs)
    mean_coeff_std = np.mean(std_coeffs)
    std_coeff_std = np.std(std_coeffs)
    
    if drawing:
        plt.figure(figsize=(8, 6))
        sns.heatmap(np.mean(confusion_matrices, axis=0), annot=True, fmt='.2f', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.title('Confusion Matrix')
        plt.show()
    
    return {
        "Mean Accuracy": mean_accuracy,
        "Std Accuracy": std_accuracy,
        "Mean Precision": mean_precision,
        "Std Precision": std_precision,
        "Mean Recall": mean_recall,
        "Std Recall": std_recall,
        "Mean F1 Score": mean_f1,
        "Std F1 Score": std_f1,
        "Mean AUC": mean_auc,
        "Std AUC": std_auc,
        "Confusion Matrices": confusion_matrices,
        "Mean P-Value (mean)": mean_p_value_mean,
        "Std P-Value (mean)": std_p_value_mean,
        "Mean P-Value (std)": mean_p_value_std,
        "Std P-Value (std)": std_p_value_std,
        "Mean Absolute Error (MAE)": mean_mae,
        "Std Absolute Error (MAE)": std_mae,
        "Mean Squared Error (MSE)": mean_mse,
        "Std Squared Error (MSE)": std_mse,
        "R-squared (R²) Score": mean_r2,
        "Std R-squared (R²) Score": std_r2,
        "Root Mean Squared Error (RMSE)": mean_rmse,
        "Std Root Mean Squared Error (RMSE)": std_rmse,
        "Mean Coefficient (mean)": mean_coeff_mean,
        "Std Coefficient (mean)": std_coeff_mean,
        "Mean Coefficient (std)": mean_coeff_std,
        "Std Coefficient (std)": std_coeff_std,
    }



def get_n_rounds(files, items_count, n=1):
    all_results = []

    for _ in range(n):
        round_results = []
        for file in files:
            df = pd.read_excel(f'LLM_agent_data classification/{file}')
            df = clean(df, items_count)

            df_lm = df[df["label"] == "LM"].reset_index(drop=True)
            df_human = df[df["label"] == "Human"].reset_index(drop=True)
            df_human = df_human.sample(1200).reset_index(drop=True)
            
            df = pd.concat([df_lm, df_human], axis=0)
            
            res = random_regression_with_kfold(df, k=5, drawing=False)
            res["source"] = file
            round_results.append(res)
        all_results.append(pd.DataFrame(round_results))
    
    # Calculate average of each metric over all rounds
     # Concatenate all results and group by 'source' to calculate mean and std of numeric columns
    all_results_df = pd.concat(all_results)
    numeric_cols = all_results_df.select_dtypes(include=np.number).columns
    averaged_results = all_results_df.groupby('source')[numeric_cols].mean().reset_index()

    return averaged_results



In [None]:
df = pd.read_excel(f'LLM_agent_data classification/{files_bfi2}')

## bfi2

In [4]:
df_1 = get_n_rounds(files_bfi2, 60)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['mean'] = df[columns_selected].mean(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['std'] = df[columns_selected].std(axis=1)


Optimization terminated successfully.
         Current function value: 0.210136
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.207426
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.203550
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.219881
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.218015
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.369707
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.391802
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.391969
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.383781
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.387834
  

In [40]:
df_1[["source", 'Mean Accuracy', 'Std Accuracy', 'Mean Precision', 'Std Precision',
       'Mean Recall', 'Std Recall', 'Mean F1 Score', 'Std F1 Score', 
      "Mean Coefficient (mean)", "Std Coefficient (mean)", "Mean Coefficient (std)", "Std Coefficient (std)"]]

Unnamed: 0,source,Mean Accuracy,Std Accuracy,Mean Precision,Std Precision,Mean Recall,Std Recall,Mean F1 Score,Std F1 Score,Mean Coefficient (mean),Std Coefficient (mean),Mean Coefficient (std),Std Coefficient (std)
0,Human_GPT3.5_persona_BFI2.xlsx,0.911485,0.019626,0.911933,0.019446,0.911525,0.019583,0.911464,0.019634,-1.640997,0.363881,-13.415647,0.737704
1,Human_GPT3.5_shape_BFI2.xlsx,0.76,0.033082,0.763208,0.031516,0.76,0.033082,0.759146,0.033691,-2.413083,0.108172,-4.384266,0.104573
2,Human_GPT4_persona_BFI2.xlsx,0.975,0.010541,0.975131,0.010504,0.975,0.010541,0.974998,0.010542,1.674304,0.587874,-16.735516,1.066957
3,Human_GPT4_shape_BFI2.xlsx,0.715,0.051208,0.716765,0.052223,0.715,0.051208,0.714508,0.051095,-1.278142,0.09345,-3.103278,0.35312
4,Human_LLaMA3_persona_BFI2.xlsx,0.651667,0.006236,0.65284,0.006837,0.651667,0.006236,0.651015,0.006035,-0.070938,0.116122,-2.91159,0.128075
5,Human_LLaMA3_shape_BFI2.xlsx,0.654916,0.019091,0.666708,0.026717,0.654322,0.019031,0.648911,0.018866,-1.760865,0.043378,-0.130444,0.085717


## xehao

In [4]:
df_2 = get_n_rounds(files_hexao, 100)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['mean'] = df[columns_selected].mean(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['std'] = df[columns_selected].std(axis=1)


Optimization terminated successfully.
         Current function value: 0.271257
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.252553
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.275611
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.262990
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.267106
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.456366
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.458823
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.458468
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.453632
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.458863
  

In [5]:
df_2[["source", 'Mean Accuracy', 'Std Accuracy', 'Mean Precision', 'Std Precision',
       'Mean Recall', 'Std Recall', 'Mean F1 Score', 'Std F1 Score', 
      "Mean Coefficient (mean)", "Std Coefficient (mean)", "Mean Coefficient (std)", "Std Coefficient (std)"]]

Unnamed: 0,source,Mean Accuracy,Std Accuracy,Mean Precision,Std Precision,Mean Recall,Std Recall,Mean F1 Score,Std F1 Score,Mean Coefficient (mean),Std Coefficient (mean),Mean Coefficient (std),Std Coefficient (std)
0,Human_GPT3.5_persona_HEXAO.xlsx,0.889839,0.017979,0.85795,0.039449,0.773531,0.026985,0.805373,0.031423,-2.021587,0.176574,-12.918661,0.486444
1,Human_GPT3.5_shape_HEXAO.xlsx,0.824667,0.006182,0.859223,0.026655,0.569167,0.019428,0.572177,0.033174,-1.993628,0.10865,0.654693,0.260658
2,Human_GPT4_persona_HEXAO.xlsx,0.976,0.00611,0.978515,0.009571,0.94625,0.014031,0.961145,0.010127,-4.700856,0.527688,-20.830636,0.993728
3,Human_GPT4_shape_HEXAO.xlsx,0.860667,0.003266,0.925839,0.001485,0.651667,0.008165,0.692565,0.010379,-2.438258,0.091873,-1.877683,0.203642
4,Human_LLaMA3_persona_HEXAO.xlsx,0.798667,0.001633,0.533804,0.113573,0.502917,0.003385,0.453534,0.007863,0.407193,0.144745,2.569409,0.194633
5,Human_LLaMA3_shape_HEXAO.xlsx,0.854,0.009286,0.88967,0.021542,0.645,0.029125,0.679987,0.038104,-1.617556,0.030996,2.210846,0.307047


# Combined

In [6]:
def read_clean_llm(files, item_counts):
    columns_selected = [f"item{i}" for i in range(1, item_counts+1)]  

    
    df = pd.read_excel(f'LLM_agent_data classification/{files[0]}')
    df = df[df["label"] == "LM"].reset_index(drop=True)
    X = df[columns_selected]  # Features (34 columns)
    mask = X.apply(lambda x: x.between(1, 5)).all(axis=1)
    df = df[mask]
    df["source"] = files[0]

    for i in range(1, len(files)):
        dfi = pd.read_excel(f'LLM_agent_data classification/{files[i]}')
        dfi = dfi[dfi["label"] == "LM"].reset_index(drop=True)
        X = dfi[columns_selected]  # Features (34 columns)
        mask = X.apply(lambda x: x.between(1, 5)).all(axis=1)
        dfi = dfi[mask]
        dfi["source"] = files[i]

        df = pd.concat([df, dfi], axis=0)
        

    # Calculate mean and standard deviation
    df['mean'] = df[columns_selected].mean(axis=1)
    df['std'] = df[columns_selected].std(axis=1)
    return df

def combine_all_lm_human(files, items_count):
    columns_selected = [f"item{i}" for i in range(1, items_count+1)]
    
    df_lm = read_clean_llm(files, items_count)
    
    df_human = pd.read_excel(f'LLM_agent_data classification/{files[0]}')
    df_human = df_human[df_human["label"] == "Human"].reset_index(drop=True)
    df_human["source"] = "Human"
    df_human['mean'] = df_human[columns_selected].mean(axis=1)
    df_human['std'] = df_human[columns_selected].std(axis=1)

    return pd.concat([df_lm, df_human], axis=0)


def pick_sample(df, human_counts, lm_counts):
    df_human = df[df["label"] == "Human"].reset_index(drop=True)
    df_human = df_human.sample(human_counts).reset_index(drop=True)

    df_lm = df[df["label"] == "LM"].reset_index(drop=True)
    df_lm = df_lm.sample(lm_counts).reset_index(drop=True)
    return pd.concat([df_lm, df_human], axis=0)

In [7]:
def get_n_rounds_combined(df, human_counts, lm_counts, items_count, n=1):
    all_results = []

    for _ in range(n):
        df = pick_sample(df, human_counts=human_counts, lm_counts=lm_counts)
        
        res = random_regression_with_kfold(df, k=5, drawing=False)
        res["source"] = "Combined"

        all_results.append(pd.DataFrame(res))
    
    # Calculate average of each metric over all rounds
    # Concatenate all results and group by 'source' to calculate mean and std of numeric columns
    all_results_df = pd.concat(all_results)
    numeric_cols = all_results_df.select_dtypes(include=np.number).columns
    averaged_results = all_results_df.groupby('source')[numeric_cols].mean().reset_index()

    return averaged_results

## BFI

In [11]:
df = combine_all_lm_human(files_bfi2, 60)
df["label"].value_counts()

label
LM       1796
Human    1559
Name: count, dtype: int64

In [58]:
df_1_c = get_n_rounds_combined(df, human_counts=1500, lm_counts=1500, items_count=60, n=1)

Optimization terminated successfully.
         Current function value: 0.520813
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.518921
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.521462
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.510163
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.530156
         Iterations 6


In [60]:
df_1_c[["source", 'Mean Accuracy', 'Std Accuracy', 'Mean Precision', 'Std Precision',
       'Mean Recall', 'Std Recall', 'Mean F1 Score', 'Std F1 Score', 
      "Mean Coefficient (mean)", "Std Coefficient (mean)", "Mean Coefficient (std)", "Std Coefficient (std)"]]

Unnamed: 0,source,Mean Accuracy,Std Accuracy,Mean Precision,Std Precision,Mean Recall,Std Recall,Mean F1 Score,Std F1 Score,Mean Coefficient (mean),Std Coefficient (mean),Mean Coefficient (std),Std Coefficient (std)
0,Combined,0.751667,0.015384,0.752551,0.015217,0.751667,0.015384,0.751443,0.015449,-1.567396,0.082447,-3.584772,0.099848


## XEHAO

In [8]:
df = combine_all_lm_human(files_hexao, 100)
df["label"].value_counts()

label
Human    7204
LM       1798
Name: count, dtype: int64

In [9]:
df_2_c = get_n_rounds_combined(df, human_counts=1798, lm_counts=1798, items_count=100, n=50)

Optimization terminated successfully.
         Current function value: 0.623427
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.630166
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.626824
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.622576
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.631530
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.628159
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.626050
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.631113
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.625963
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.623566
  

In [10]:
df_2_c[["source", 'Mean Accuracy', 'Std Accuracy', 'Mean Precision', 'Std Precision',
       'Mean Recall', 'Std Recall', 'Mean F1 Score', 'Std F1 Score', 
      "Mean Coefficient (mean)", "Std Coefficient (mean)", "Mean Coefficient (std)", "Std Coefficient (std)"]]

Unnamed: 0,source,Mean Accuracy,Std Accuracy,Mean Precision,Std Precision,Mean Recall,Std Recall,Mean F1 Score,Std F1 Score,Mean Coefficient (mean),Std Coefficient (mean),Mean Coefficient (std),Std Coefficient (std)
0,Combined,0.650011,0.014565,0.651454,0.01454,0.650012,0.014567,0.649165,0.01474,-1.644881,0.045085,-1.400397,0.051092
