# Import Libraries

In [1]:
import pandas as pd
import pandas as pd
import re
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# Lift pandas' display truncation so notebook previews show every row and column
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, None)

# Import Interview Results

In [None]:
# Load the post-interview identification results, report their dimensions,
# and preview the first rows.
identification_results = pd.read_csv(
    "../data/market-signals-finfluencer/profile_metadata_postinterview_identification.csv"
)
print(identification_results.shape)
identification_results.head()

# Preprocess Interview Data

In [None]:
def extract_llm_responses(text):
    """Parse one raw LLM interview response into a flat Series of answers.

    The response is split on blank lines (``'\\n\\n'``) into blocks. In each
    block the first occurrence of every ``**<field>: ...**`` marker is
    extracted, where ``<field>`` is ``question``, ``explanation``, ``symbol``,
    ``category``, ``speculation`` or ``value``. Marker contents may span
    several lines.

    Parameters
    ----------
    text : str
        Raw response text. Any non-string input (for example the NaN that
        ``read_csv`` produces for an empty cell) yields an empty Series
        instead of raising.

    Returns
    -------
    pandas.Series
        Object-dtype Series whose labels are ``"<question> - <field>"`` and
        whose values are the extracted strings. Fields that are absent or
        empty are omitted. A block without a ``question`` marker contributes
        labels prefixed with ``"None"``. If the same label occurs in several
        blocks, the last value wins.
    """
    # Order matters: it fixes the label order inside each question.
    field_names = ('explanation', 'symbol', 'category', 'speculation', 'value')

    # Missing responses arrive as float NaN / None; there is nothing to parse.
    if not isinstance(text, str):
        return pd.Series(dtype=object)

    # Compile once per call rather than once per block.
    question_re = re.compile(r'\*\*question: (.*?)\*\*', re.DOTALL)
    field_res = {
        name: re.compile(rf'\*\*{name}: (.*?)\*\*', re.DOTALL)
        for name in field_names
    }

    # Collect into a dict (insertion-ordered, last write wins) and build the
    # Series once, instead of enlarging a Series one label at a time.
    flattened = {}
    for block in text.split('\n\n'):
        question_match = question_re.search(block)
        question_prefix = question_match.group(1) if question_match else None
        for name, pattern in field_res.items():
            field_match = pattern.search(block)
            # Skip fields that are missing or matched an empty string.
            if field_match and field_match.group(1):
                flattened[f'{question_prefix} - {name}'] = field_match.group(1)

    # Explicit dtype keeps the empty case well-defined across pandas versions.
    return pd.Series(flattened, dtype=object)


# Parse every raw LLM response into per-question columns, then append those
# columns to the original rows and preview the result.
extracted_results = identification_results["identification_llm_response"].apply(
    extract_llm_responses
)
identification_results = pd.concat(
    [identification_results, extracted_results],
    axis=1,
)
identification_results.head()

# Perform Identification Evaluation

In [None]:
def calculate_metrics(df):
    """Score the LLM's finfluencer prediction against the ``finfluencer`` label.

    Side effect: the ``'Is this a finfluencer? - category'`` column of the
    passed frame is converted in place from ``'Yes'``/``'No'`` to ``1``/``0``
    (anything else becomes NaN). Values that are already ``1``/``0`` are kept,
    so calling the function again on the same frame gives the same result
    instead of wiping the column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'finfluencer'`` (true label) and
        ``'Is this a finfluencer? - category'`` (predicted answer).

    Returns
    -------
    tuple
        ``(accuracy, macro_f1, auc)`` computed over the rows where both
        columns are non-null.

    Raises
    ------
    KeyError
        If a required column is missing.
    ValueError
        If no row has both a label and a usable prediction.
    """
    pred_col = 'Is this a finfluencer? - category'
    answer_to_label = {'Yes': 1, 'No': 0}

    def _to_binary(answer):
        # Textual answers use the exact 'Yes'/'No' mapping.
        if isinstance(answer, str):
            return answer_to_label.get(answer, float('nan'))
        # Already-converted values pass through so a re-run is a no-op
        # (NaN compares unequal to both 0 and 1 and falls to the default).
        if isinstance(answer, (int, float)) and answer in (0, 1):
            return int(answer)
        return float('nan')

    # Convert the prediction column to binary values (in place, see docstring)
    df[pred_col] = df[pred_col].map(_to_binary)

    # Only rows with both a true label and a usable prediction can be scored
    scored = df.dropna(subset=['finfluencer', pred_col])
    if scored.empty:
        raise ValueError(
            "No rows with both a 'finfluencer' label and a Yes/No prediction to evaluate"
        )

    # Extract the true labels and predicted labels
    y_true = scored['finfluencer']
    # After dropna the column holds only 0/1, so the integer cast is lossless
    y_pred = scored[pred_col].astype(int)

    accuracy = accuracy_score(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    # NOTE: AUC is computed from hard 0/1 predictions, not probability scores
    auc = roc_auc_score(y_true, y_pred)

    return accuracy, macro_f1, auc

# Evaluate the identification results and report each metric on its own line
accuracy, macro_f1, auc = calculate_metrics(identification_results)
print(
    f'Accuracy: {accuracy}',
    f'Macro-averaged F1 score: {macro_f1}',
    f'AUC score: {auc}',
    sep='\n',
)

# Save Formatted Interview Results

In [5]:
# Persist the processed results (without the index column) next to the input
identification_results.to_csv(
    "../data/market-signals-finfluencer/profile_metadata_postinterview_identification_processed.csv",
    index=False,
)