# Preprocessing
This notebook merges the feature files and the otree files from all sessions into a single labeled feature matrix.
Within each session, the top 20% best-performing participants (by final payout) are labeled as 1, the rest as 0, forming a binary classification problem.

In [None]:
import os
import re
import pandas as pd
# Printing setup: show every column and keep wide frames on a single line.
pd.options.display.max_columns = None
pd.options.display.expand_frame_repr = False

## Merging Files From All Sessions

In [None]:
# Merge the per-session feature files and oTree exports found in the
# sub-folders of root_dir into one CSV each.
root_dir = "../data/raw"
feature_file_output_name = "../data/raw/merged_features.csv"
otree_file_output_name = "../data/raw/merged_otree.csv"

# Matched with fullmatch below, so only names that END in ".csv" qualify
# (e.g. a stray "....csv.bak" is not picked up).
feature_file_pattern = re.compile(r'feature_dataset_normalized_.*\.csv')
otree_file_pattern = re.compile(r'all_apps_wide-.*\.csv')

# List to store DataFrames
feature_df_list = []
otree_df_list = []

# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of the merged files reproducible across machines and runs.
for folder in sorted(os.listdir(root_dir)):
    folder_path = os.path.join(root_dir, folder)

    # Only sub-folders are scanned; plain files in root_dir (such as the
    # merged outputs written below) are skipped.
    if not os.path.isdir(folder_path):
        continue

    folder_files = sorted(os.listdir(folder_path))
    feature_csv_files = [f for f in folder_files if feature_file_pattern.fullmatch(f)]
    otree_csv_files = [f for f in folder_files if otree_file_pattern.fullmatch(f)]

    for file_kind, csv_files, df_list in (
        ("feature", feature_csv_files, feature_df_list),
        ("otree", otree_csv_files, otree_df_list),
    ):
        if not csv_files:
            continue
        # Only one file per folder is merged; make it visible when others are ignored.
        if len(csv_files) > 1:
            print(f"Warning: {len(csv_files)} {file_kind} files in {folder_path}, using only {csv_files[0]}")
        file_path = os.path.join(folder_path, csv_files[0])
        df_list.append(pd.read_csv(file_path))

# One DataFrame is appended per folder that contained a matching file.
countDatasets = len(feature_df_list)
countOtree = len(otree_df_list)

# Concatenate all DataFrames into one
if feature_df_list:
    merged_df = pd.concat(feature_df_list, ignore_index=True)
    merged_df.to_csv(feature_file_output_name, index=False)
    print(f"Merged CSV saved as {feature_file_output_name}")
    print(merged_df.shape)

if otree_df_list:
    merged_otree_df = pd.concat(otree_df_list, ignore_index=True)
    merged_otree_df.to_csv(otree_file_output_name, index=False)
    print(f"Merged CSV saved as {otree_file_output_name}")
    print(merged_otree_df.shape)

print(f"Found {countDatasets} datasets.")
print(f"Found {countOtree} otree files.")

## Read the Merged Files into DataFrames and extract the relevant columns

In [None]:
# Paths of the merged feature file and the merged oTree file.
feature_file_path = "../data/raw/merged_features.csv"
otree_file_path = "../data/raw/merged_otree.csv"

# From the experiment data only the participant code, the session code and
# the final payout column are loaded.
otree_columns = ['participant.code', 'session.code', 'stockmarket.40.player.final_payout']

# Feature file: all columns, first row is the header.
biosignal_feature_df = pd.read_csv(feature_file_path, sep=',', header=0)

# Experiment data: restricted to the columns listed above.
otree_df = pd.read_csv(otree_file_path, sep=',', header=0, usecols=otree_columns)

print(otree_df.head(15))

## Biosignal Feature File Preparation
1. Drop columns that are entirely empty
2. Fill any remaining empty fields with 0

In [None]:
# Drop columns that are entirely empty
biosignal_feature_df.dropna(axis=1, how='all', inplace=True)

# Fill any remaining empty fields with 0
biosignal_feature_df.fillna(0, inplace=True)

## OTree File Preparation: Encoding class labels
Within each session, the top 20% best-performing participants (by final payout) are labeled as 1, the rest as 0.

In [None]:
def _top_quintile_flag(payouts):
    """Return 1 where a payout is at or above the group's 80% quantile, else 0."""
    cutoff = payouts.quantile(0.80)
    return (payouts >= cutoff).astype(int)


# The quantile is computed separately for every session.
payout_by_session = otree_df.groupby("session.code")["stockmarket.40.player.final_payout"]
otree_df["performance_class"] = payout_by_session.transform(_top_quintile_flag)

print(otree_df.head(15))

## Creating Labeled Feature Matrix
1. Merge the DataFrames
2. Drop the Baseline Windows, as they have no discriminatory power on trading performance, since no decision making is involved.
3. Drop the columns that are not needed for the feature matrix.

In [None]:
# Merge the DataFrames and drop the Baseline Windows.
merged_df = pd.merge(biosignal_feature_df, otree_df, left_on='Participant_Code', right_on='participant.code')

columns_to_drop = ['session.code', 'Window_ID', 'Window_Start', 'Window_End', 'Window_Start_readable', 'Window_End_readable', 'Participant_Code', 'Segment_Type', 'stockmarket.40.player.final_payout', 'participant.code']

# Drop the columns that are not needed for the feature matrix.
columns_to_drop = [col for col in columns_to_drop if col in merged_df.columns]
labeled_feature_matrix_df = merged_df.drop(columns=columns_to_drop)


## Saving the Labeled Feature Matrix

In [None]:
# Write the labeled feature matrix to disk.
output_path = "../data/processed/labeled_feature_matrix.csv"

# to_csv does not create missing parent directories, so make sure the
# target folder exists (e.g. on a fresh checkout).
os.makedirs(os.path.dirname(output_path), exist_ok=True)
labeled_feature_matrix_df.to_csv(output_path, index=False)

print("Data merged and labeled successfully.")
print(labeled_feature_matrix_df.shape)