# Preprocessing

In [8]:
import os
import re
import pandas as pd
# Notebook-wide pandas display settings:
# show every column of a frame, and keep wide frames on one line instead of wrapping.
pd.options.display.max_columns = None
pd.options.display.expand_frame_repr = False

## Merging Files From All Sessions

In [9]:
# Every session is expected in its own sub-folder of root_dir. Only sub-folders
# are scanned, so with the default paths below the merged CSVs (written directly
# into root_dir) are never picked up as inputs on a re-run.
root_dir = "../data/raw"  # Change this to your actual root folder path
feature_file_output_name = "../data/raw/merged_features.csv"
otree_file_output_name = "../data/raw/merged_otree.csv"

# Per-session DataFrames collected for a single concatenation at the end
feature_df_list = []
otree_df_list = []

# Number of session folders that contributed a feature file / an oTree file
countDatasets = 0
countOtree = 0

# os.listdir returns entries in arbitrary order; sorting makes the row order of
# the merged files (and the choice of file within a folder) reproducible.
for folder in sorted(os.listdir(root_dir)):
    folder_path = os.path.join(root_dir, folder)

    # Skip plain files in root_dir (e.g. previously written merged CSVs)
    if not os.path.isdir(folder_path):
        continue

    folder_entries = sorted(os.listdir(folder_path))

    # fullmatch: the whole file name must fit the pattern, so backup copies
    # such as "*.csv.bak" or "*.csv~" are not mistaken for data files.
    feature_csv_files = [f for f in folder_entries if re.fullmatch(r'feature_dataset_normalized_.*\.csv', f)]
    otree_csv_files = [f for f in folder_entries if re.fullmatch(r'all_apps_wide-.*\.csv', f)]

    if feature_csv_files:
        # Only one file per folder is merged; make that choice visible.
        if len(feature_csv_files) > 1:
            print(f"Warning: {folder_path} contains {len(feature_csv_files)} feature files; using {feature_csv_files[0]}")
        countDatasets += 1
        file_path = os.path.join(folder_path, feature_csv_files[0])
        feature_df = pd.read_csv(file_path)
        feature_df_list.append(feature_df)

    if otree_csv_files:
        # Only one file per folder is merged; make that choice visible.
        if len(otree_csv_files) > 1:
            print(f"Warning: {folder_path} contains {len(otree_csv_files)} otree files; using {otree_csv_files[0]}")
        countOtree += 1
        file_path = os.path.join(folder_path, otree_csv_files[0])
        otree_df = pd.read_csv(file_path)
        otree_df_list.append(otree_df)

# Concatenate all DataFrames into one (skipped when nothing was found,
# because pd.concat raises on an empty list)
if feature_df_list:
    merged_df = pd.concat(feature_df_list, ignore_index=True)
    merged_df.to_csv(feature_file_output_name, index=False)
    print(f"Merged CSV saved as {feature_file_output_name}")
    print(merged_df.shape)

if otree_df_list:
    merged_otree_df = pd.concat(otree_df_list, ignore_index=True)
    merged_otree_df.to_csv(otree_file_output_name, index=False)
    print(f"Merged CSV saved as {otree_file_output_name}")
    print(merged_otree_df.shape)

print(f"Found {countDatasets} datasets.")
print(f"Found {countOtree} otree files.")

Merged CSV saved as ../data/raw/merged_features.csv
(484, 43)
Merged CSV saved as ../data/raw/merged_otree.csv
(139, 2592)
Found 19 datasets.
Found 19 otree files.


In [10]:
# Locations of the merged CSV files to load.
feature_file_path = "../data/raw/merged_features.csv"
otree_file_path = "../data/raw/merged_otree.csv"

# Biosignal features: comma-separated, first row holds the column names,
# all columns are loaded.
biosignal_feature_df = pd.read_csv(feature_file_path, sep=',', header=0)

# Experiment (oTree) data: same CSV layout, but only the three columns
# listed here are loaded.
otree_df = pd.read_csv(
    otree_file_path,
    sep=',',
    header=0,
    usecols=[
        'participant.code',
        'session.code',
        'stockmarket.40.player.final_payout',
    ],
)

# Preview the first 15 rows of the experiment data.
print(otree_df.head(15))

   participant.code session.code  stockmarket.40.player.final_payout
0          m32wpcjo     sci2ssd8                               21.47
1          0k5o6enp     sci2ssd8                               22.70
2          mvbdccq8     sci2ssd8                               15.34
3          oobhu2bo     sci2ssd8                               24.52
4          07cwkx8l     sci2ssd8                               15.96
5          sjpiquh9     f5jvcr8u                               11.35
6          dr2o15w6     f5jvcr8u                               18.92
7          kbrqbfei     f5jvcr8u                               26.30
8          ai4e8vrz     f5jvcr8u                               18.82
9          ko878gxe     f5jvcr8u                               26.58
10         xu6yit19     f5jvcr8u                               14.71
11         lf5u5r5u     f5jvcr8u                                9.78
12         fjjzr5y0     f5jvcr8u                               23.55
13         9iflb9ri     2qj6wa7h  

## Biosignal Feature File Preparation
1. Drop columns that are entirely empty
2. Fill any remaining empty fields with 0

In [11]:
# Two clean-up steps in one chain:
#   1. remove columns in which every value is missing,
#   2. replace each missing value that is left with 0.
biosignal_feature_df = (
    biosignal_feature_df
    .dropna(axis=1, how='all')
    .fillna(0)
)

## OTree File Preparation: Encoding class labels
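TODO: This note says the data only includes 1 session, but the merged oTree data loaded above already contains several session codes. Confirm and remove this note.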

In [12]:
def _flag_top_payouts(payouts):
    """Return 1 where a payout is at or above the 0.80 quantile of `payouts`, else 0."""
    cutoff = payouts.quantile(0.80)
    return (payouts >= cutoff).astype(int)


# The cutoff is computed separately for each "session.code" group, so every
# participant is labeled relative to the payouts within the same session.
payouts_by_session = otree_df.groupby("session.code")["stockmarket.40.player.final_payout"]
otree_df["performance_class"] = payouts_by_session.transform(_flag_top_payouts)

# Preview the first 15 labeled rows.
print(otree_df.head(15))

   participant.code session.code  stockmarket.40.player.final_payout  performance_class
0          m32wpcjo     sci2ssd8                               21.47                  0
1          0k5o6enp     sci2ssd8                               22.70                  0
2          mvbdccq8     sci2ssd8                               15.34                  0
3          oobhu2bo     sci2ssd8                               24.52                  1
4          07cwkx8l     sci2ssd8                               15.96                  0
5          sjpiquh9     f5jvcr8u                               11.35                  0
6          dr2o15w6     f5jvcr8u                               18.92                  0
7          kbrqbfei     f5jvcr8u                               26.30                  1
8          ai4e8vrz     f5jvcr8u                               18.82                  0
9          ko878gxe     f5jvcr8u                               26.58                  1
10         xu6yit19     f5jvcr8u

## Creating Labeled Feature Matrix
1. Merge the DataFrames
2. Drop the baseline windows: no decision-making takes place during them, so they have no discriminatory power for trading performance.
3. Drop the columns that are not needed for the feature matrix.

In [13]:
# Attach the experiment columns (including the class label) to every feature
# window of the same participant. pandas merges with an inner join by default,
# so rows without a matching participant code on the other side are dropped.
merged_df = biosignal_feature_df.merge(
    otree_df,
    left_on='Participant_Code',
    right_on='participant.code',
)

# Keep only the windows whose segment type is not 'Baseline'.
merged_df = merged_df[merged_df['Segment_Type'].ne('Baseline')]

# TODO MOVE TO CONFIG FILE
# Identifier, timing and raw-payout columns that must not end up in the feature matrix.
unneeded_columns = [
    'session.code',
    'Window_ID',
    'Window_Start',
    'Window_End',
    'Window_Start_readable',
    'Window_End_readable',
    'Participant_Code',
    'Segment_Type',
    'stockmarket.40.player.final_payout',
    'participant.code',
]

# Restrict the list to columns that actually exist, so drop() cannot raise a KeyError.
columns_to_drop = [name for name in unneeded_columns if name in merged_df.columns]
labeled_feature_matrix_df = merged_df.drop(columns=columns_to_drop)


## Saving the Labeled Feature Matrix

In [14]:
# Write the labeled feature matrix to disk without the DataFrame index.
# to_csv does not create missing folders, so the target directory is created
# first; exist_ok=True makes this a no-op when it is already there.
labeled_feature_matrix_path = "../data/processed/labeled_feature_matrix.csv"
os.makedirs(os.path.dirname(labeled_feature_matrix_path), exist_ok=True)
labeled_feature_matrix_df.to_csv(labeled_feature_matrix_path, index=False)

# Status message (German: "Data successfully merged and labeled.") and final shape
print("Daten erfolgreich gemerged und gelabelt.")
print(labeled_feature_matrix_df.shape)

Daten erfolgreich gemerged und gelabelt.
(401, 37)
