# Preprocessing

In [28]:
import pandas as pd
# Show every column of a DataFrame instead of truncating wide frames.
pd.options.display.max_columns = None
# Print each row on one line rather than wrapping wide frames into several blocks.
pd.options.display.expand_frame_repr = False

In [29]:
# Locations of the two raw input files, relative to this notebook.
feature_file_path = '../data/raw/ecg_results.csv'
otree_file_path = '../data/raw/all_apps_wide-2024-03-13.csv'

# Load the biosignal feature file (comma-separated, column names in the first row).
biosignal_feature_df = pd.read_csv(feature_file_path, sep=',', header=0)

# Load the experiment data, keeping only the participant identifier and the
# final payout column; every other column of the export is skipped.
otree_df = pd.read_csv(
    otree_file_path,
    sep=',',
    header=0,
    usecols=[
        'participant.code',
        'stockmarket.40.player.final_payout',
    ],
)

# Preview the first rows of the experiment data.
print(otree_df.head(5))

  participant.code  stockmarket.40.player.final_payout
0         olkb84nv                               16.75
1         90x8j3by                               11.84
2         518oj41e                               13.49
3         3d8fktvx                               16.17
4         7va8p4lw                               23.22


## Biosignal Feature File Preparation
1. Drop columns that are entirely empty
2. Fill any remaining empty fields with 0

In [30]:
# Clean the feature frame in one chained expression:
#   1. remove columns in which every value is missing,
#   2. replace any missing values that remain with 0.
biosignal_feature_df = (
    biosignal_feature_df
    .dropna(axis='columns', how='all')
    .fillna(0)
)

## OTree File Preparation: Encoding class labels
TODO: Right now, the data only includes a single session.

In [31]:
# Calculate the 50th percentile (median) of the final payout. It is the
# threshold that separates the two performance classes.
percentile_50 = otree_df['stockmarket.40.player.final_payout'].quantile(0.50)

# Encode the classes: 1 if the payout is at or above the median, otherwise 0.
# The comparison is vectorized over the whole column. A missing payout compares
# as False and therefore lands in class 0, the same result an element-wise
# `x >= percentile_50` check gives.
otree_df['performance_class'] = (
    otree_df['stockmarket.40.player.final_payout'] >= percentile_50
).astype('int64')

# Preview the labeled experiment data.
print(otree_df.head(5))

  participant.code  stockmarket.40.player.final_payout  performance_class
0         olkb84nv                               16.75                  1
1         90x8j3by                               11.84                  0
2         518oj41e                               13.49                  0
3         3d8fktvx                               16.17                  0
4         7va8p4lw                               23.22                  1


## Creating Labeled Feature Matrix
1. Merge the DataFrames
2. Drop the baseline windows: no decision making takes place during them, so they have no discriminatory power for trading performance.
3. Drop the columns that are not needed for the feature matrix.

In [32]:
# Attach the payout and class label to every feature row by matching the
# feature file's 'Participant' column against the experiment data's
# 'participant.code'. No `how` is given, so pandas performs its default inner
# join and rows without a match on the other side are not kept.
merged_df = biosignal_feature_df.merge(
    otree_df,
    left_on='Participant',
    right_on='participant.code',
)

# Discard the rows whose Type is 'Baseline': no decision making is involved
# there, so they have no discriminatory power on trading performance.
is_not_baseline = merged_df['Type'] != 'Baseline'
merged_df = merged_df.loc[is_not_baseline]

# TODO MOVE TO CONFIG FILE
columns_to_drop = [
    'Session',
    'Group',
    'Participant',
    'Type',
    'stockmarket.40.player.final_payout',
    'participant.code',
]

# Keep only the names that actually exist in the merged frame, so the drop
# below cannot fail on a missing column, then remove them from the matrix.
columns_to_drop = [name for name in columns_to_drop if name in merged_df.columns]
labeled_feature_matrix_df = merged_df.drop(columns_to_drop, axis='columns')


## Saving the Labeled Feature Matrix

In [33]:
# Write the labeled feature matrix to the processed-data folder as CSV,
# leaving out the DataFrame index.
labeled_feature_matrix_df.to_csv(
    path_or_buf="../data/processed/labeled_feature_matrix.csv",
    index=False,
)

# Confirmation message (German: "Data successfully merged and labeled.").
print("Daten erfolgreich gemerged und gelabelt.")

Daten erfolgreich gemerged und gelabelt.
