In [1]:
import pandas as pd
import numpy as np
from scipy.signal import convolve
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Step 1: Read the raw CSV inputs from the 633FinalData directory.
# Each train/test pair is loaded together; files are read in the order listed.

# Label tables.
label_train, label_test_breakfast_only = (
    pd.read_csv(csv_path)
    for csv_path in (
        '633FinalData/label_train.csv',
        '633FinalData/label_test_breakfast_only.csv',
    )
)

# Image tables (loaded here; not referenced by the steps visible in this cell).
img_train, img_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '633FinalData/img_train.csv',
        '633FinalData/img_test.csv',
    )
)

# Per-subject demo/viome tables (joined on "Subject ID" in Step 2).
demo_viome_train, demo_viome_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '633FinalData/demo_viome_train.csv',
        '633FinalData/demo_viome_test.csv',
    )
)

# CGM tables (the left side of the merges in Step 2).
cgm_train, cgm_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '633FinalData/cgm_train.csv',
        '633FinalData/cgm_test.csv',
    )
)

# Step 2: Combine the CGM rows with their labels and per-subject attributes.
# Labels are matched on ("Subject ID", "Day"); the demo/viome table is matched
# on "Subject ID" alone. Both joins are inner joins, so rows lacking a match
# on either side are dropped.
train_with_labels = cgm_train.merge(
    label_train, on=["Subject ID", "Day"], how="inner"
)
merged_train = train_with_labels.merge(
    demo_viome_train, on=["Subject ID"], how="inner"
)

test_with_labels = cgm_test.merge(
    label_test_breakfast_only, on=["Subject ID", "Day"], how="inner"
)
merged_test = test_with_labels.merge(
    demo_viome_test, on=["Subject ID"], how="inner"
)

# Step 3: Handle missing values
# Forward-fill each column from the previous row, then back-fill so that gaps
# at the very top of a column (which have no previous row) are filled too.
# DataFrame.ffill()/bfill() are used instead of fillna(method=...), which is
# deprecated in recent pandas releases and emits a FutureWarning.
# NOTE(review): the fill runs over the whole merged frame in row order, so a
# value can propagate from one subject's rows into the next subject's rows --
# confirm this is intended (a per-subject groupby fill would avoid it).
merged_train = merged_train.ffill().bfill()
merged_test = merged_test.ffill().bfill()

# Step 4: Standardize numerical data and encode categorical variables
numeric_cols = ['Age', 'Weight', 'Height', 'BMI']
categorical_cols = ['Gender', 'Race']

# Scaling statistics are fitted on the training frame only and then reused
# unchanged for the test frame.
scaler = StandardScaler()
merged_train[numeric_cols] = scaler.fit_transform(merged_train[numeric_cols])
merged_test[numeric_cols] = scaler.transform(merged_test[numeric_cols])

# `sparse_output=False` yields a dense array (this keyword replaces the older
# `sparse` argument). With handle_unknown='ignore', categories not seen during
# fit do not raise at transform time.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_train = encoder.fit_transform(merged_train[categorical_cols])
encoded_test = encoder.transform(merged_test[categorical_cols])

# Both encoded frames share the same generated column names.
onehot_names = encoder.get_feature_names_out(categorical_cols)
encoded_train_df = pd.DataFrame(encoded_train, columns=onehot_names)
encoded_test_df = pd.DataFrame(encoded_test, columns=onehot_names)

# Reset the row index so the positional (0..n-1) index of the encoded frames
# lines up during the column-wise concat. The original 'Gender' and 'Race'
# columns are kept alongside the new one-hot columns.
merged_train = pd.concat(
    [merged_train.reset_index(drop=True), encoded_train_df], axis=1
)
merged_test = pd.concat(
    [merged_test.reset_index(drop=True), encoded_test_df], axis=1
)

# Step 5: Write both preprocessed frames to CSV in the working directory,
# omitting the row index, then report the output file names.
for frame, out_path in (
    (merged_train, 'preprocessed_train.csv'),
    (merged_test, 'preprocessed_test.csv'),
):
    frame.to_csv(out_path, index=False)

print("Preprocessed data saved as 'preprocessed_train.csv' and 'preprocessed_test.csv'")

Preprocessed data saved as 'preprocessed_train.csv' and 'preprocessed_test.csv'
