In [None]:
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from datetime import datetime

# Steps 1-2: Load each table and strip its identifier columns in one pass.
# The label, CGM and image tables lose both "Subject ID" and "Day"; the
# demographic/Viome tables lose "Subject ID" only. The identifiers are
# removed so they cannot be used as features (to prevent overfitting).
# NOTE(review): once these keys are gone the tables can only be combined by
# row index (which is what the axis=1 concat further down does) — confirm
# that all files really share the same row order and granularity.
label_train = pd.read_csv('633FinalData/label_train.csv').drop(
    columns=["Subject ID", "Day"]
)
label_test_breakfast_only = pd.read_csv(
    '633FinalData/label_test_breakfast_only.csv'
).drop(columns=["Subject ID", "Day"])

# Image tables: the pixel data is stored inside the CSV columns.
img_train = pd.read_csv('633FinalData/img_train.csv').drop(
    columns=["Subject ID", "Day"]
)
img_test = pd.read_csv('633FinalData/img_test.csv').drop(
    columns=["Subject ID", "Day"]
)

# Demographic / Viome tables only carry "Subject ID" as an identifier here.
demo_viome_train = pd.read_csv('633FinalData/demo_viome_train.csv').drop(
    columns=["Subject ID"]
)
demo_viome_test = pd.read_csv('633FinalData/demo_viome_test.csv').drop(
    columns=["Subject ID"]
)

# Continuous glucose monitor (CGM) tables.
cgm_train = pd.read_csv('633FinalData/cgm_train.csv').drop(
    columns=["Subject ID", "Day"]
)
cgm_test = pd.read_csv('633FinalData/cgm_test.csv').drop(
    columns=["Subject ID", "Day"]
)

# Convert time to step
def to_step(t):
    """Return the 5-minute slot of the day that timestamp string ``t`` falls in.

    ``t`` must match the format ``'%Y-%m-%d %H:%M:%S'``. Only the hour and
    minute are used (date and seconds are ignored), so the result lies in
    ``0..287``.
    """
    moment = datetime.strptime(t, '%Y-%m-%d %H:%M:%S')
    minutes_since_midnight = 60 * moment.hour + moment.minute
    slot, _ = divmod(minutes_since_midnight, 5)
    return slot

def cgm_to_steps(cgm):
    """Spread one day of CGM readings over 288 five-minute slots.

    Args:
        cgm: Either an iterable of ``(timestamp, value)`` pairs, or the
            textual (Python-literal) form of such an iterable. The 'cgm'
            column is loaded with ``read_csv``, so its cells presumably
            arrive as strings — that form is parsed with ``literal_eval``.
            ``None``, NaN and blank strings are treated as "no readings".

    Returns:
        A list of 288 values indexed by 5-minute slot of the day. Slots
        without a reading hold 0; if several readings fall in the same slot
        the last one wins.

    Raises:
        ValueError, SyntaxError: if a string input is not a valid literal.
    """
    steps = [0] * 288

    # A missing CSV cell shows up as None or a float NaN (NaN != NaN).
    if cgm is None or (isinstance(cgm, float) and cgm != cgm):
        return steps

    if isinstance(cgm, str):
        if not cgm.strip():
            return steps
        # Iterating the raw string would yield single characters and break
        # the (timestamp, value) unpacking below, so decode it first.
        cgm = literal_eval(cgm)

    for t, value in cgm:
        steps[to_step(t)] = value
    return steps

def time_to_step(t1, t2):
    """Build a 288-slot indicator vector marking two timestamps.

    Args:
        t1, t2: Timestamp strings in the format accepted by ``to_step``.
            A value counts as missing when it is the placeholder ``'{}'``
            or is not a string at all (e.g. ``None`` or the float NaN that
            pandas uses for an empty CSV cell).

    Returns:
        A list of 288 integers with 1 in the slot of ``t1`` and in the slot
        of ``t2`` and 0 elsewhere. If either timestamp is missing the
        vector is all zeros.
    """
    steps = [0] * 288

    # Non-string cells would make strptime raise TypeError, so treat them
    # the same way as the explicit '{}' placeholder.
    if any(not isinstance(t, str) or t == '{}' for t in (t1, t2)):
        return steps

    steps[to_step(t1)] = 1
    steps[to_step(t2)] = 1
    return steps

# Clean the training CGM table: drop rows that have a missing value in any
# column, then drop rows where any cell contains the '{}' placeholder.
# NOTE(review): cgm_test is deliberately left unfiltered (the equivalent
# test-side filters were disabled), so cgm_to_steps / time_to_step must be
# able to cope with missing cells in the test table.
cgm_train = cgm_train.dropna()
# regex=False: '{}' is meant as a literal two-character marker, not a pattern.
has_placeholder = cgm_train.astype(str).apply(
    lambda col: col.str.contains('{}', regex=False)
).any(axis=1)
# .copy() gives an independent frame, so the column assignments below do not
# trigger pandas' SettingWithCopyWarning on a filtered view.
cgm_train = cgm_train[~has_placeholder].copy()

# Per-row 288-slot glucose sequence derived from the 'cgm' column.
cgm_test['cgm_sequential'] = cgm_test['cgm'].apply(cgm_to_steps)
cgm_train['cgm_sequential'] = cgm_train['cgm'].apply(cgm_to_steps)

# Per-row 288-slot indicator marking the breakfast and lunch slots.
cgm_train['when_to_eat'] = cgm_train[['Breakfast Time', 'Lunch Time']].apply(
    lambda x: time_to_step(x['Breakfast Time'], x['Lunch Time']), axis=1
)
cgm_test['when_to_eat'] = cgm_test[['Breakfast Time', 'Lunch Time']].apply(
    lambda x: time_to_step(x['Breakfast Time'], x['Lunch Time']), axis=1
)

# Step 3: Convert image data from string to numeric arrays
def parse_image_data(df, column_name, fallback_size=112 * 112 * 3):
    """Parse image data stored as strings into flattened numeric rows.

    Args:
        df: DataFrame holding the image column.
        column_name: Name of the column whose cells contain the textual
            (Python-literal) form of a nested pixel list.
        fallback_size: Length of the zero placeholder used only when no
            image in the column could be parsed at all.

    Returns:
        A DataFrame with one row per input row (default integer index) and
        one column per flattened pixel value. Rows that fail to parse are
        replaced by zeros whose length matches the longest successfully
        parsed image, so one bad cell cannot widen the frame and pad every
        other row with NaN.
    """
    parsed_data = []
    failed_rows = []
    for row_idx, img_str in enumerate(df[column_name]):
        try:
            # Convert the string representation to an array, then flatten
            # the (presumably 3D) image to 1D.
            img_flat = np.array(literal_eval(img_str)).flatten()
        except Exception as e:
            # Deliberately best-effort: report the problem and fill the row
            # with a placeholder once the real image length is known.
            print(f"Error parsing image data: {e}")
            failed_rows.append(row_idx)
            img_flat = None
        parsed_data.append(img_flat)

    if failed_rows:
        parsed_lengths = [len(img) for img in parsed_data if img is not None]
        placeholder_size = max(parsed_lengths) if parsed_lengths else fallback_size
        for row_idx in failed_rows:
            parsed_data[row_idx] = np.zeros(placeholder_size)

    return pd.DataFrame(parsed_data)

# Flatten the lunch image of every row in both splits.
# NOTE(review): only 'Image Before Lunch' is parsed; the table also has an
# 'Image Before Breakfast' column that is not used — confirm this is intended.
train_images = parse_image_data(img_train, 'Image Before Lunch')
test_images = parse_image_data(img_test, 'Image Before Lunch')

# Step 4: Scale the numeric demo/Viome columns and one-hot encode the
# object-typed ones. Column groups are decided from the training table only.
numerical_columns = demo_viome_train.select_dtypes(include=np.number).columns
categorical_columns = demo_viome_train.select_dtypes(include='object').columns

# Fit the scaler on the training split, then apply the same scaling to both.
scaler = StandardScaler()
scaler.fit(demo_viome_train[numerical_columns])
demo_viome_train[numerical_columns] = scaler.transform(demo_viome_train[numerical_columns])
demo_viome_test[numerical_columns] = scaler.transform(demo_viome_test[numerical_columns])

# Fit the encoder on the training split; categories that only occur in the
# test split are ignored (encoded as all zeros) rather than raising.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoder.fit(demo_viome_train[categorical_columns])
encoded_train = encoder.transform(demo_viome_train[categorical_columns])
encoded_test = encoder.transform(demo_viome_test[categorical_columns])

# Wrap the encoded arrays in DataFrames that carry the generated names.
encoded_feature_names = encoder.get_feature_names_out(categorical_columns)
encoded_train_df = pd.DataFrame(encoded_train, columns=encoded_feature_names)
encoded_test_df = pd.DataFrame(encoded_test, columns=encoded_feature_names)

# Append the encoded columns, then remove the raw categorical columns.
demo_viome_train = pd.concat(
    [demo_viome_train.reset_index(drop=True), encoded_train_df], axis=1
)
demo_viome_train = demo_viome_train.drop(columns=categorical_columns)
demo_viome_test = pd.concat(
    [demo_viome_test.reset_index(drop=True), encoded_test_df], axis=1
)
demo_viome_test = demo_viome_test.drop(columns=categorical_columns)

# Step 5: Merge datasets side by side (no Subject ID / Day keys are left).
# pd.concat(axis=1) aligns rows on their index labels, not on any key.
# NOTE(review): rows removed from cgm_train earlier keep their gaps in the
# index, so they reappear here as NaN in the CGM columns and are then filled
# from neighbouring rows in Step 6 — confirm that is acceptable.
# NOTE(review): only "Subject ID" (no "Day") was dropped from the demo/Viome
# tables, so they presumably hold one row per subject rather than per day;
# if so, index-based concatenation pairs them with the wrong rows — verify.
merged_train = pd.concat([cgm_train, label_train, demo_viome_train, train_images], axis=1)
merged_test = pd.concat([cgm_test, label_test_breakfast_only, demo_viome_test, test_images], axis=1)

# Step 6: Handle missing values: forward-fill, then back-fill what is left
# at the top. Uses .ffill()/.bfill() because fillna(method=...) is
# deprecated in recent pandas releases.
merged_train = merged_train.ffill().bfill()
merged_test = merged_test.ffill().bfill()

# Step 7: Save preprocessed data
merged_train.to_csv('preprocessed_train.csv', index=False)
merged_test.to_csv('preprocessed_test.csv', index=False)

print("Preprocessed data saved as 'preprocessed_train.csv' and 'preprocessed_test.csv'")


Preprocessed data saved as 'preprocessed_train.csv' and 'preprocessed_test.csv'


In [19]:
# Show the column names of the image training table (quick inspection).
print(img_train.columns)


Index(['Subject ID', 'Day', 'Image Before Breakfast', 'Image Before Lunch'], dtype='object')
