In [20]:
import pandas as pd
import numpy as np
import torchvision.transforms as transforms
from PIL import Image
from io import BytesIO
import ast
import re
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_sequence
from torch.utils.data import DataLoader, Dataset

In [21]:
img_train = pd.read_csv('633FinalData/img_train.csv')
cgm_train = pd.read_csv('633FinalData/cgm_train.csv')
demo_viome_train = pd.read_csv('633FinalData/demo_viome_train.csv')
label_train = pd.read_csv('633FinalData/label_train.csv')
img_test = pd.read_csv('633FinalData/img_test.csv')
cgm_test = pd.read_csv('633FinalData/cgm_test.csv')
demo_viome_test= pd.read_csv('633FinalData/demo_viome_test.csv')
label_test = pd.read_csv('633FinalData/label_test_breakfast_only.csv')

In [22]:
def convert_image(image_str):
    image_data = ast.literal_eval(image_str)
    image_array = np.array(image_data, dtype=np.uint8)
    return image_array.astype(np.float32) / 255.0


In [23]:
def convert_str(image_str):
    image_data = ast.literal_eval(image_str)
    image_array = np.array(image_data, dtype=np.float64)
    return image_array

In [24]:
def data_preprocess(img_train,cgm_train,label_train,demo_viome_train,train=0):
    img_train = img_train.drop('Subject ID', axis=1)
    cgm_train = cgm_train.drop('Subject ID', axis=1)
    label_train = label_train.drop('Subject ID', axis=1)
    demo_viome_train = demo_viome_train.drop('Subject ID', axis=1)
    img_train = img_train.drop('Day', axis=1)
    cgm_train = cgm_train.drop('Day', axis=1)
    label_train = label_train.drop('Day', axis=1)
    
    repeated_demo_viome_train = demo_viome_train.loc[demo_viome_train.index.repeat(9)].reset_index(drop=True)
    
    
    combined_data = pd.concat([img_train, cgm_train, label_train,repeated_demo_viome_train], axis=1)
    combined_data = combined_data.dropna()
    
    idx = []
    for i in range (combined_data.shape[0]):
        for j in range (combined_data.shape[1]):
            cell = combined_data.iloc[i, j]
            if isinstance(cell, str) and len(cell) == 2:  # Checking string length
                idx.append(i)
    
    combined_data.drop(idx, inplace=True)
    
    combined_data['Breakfast Time'] = pd.to_datetime(combined_data['Breakfast Time'])
    combined_data['Lunch Time'] = pd.to_datetime(combined_data['Lunch Time'])
    
    combined_data['Breakfast minute'] = combined_data['Breakfast Time'].dt.hour*60+combined_data['Breakfast Time'].dt.minute
    combined_data['Lunch minute'] = combined_data['Lunch Time'].dt.hour*60+combined_data['Lunch Time'].dt.minute
    
    combined_data['cgm_numbers'] = combined_data.iloc[:, 4].apply(lambda x: [float(num) for num in re.findall(r",\s([\d\.]+)\)", x)])
    
    combined_data['Race'] = pd.Categorical(combined_data['Race'], categories=['Hispanic/Latino', 'White', 'Other'])
    
    # If needed, convert the categories into numerical codes
    combined_data['Race_Categorical'] = combined_data['Race'].cat.codes
    
    
    combined_data = combined_data.drop(['Breakfast Time','Lunch Time','CGM Data','Race'], axis=1)
    
    # Print all column names
    print(combined_data.columns.tolist())
    
    
    
    combined_data['Viome'] = combined_data['Viome'].apply(convert_str)
    
    img_set = combined_data[['Image Before Breakfast', 'Image Before Lunch']]
    rest = combined_data.drop(columns=['Image Before Breakfast', 'Image Before Lunch'])
    
    if train:
        label = combined_data[['Lunch Calories','Lunch Protein','Lunch Carbs','Lunch Fat']]
        rest = rest.drop(columns=['Lunch Calories','Lunch Protein','Lunch Carbs','Lunch Fat'])
    
    catagorical = combined_data[['Gender','Diabetes Status','Race_Categorical']]
    rest = rest.drop(columns=['Gender','Diabetes Status','Race_Categorical'])
    
    time_set  = combined_data[['cgm_numbers','Viome']]
    continues = rest.drop(columns=['cgm_numbers','Viome'])
    variable_sequence_tensors = [torch.tensor(seq, dtype=torch.float32) for seq in time_set['cgm_numbers']]
    fixed_sequence_tensors = torch.tensor(time_set['Viome'].tolist(), dtype=torch.float32)  # Already uniform length
    
    # Pad the variable-length sequences
    padded_variable_sequences = pad_sequence(variable_sequence_tensors, batch_first=True, padding_value=0)
    

    

    
    
    img_set['Image Before Breakfast'] = img_set['Image Before Breakfast'].apply(convert_image)
    img_set['Image Before Lunch'] = img_set['Image Before Lunch'].apply(convert_image)
    
    # Convert numpy arrays into tensors and stack them
    img_tensors_breakfast = torch.stack([torch.tensor(img) for img in img_set['Image Before Breakfast']])
    img_tensors_lunch = torch.stack([torch.tensor(img) for img in img_set['Image Before Lunch']])
    
    
    
    scaler = MinMaxScaler()  # Or StandardScaler for standardization
    X_train_scaled = scaler.fit_transform(continues)

    
    if train:
        # Ensure labels are numeric and then convert to tensor
        label = label.apply(pd.to_numeric, errors='coerce')  # Convert to numeric, coercing errors
        label_tensor = torch.tensor(label.values, dtype=torch.float32)
    
    # Ensure categorical data is numeric and then convert to tensor
    catagorical = catagorical.apply(pd.to_numeric, errors='coerce')
    catagorical_tensor = torch.tensor(catagorical.values, dtype=torch.float32)

    if train:
        return img_tensors_breakfast,img_tensors_lunch, label_tensor, catagorical_tensor, padded_variable_sequences, fixed_sequence_tensors
    else:
        return img_tensors_breakfast,img_tensors_lunch, catagorical_tensor, padded_variable_sequences, fixed_sequence_tensors


In [25]:
img_breakfast_train,img_lunch_train, label_train,catagorical_train,padded_variable_sequences_train,fixed_sequence_tensors_train = data_preprocess(img_train,cgm_train,label_train,demo_viome_train,train=1)
img_breakfast_test,img_lunch_test, catagorical_test,padded_variable_sequences_test,fixed_sequence_tensors_test = data_preprocess(img_test,cgm_test,label_test,demo_viome_test,train=0)

['Image Before Breakfast', 'Image Before Lunch', 'Breakfast Calories', 'Lunch Calories', 'Breakfast Carbs', 'Lunch Carbs', 'Breakfast Fat', 'Lunch Fat', 'Breakfast Protein', 'Lunch Protein', 'Age', 'Gender', 'Weight', 'Height', 'Diabetes Status', 'A1C', 'Baseline Fasting Glucose', 'Insulin', 'Triglycerides', 'Cholesterol', 'HDL', 'Non-HDL', 'LDL', 'VLDL', 'CHO/HDL Ratio', 'HOMA-IR', 'BMI', 'Viome', 'Breakfast minute', 'Lunch minute', 'cgm_numbers', 'Race_Categorical']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  img_set['Image Before Breakfast'] = img_set['Image Before Breakfast'].apply(convert_image)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  img_set['Image Before Lunch'] = img_set['Image Before Lunch'].apply(convert_image)


['Image Before Breakfast', 'Image Before Lunch', 'Breakfast Calories', 'Breakfast Carbs', 'Breakfast Fat', 'Breakfast Protein', 'Age', 'Gender', 'Weight', 'Height', 'Diabetes Status', 'A1C', 'Baseline Fasting Glucose', 'Insulin', 'Triglycerides', 'Cholesterol', 'HDL', 'Non-HDL', 'LDL', 'VLDL', 'CHO/HDL Ratio', 'HOMA-IR', 'BMI', 'Viome', 'Breakfast minute', 'Lunch minute', 'cgm_numbers', 'Race_Categorical']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  img_set['Image Before Breakfast'] = img_set['Image Before Breakfast'].apply(convert_image)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  img_set['Image Before Lunch'] = img_set['Image Before Lunch'].apply(convert_image)


In [26]:
class CustomTrainDataset(Dataset):
    def __init__(self, img_breakfast,img_lunch, labels, categoricals, sequences, fixed_sequences):
        self.img_breakfast = img_breakfast
        self.img_lunch = img_breakfast
        self.labels = labels
        self.categoricals = categoricals
        self.sequences = sequences
        self.fixed_sequences = fixed_sequences

    def __len__(self):
        # Assuming all tensors have the same first dimension size
        return len(self.img_breakfast)

    def __getitem__(self, idx):
        # Fetch each tensor's slice at the given index
        return {
            'breakfast_images': self.img_breakfast[idx],
            'lunch_images': self.img_lunch[idx],
            'labels': self.labels[idx],
            'categoricals': self.categoricals[idx],
            'sequences': self.sequences[idx],
            'fixed_sequences': self.fixed_sequences[idx]
        }

In [27]:
class CustomTestDataset(Dataset):
    def __init__(self, img_breakfast,img_lunch, categoricals, sequences, fixed_sequences):
        self.img_breakfast = img_breakfast
        self.img_lunch = img_breakfast
        self.categoricals = categoricals
        self.sequences = sequences
        self.fixed_sequences = fixed_sequences

    def __len__(self):
        # Assuming all tensors have the same first dimension size
        return len(self.img_breakfast)

    def __getitem__(self, idx):
        # Fetch each tensor's slice at the given index
        return {
            'breakfast_images': self.img_breakfast[idx],
            'lunch_images': self.img_lunch[idx],
            'categoricals': self.categoricals[idx],
            'sequences': self.sequences[idx],
            'fixed_sequences': self.fixed_sequences[idx]
        }

In [28]:
# Assume tensors have already been defined as img_tensors, label_tensor, etc.
train_dataset = CustomTrainDataset(img_breakfast_train,img_lunch_train, label_train,catagorical_train,padded_variable_sequences_train,fixed_sequence_tensors_train)

# Define DataLoader with batch size, shuffling, etc.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4)


test_dataset = CustomTestDataset(img_breakfast_test,img_lunch_test, catagorical_train,padded_variable_sequences_train,fixed_sequence_tensors_train)

# Define DataLoader with batch size, shuffling, etc.
test_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4)