In [232]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.optim as optim

## Importing data

### Structuring Metadata

In [133]:
# Read the tab-separated patient metadata table
metadata_path = '../data/IPX0002106001/patient_metadata.txt'
metadata = pd.read_csv(metadata_path, sep='\t')

In [134]:
# Give the group column and the unnamed ID column descriptive labels
metadata_column_names = {"Group": "Condition", "Unnamed: 0": "patient"}
metadata = metadata.rename(columns=metadata_column_names)

In [135]:
# Show the column labels after renaming
metadata.columns

Index(['patient', 'Condition', 'Sex', 'Age', 'BMI'], dtype='object')

In [136]:
# Display the metadata table (one row per patient)
metadata

Unnamed: 0,patient,Condition,Sex,Age,BMI
0,XG1,non-Severe,Male,38,72
1,XG2,non-Severe,Male,36,46
2,XG3,non-Severe,Male,70,51
3,XG4,non-Severe,Male,33,66
4,XG5,non-Severe,Male,33,52
...,...,...,...,...,...
113,HC24,Healthy,Female,38,5
114,HC25,Healthy,Female,44,10
115,HC26,Healthy,Male,28,70
116,HC27,Healthy,Male,38,82


### Structuring the Metabolite data

In [137]:
# Read the tab-separated metabolite abundance table
metabolites_path = '../data/IPX0002106001/metabolites.txt'
metabolites = pd.read_csv(metabolites_path, sep='\t')

In [138]:
# Label the unnamed first column "patient"
metabolites = metabolites.rename(columns={"Unnamed: 0": "patient"})

In [139]:
# Flip the table so each row is a sample and each column a metabolite.
metabolites_transposed = metabolites.T
# After transposing, the first row holds the values of the former first
# column; promote it to the header ...
metabolites_transposed.columns = metabolites_transposed.iloc[0]
# ... and drop that row from the data.
metabolites = metabolites_transposed.iloc[1:]

In [140]:
# Display the transposed metabolite table (rows indexed by sample ID)
metabolites

patient,(14 or 15)-methylpalmitate (a17:0 or i17:0),(16 or 17)-methylstearate (a19:0 or i19:0),(2 or 3)-decenoate (10:1n7 or n8),"(2,4 or 2,5)-dimethylphenol sulfate",(R)-3-hydroxybutyrylcarnitine,(S)-3-hydroxybutyrylcarnitine,(S)-a-amino-omega-caprolactam,1-(1-enyl-oleoyl)-GPE (P-18:1)*,1-(1-enyl-palmitoyl)-2-arachidonoyl-GPC (P-16:0/20:4)*,1-(1-enyl-palmitoyl)-2-arachidonoyl-GPE (P-16:0/20:4)*,...,valine,valsartan,valylglycine,vanillactate,vanillic acid glycine,vanillic alcohol sulfate,vanillylmandelate (VMA),xanthine,xanthosine,xanthurenate
XG1,12287208.0,1516479.5,1165411.75,27345.4199,,685162.1875,,2336275.75,37113956.0,18988858.0,...,542268096.0,,383362.7188,,35317.8672,,118960.4453,27888098.0,,
XG2,8759014.0,1024312.625,350098.875,90617.0,,192375.125,1000798.5,,16333119.0,8128851.5,...,463664544.0,,260476.9688,42393.3555,88918.5156,,155431.125,11145432.0,,
XG3,7412357.0,830819.625,869458.125,,28797.8145,104754.8047,563600.875,761054.1875,21150526.0,12264365.0,...,522942304.0,,168115.3438,,17789.0762,,188695.8594,10964516.0,52828.0859,
XG4,7732122.0,960255.25,656987.6875,,59604.2031,113108.8984,,,12730246.0,3584093.75,...,324137376.0,,96054.5703,43287.7539,,,94876.2188,8384493.5,44683.1523,
XG5,9523999.0,1074755.625,1002209.75,,188893.0313,33130.3906,230701.2969,,11417010.0,,...,647586624.0,,252791.0313,30294.9844,,,82999.4688,16839694.0,33780.4805,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HC21,23069360.0,3914369.25,385073.5313,80074.3906,334862.1875,161101.1094,821534.625,1996888.5,34378460.0,21325232.0,...,778170176.0,,255246.2031,33706.8555,54509.2891,,275012.125,12546437.0,,
HC22,15931674.0,2573461.25,936223.0,87698.4766,,129839.5391,591396.3125,2149380.5,28708918.0,22622132.0,...,835631424.0,,297866.1563,,72499.625,,182799.4063,19929702.0,54224.9805,
HC23,9796969.0,1019916.188,,86051.6484,213416.8906,189071.4063,785787.1875,1384169.625,32048208.0,20300660.0,...,695492800.0,,376211.125,44411.6484,27215.4961,,207670.0469,13685490.0,,
HC24,8996918.0,1243547.375,897352.1875,111980.1719,244450.7656,323236.25,1011555.25,1890843.0,24575472.0,12957257.0,...,895130368.0,,359051.9063,27414.959,209766.2188,121781.9531,365662.4688,20889018.0,,


### Merging the metabolite data with metadata to ensure consistency

In [141]:
# Join metadata rows to metabolite rows by matching the "patient" column
# against the metabolite table's index. No `how` is given, so pandas uses its
# default inner join: patients missing from either table are dropped.
data = metadata.merge(metabolites, left_on="patient", right_index=True)

In [142]:
# Display the merged table (metadata columns followed by metabolite columns)
data

Unnamed: 0,patient,Condition,Sex,Age,BMI,(14 or 15)-methylpalmitate (a17:0 or i17:0),(16 or 17)-methylstearate (a19:0 or i19:0),(2 or 3)-decenoate (10:1n7 or n8),"(2,4 or 2,5)-dimethylphenol sulfate",(R)-3-hydroxybutyrylcarnitine,...,valine,valsartan,valylglycine,vanillactate,vanillic acid glycine,vanillic alcohol sulfate,vanillylmandelate (VMA),xanthine,xanthosine,xanthurenate
0,XG1,non-Severe,Male,38,72,12287208.0,1516479.5,1165411.75,27345.4199,,...,542268096.0,,383362.7188,,35317.8672,,118960.4453,27888098.0,,
1,XG2,non-Severe,Male,36,46,8759014.0,1024312.625,350098.875,90617.0,,...,463664544.0,,260476.9688,42393.3555,88918.5156,,155431.125,11145432.0,,
2,XG3,non-Severe,Male,70,51,7412357.0,830819.625,869458.125,,28797.8145,...,522942304.0,,168115.3438,,17789.0762,,188695.8594,10964516.0,52828.0859,
3,XG4,non-Severe,Male,33,66,7732122.0,960255.25,656987.6875,,59604.2031,...,324137376.0,,96054.5703,43287.7539,,,94876.2188,8384493.5,44683.1523,
4,XG5,non-Severe,Male,33,52,9523999.0,1074755.625,1002209.75,,188893.0313,...,647586624.0,,252791.0313,30294.9844,,,82999.4688,16839694.0,33780.4805,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,HC21,Healthy,Male,55,22,23069360.0,3914369.25,385073.5313,80074.3906,334862.1875,...,778170176.0,,255246.2031,33706.8555,54509.2891,,275012.125,12546437.0,,
111,HC22,Healthy,Male,51,65,15931674.0,2573461.25,936223.0,87698.4766,,...,835631424.0,,297866.1563,,72499.625,,182799.4063,19929702.0,54224.9805,
112,HC23,Healthy,Male,43,22,9796969.0,1019916.188,,86051.6484,213416.8906,...,695492800.0,,376211.125,44411.6484,27215.4961,,207670.0469,13685490.0,,
113,HC24,Healthy,Female,38,5,8996918.0,1243547.375,897352.1875,111980.1719,244450.7656,...,895130368.0,,359051.9063,27414.959,209766.2188,121781.9531,365662.4688,20889018.0,,


## Data Preprocessing

### We first preprocess the metadata
#### Normalize continuous variables
#### Perform one-hot encoding on the categorical variable

In [202]:
# Covariate columns at positions 2-4 of the merged table (Sex, Age, BMI).
# The patient ID (position 0) and Condition (position 1) are excluded.
# .copy() makes this an independent frame: the column assignment in the
# scaling cell would otherwise write into a slice of `data` and raise
# pandas' SettingWithCopyWarning.
metadata = data.iloc[:, 2:5].copy()

In [203]:
# Z-score the continuous covariates (Age, BMI)
continuous_features = ['Age', 'BMI']
scaler = StandardScaler()
scaled_covariates = scaler.fit_transform(metadata[continuous_features])
metadata[continuous_features] = scaled_covariates

In [204]:
# Turn the categorical Sex column into indicator columns; drop_first=True
# removes the first level so only one indicator remains per binary category.
metadata = pd.get_dummies(
    metadata,
    columns=['Sex'],
    drop_first=True,
)

In [211]:
# Print the dtype of each preprocessed metadata column
print(metadata.dtypes)

Age         float64
BMI         float64
Sex_Male       bool
dtype: object


### Preprocessing metabolite data
#### First ensure all values are numeric
#### Impute missing values with the column mean
#### Normalize the data

In [191]:
# Metabolite measurements start at the 6th column (position 5) of the merged
# table; everything before that is patient ID and metadata.
first_metabolite_position = 5
metabolites = data.iloc[:, first_metabolite_position:]

In [194]:
# Convert every column to a numeric dtype; values that cannot be parsed
# become NaN (errors='coerce').
metabolites = metabolites.apply(
    lambda column: pd.to_numeric(column, errors='coerce')
)

In [196]:
def _fill_with_column_mean(column):
    """Replace the missing entries of one column with that column's mean."""
    return column.fillna(column.mean())


# Impute column by column (axis=0 hands each column to the helper)
metabolites = metabolites.apply(_fill_with_column_mean, axis=0)

In [197]:
# Standardize each metabolite column; the fitted scaler is kept in
# `metabolite_scaler`, and `metabolites` becomes the scaled output.
metabolite_scaler = StandardScaler()
metabolite_scaler.fit(metabolites)
metabolites = metabolite_scaler.transform(metabolites)

In [227]:
# Place the metadata columns first, then the metabolite columns, in one matrix
feature_blocks = [metadata, metabolites]
X = np.hstack(feature_blocks)

In [228]:
# Cast the combined feature matrix to 32-bit floats
feature_dtype = np.float32
X = X.astype(feature_dtype)

In [229]:
# Convert the feature matrix into a float32 torch tensor
feature_matrix = X
X = torch.tensor(feature_matrix, dtype=torch.float32)

In [230]:
# Check the dimensions of the feature tensor
X.shape

torch.Size([96, 944])

## Encoding the target variable

In [222]:
# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Convert categorical labels to numeric
Y = label_encoder.fit_transform(data['Condition'])

# Convert to PyTorch tensor
Y = torch.tensor(Y, dtype=torch.float32).view(-1, 1)

In [231]:
# Check the dimensions of the target tensor
Y.shape

torch.Size([96, 1])