In [None]:
import os
import pandas as pd
import numpy as np
import pickle

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# PyTorch Geometric imports
from torch_geometric.data import Data, InMemoryDataset, DataLoader
from torch_geometric.nn import GATConv, global_mean_pool

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

#############################
# Data Preprocessing
#############################

# Load genetic profiles data
genetic_file_path = r"C:\Users\trejan\Desktop\Sem 2\Machine Learning\model\new_genetic_profiles.csv"
genetic_df = pd.read_csv(genetic_file_path)
genetic_df.columns = genetic_df.columns.str.strip()

# Column groups used throughout preprocessing
categorical_columns = ["Diet_Type", "Physical_Activity"]
variant_columns = ["MC4R_Variant", "PPARG_Variant", "FTO_Variant", "LEPR_Variant"]
structured_features = ["Age", "BMI", "Physical_Activity", "Diet_Type"]
# For gene branch, for each gene we take two features: presence and variant
gene_list = ["MC4R", "PPARG", "FTO", "LEPR"]
gene_feature_cols = []
for gene in gene_list:
    gene_feature_cols.append(f"{gene}_Present")
    gene_feature_cols.append(f"{gene}_Variant")

# Use the "None" placeholder only in the label-encoded columns. Filling the
# whole frame would turn any numeric column containing a gap into an object
# column and break the binning / scaling steps below.
placeholder_columns = categorical_columns + variant_columns
genetic_df[placeholder_columns] = genetic_df[placeholder_columns].fillna("None")

# Numeric inputs cannot be imputed with a string; fail early with a clear message.
numeric_columns = ["Age", "BMI", "Obesity_Risk_Score"] + [f"{g}_Present" for g in gene_list]
missing_counts = genetic_df[numeric_columns].isna().sum()
if missing_counts.any():
    raise ValueError(
        "Missing values in numeric columns: "
        f"{missing_counts[missing_counts > 0].to_dict()}"
    )

# Create obesity risk category from Obesity_Risk_Score.
# include_lowest=True keeps a score of exactly 0 in the 'Low' bin instead of
# turning it into NaN (pd.cut intervals are open on the left by default).
genetic_df['Obesity_Risk_Category'] = pd.cut(
    genetic_df['Obesity_Risk_Score'],
    bins=[0, 0.5, 0.8, 1],
    labels=['Low', 'Medium', 'High'],
    include_lowest=True
)

# Create BMI category using cut
genetic_df['BMI_Category'] = pd.cut(
    genetic_df['BMI'],
    bins=[0, 18.5, 24.9, 29.9, np.inf],
    labels=['Underweight', 'Normal', 'Overweight', 'Obese']
)

# A value outside the bins becomes NaN, which LabelEncoder would encode as an
# additional class and push the target index past the model's output size.
for target_col in ("Obesity_Risk_Category", "BMI_Category"):
    if genetic_df[target_col].isna().any():
        raise ValueError(f"{target_col} contains values outside the configured bins.")

# Encode categorical non-gene features
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    genetic_df[col] = le.fit_transform(genetic_df[col])
    label_encoders[col] = le

# Encode gene variant columns (simulate gene expression values by encoding variant info)
for col in variant_columns:
    genetic_df[col] = genetic_df[col].astype(str)
    le = LabelEncoder()
    genetic_df[col] = le.fit_transform(genetic_df[col])
    label_encoders[col] = le

# Structured features and gene features are kept as two separate inputs
X_structured = genetic_df[structured_features].copy()
X_gene = genetic_df[gene_feature_cols].copy()

# Encode targets
target_le_risk = LabelEncoder()
y_risk = target_le_risk.fit_transform(genetic_df['Obesity_Risk_Category'])
target_le_bmi = LabelEncoder()
y_bmi = target_le_bmi.fit_transform(genetic_df['BMI_Category'])

#############################
# Train-test split indices
#############################
train_idx, test_idx = train_test_split(np.arange(len(genetic_df)), test_size=0.2, random_state=42)

#############################
# Scale structured features
#############################
# Fit the scaler on the training rows only so that test-set statistics do not
# leak into training; the fitted scaler is then applied to every row, so
# X_structured_scaled still covers the full dataset and is indexed by
# train_idx / test_idx downstream.
scaler = StandardScaler()
scaler.fit(X_structured.iloc[train_idx])
X_structured_scaled = scaler.transform(X_structured)

#############################
# Create a PyTorch Geometric Dataset
#############################

class _GeneGraphData(Data):
    """Per-individual gene graph that also carries structured features and labels.

    ``gene_data`` is a read-only alias for the object itself, so code written
    as ``sample.gene_data.x`` / ``batch.gene_data.edge_index`` keeps working
    even though the graph is stored flat rather than as a nested object.
    NOTE(review): for mini-batches this relies on the batch class inheriting
    from the sample class (PyG >= 2.0) -- confirm against the installed version.
    """

    @property
    def gene_data(self):
        return self


class GeneticDataset(InMemoryDataset):
    """In-memory dataset with one small gene graph per individual.

    Each sample is a single flat graph object with:
      * ``x``           -- [num_genes, 2] node features ([Present, Variant] per gene)
      * ``edge_index``  -- fully connected directed edges without self-loops
      * ``struct_feat`` -- [1, num_structured_features] scaled structured features
      * ``y_risk`` / ``y_bmi`` -- [1] class indices

    The graph used to be nested as a ``Data`` inside another ``Data``. PyG's
    collation only merges tensor attributes of the top-level object, so the
    nested graphs were never combined and no ``batch`` vector was produced.
    ``struct_feat`` is stored with a leading dimension of 1 so that
    concatenation along dim 0 yields ``[batch_size, num_features]``.

    Args:
        structured_data: 2-D array of scaled structured features (all rows).
        gene_data: DataFrame whose columns are ordered as
            ``[gene0_Present, gene0_Variant, gene1_Present, ...]`` (all rows).
        y_risk: array of encoded obesity-risk labels (all rows).
        y_bmi: array of encoded BMI-category labels (all rows).
        indices: row positions that belong to this split.
        transform: optional transform forwarded to ``InMemoryDataset``.
    """

    def __init__(self, structured_data, gene_data, y_risk, y_bmi, indices, transform=None):
        # Keep only the rows of this split; positions stay aligned across all four inputs.
        self.structured_data = structured_data[indices]
        self.gene_data = gene_data.iloc[indices].reset_index(drop=True)
        self.y_risk = y_risk[indices]
        self.y_bmi = y_bmi[indices]
        super(GeneticDataset, self).__init__('.', transform, None, None)
        self.data, self.slices = self.process_data()

    def process_data(self):
        """Build one graph per individual and collate them into (data, slices)."""
        gene_array = self.gene_data.to_numpy(dtype=np.float32)
        # Two feature columns per gene: [Present, Variant]
        num_genes = gene_array.shape[1] // 2

        # The connectivity is identical for every individual: every ordered
        # pair of distinct genes. reshape(-1, 2) keeps a valid [2, 0] shape
        # when there is only a single gene and therefore no edges.
        pairs = [(src, dst)
                 for src in range(num_genes)
                 for dst in range(num_genes)
                 if src != dst]
        edge_index = torch.tensor(pairs, dtype=torch.long).reshape(-1, 2).t().contiguous()

        data_list = []
        for struct_row, gene_row, risk, bmi in zip(
                self.structured_data, gene_array, self.y_risk, self.y_bmi):
            data_list.append(_GeneGraphData(
                x=torch.tensor(gene_row.reshape(-1, 2), dtype=torch.float),
                edge_index=edge_index.clone(),
                # Leading dim of 1 so batching stacks rows instead of flattening them.
                struct_feat=torch.tensor(struct_row, dtype=torch.float).unsqueeze(0),
                y_risk=torch.tensor([int(risk)], dtype=torch.long),
                y_bmi=torch.tensor([int(bmi)], dtype=torch.long),
            ))
        return self.collate(data_list)

    def get(self, idx):
        """Return the sample at position ``idx`` (delegates to ``InMemoryDataset``)."""
        return super(GeneticDataset, self).get(idx)

# Both splits are views onto the same full-length arrays; only the row indices differ.
_dataset_sources = (X_structured_scaled, X_gene, y_risk, y_bmi)
train_dataset = GeneticDataset(*_dataset_sources, train_idx)
test_dataset = GeneticDataset(*_dataset_sources, test_idx)

# Mini-batch loaders: shuffle only the training split.
BATCH_SIZE = 32
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

#############################
# Define the Integrated Model
#############################
class IntegratedModel(nn.Module):
    """Two-branch multi-task classifier over structured features and a gene graph.

    The structured branch is a two-layer MLP. The graph branch is a single
    multi-head GAT layer followed by mean pooling over the nodes of each
    graph. Both outputs are concatenated, passed through two shared linear
    layers, and fed to two linear heads (obesity risk and BMI category).

    Args:
        structured_input_dim: number of structured input features.
        struct_hidden_dim: width of the structured branch.
        gat_in_dim: number of features per gene node.
        gat_hidden_dim: output channels per attention head of the GAT layer.
        gat_out_dim: accepted for interface compatibility; not used by any layer.
        combined_hidden_dim: width of the shared layers after concatenation.
        risk_out_dim: number of obesity-risk classes.
        bmi_out_dim: number of BMI classes.
        gat_heads: number of attention heads (concatenated). Defaults to 2,
            the value that was previously hard-coded.
    """

    def __init__(self, structured_input_dim, struct_hidden_dim,
                 gat_in_dim, gat_hidden_dim, gat_out_dim,
                 combined_hidden_dim, risk_out_dim, bmi_out_dim,
                 gat_heads=2):
        super(IntegratedModel, self).__init__()
        # Structured branch
        self.fc_struct1 = nn.Linear(structured_input_dim, struct_hidden_dim)
        self.fc_struct2 = nn.Linear(struct_hidden_dim, struct_hidden_dim)

        # Graph branch (for gene data): one GAT layer with concatenated heads
        self.gat_conv = GATConv(gat_in_dim, gat_hidden_dim, heads=gat_heads, concat=True)

        # With concat=True the GAT output width is heads * out_channels; derive
        # it from the same head count so the two can never drift apart.
        gene_readout_dim = gat_hidden_dim * gat_heads

        # Combined branch: structured output concatenated with the pooled gene readout
        self.fc_comb1 = nn.Linear(struct_hidden_dim + gene_readout_dim, combined_hidden_dim)
        self.fc_comb2 = nn.Linear(combined_hidden_dim, combined_hidden_dim)

        # Multi-task heads
        self.fc_risk = nn.Linear(combined_hidden_dim, risk_out_dim)
        self.fc_bmi = nn.Linear(combined_hidden_dim, bmi_out_dim)

    def forward(self, struct_feat, gene_data, batch):
        """Compute logits for both tasks.

        Args:
            struct_feat: structured features, one row per graph.
            gene_data: object exposing ``x`` (node features) and ``edge_index``.
            batch: node-to-graph assignment vector used for mean pooling.

        Returns:
            Tuple ``(out_risk, out_bmi)`` of unnormalised class scores.
        """
        # Structured branch
        x_struct = F.relu(self.fc_struct1(struct_feat))
        x_struct = F.relu(self.fc_struct2(x_struct))

        # Graph branch: attention layer, non-linearity, then one vector per graph
        x_gene = self.gat_conv(gene_data.x, gene_data.edge_index)
        x_gene = F.elu(x_gene)
        x_gene = global_mean_pool(x_gene, batch)

        # Fuse both branches
        combined = torch.cat([x_struct, x_gene], dim=1)
        combined = F.relu(self.fc_comb1(combined))
        combined = F.relu(self.fc_comb2(combined))

        # Two task outputs
        out_risk = self.fc_risk(combined)
        out_bmi = self.fc_bmi(combined)
        return out_risk, out_bmi

# Model hyperparameters
structured_input_dim = X_structured_scaled.shape[-1]   # number of structured feature columns
struct_hidden_dim = 32
gat_in_dim = 2        # per gene node: [Present, Variant]
gat_hidden_dim = 8
gat_out_dim = 16      # passed through to the model, which does not use it
combined_hidden_dim = 32
risk_out_dim = 3      # Low / Medium / High
bmi_out_dim = 4       # Underweight / Normal / Overweight / Obese

# Pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = IntegratedModel(
    structured_input_dim=structured_input_dim,
    struct_hidden_dim=struct_hidden_dim,
    gat_in_dim=gat_in_dim,
    gat_hidden_dim=gat_hidden_dim,
    gat_out_dim=gat_out_dim,
    combined_hidden_dim=combined_hidden_dim,
    risk_out_dim=risk_out_dim,
    bmi_out_dim=bmi_out_dim,
).to(device)

#############################
# Training Setup
#############################
# One cross-entropy criterion is shared by both classification heads
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

def train_epoch(model, loader):
    """Run one optimisation pass over ``loader``.

    Uses the module-level ``optimizer``, ``criterion`` and ``device``.

    Returns:
        The summed risk + BMI loss, averaged over ``len(loader.dataset)``.
    """
    model.train()
    running_loss = 0
    for batch in loader:
        optimizer.zero_grad()

        # Move the inputs and both label vectors to the compute device
        features = batch.struct_feat.to(device)
        graph = batch.gene_data.to(device)
        node_to_graph = batch.batch.to(device)  # consumed by the pooling step in the model
        risk_target = batch.y_risk.to(device)
        bmi_target = batch.y_bmi.to(device)

        risk_logits, bmi_logits = model(features, graph, node_to_graph)
        # The two task losses are summed with equal weight
        loss = criterion(risk_logits, risk_target) + criterion(bmi_logits, bmi_target)
        loss.backward()
        optimizer.step()

        # Weight the batch mean by its size so the epoch average is per sample
        running_loss += loss.item() * batch.num_graphs
    return running_loss / len(loader.dataset)

def test_epoch(model, loader):
    """Evaluate ``model`` on ``loader`` without updating any weights.

    Uses the module-level ``criterion`` and ``device``.

    Returns:
        Tuple ``(avg_loss, acc_risk, acc_bmi)``, each normalised by
        ``len(loader.dataset)``.
    """
    model.eval()
    running_loss = 0
    hits_risk = 0
    hits_bmi = 0
    with torch.no_grad():
        for batch in loader:
            features = batch.struct_feat.to(device)
            graph = batch.gene_data.to(device)
            node_to_graph = batch.batch.to(device)
            risk_target = batch.y_risk.to(device)
            bmi_target = batch.y_bmi.to(device)

            risk_logits, bmi_logits = model(features, graph, node_to_graph)
            loss = criterion(risk_logits, risk_target) + criterion(bmi_logits, bmi_target)
            running_loss += loss.item() * batch.num_graphs

            # The predicted class is the index of the largest logit
            hits_risk += (risk_logits.argmax(dim=1) == risk_target).sum().item()
            hits_bmi += (bmi_logits.argmax(dim=1) == bmi_target).sum().item()

    num_samples = len(loader.dataset)
    return running_loss / num_samples, hits_risk / num_samples, hits_bmi / num_samples

#############################
# Training Loop
#############################
num_epochs = 50
for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_loader)
    test_loss, test_acc_risk, test_acc_bmi = test_epoch(model, test_loader)
    # Report every fifth epoch only
    if epoch % 5 != 0:
        continue
    print(f"Epoch {epoch}: Train Loss {train_loss:.4f} | Test Loss {test_loss:.4f} | "
          f"Risk Acc {test_acc_risk:.4f} | BMI Acc {test_acc_bmi:.4f}")

#############################
# Save the Integrated Model and Artifacts
#############################
save_dir = r"C:\Users\trejan\Desktop\GNN\Saved models"
os.makedirs(save_dir, exist_ok=True)

# Model weights only (state dict), not the full module object
torch.save(model.state_dict(), os.path.join(save_dir, "integrated_model.pt"))

# Preprocessing objects needed to reproduce the feature and label encodings
artifacts = {
    "label_encoders.pkl": label_encoders,
    "target_encoders.pkl": {'risk': target_le_risk, 'bmi': target_le_bmi},
    "scaler.pkl": scaler,
}
for artifact_name, artifact in artifacts.items():
    with open(os.path.join(save_dir, artifact_name), "wb") as f:
        pickle.dump(artifact, f)

print("Integrated model and preprocessing artifacts saved successfully.")


In [None]:
import os
import pandas as pd
import numpy as np
import pickle

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# PyTorch Geometric imports
from torch_geometric.data import Data, InMemoryDataset, DataLoader
from torch_geometric.nn import GATConv, global_mean_pool

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

#############################
# Data Preprocessing
#############################

# Load genetic profiles data
genetic_file_path = r"C:\Users\trejan\Desktop\Sem 2\Machine Learning\model\new_genetic_profiles.csv"
genetic_df = pd.read_csv(genetic_file_path)
genetic_df.columns = genetic_df.columns.str.strip()

# Column groups used throughout preprocessing
categorical_columns = ["Diet_Type", "Physical_Activity"]
variant_columns = ["MC4R_Variant", "PPARG_Variant", "FTO_Variant", "LEPR_Variant"]
structured_features = ["Age", "BMI", "Physical_Activity", "Diet_Type"]
# For gene branch, for each gene we take two features: presence and variant
gene_list = ["MC4R", "PPARG", "FTO", "LEPR"]
gene_feature_cols = []
for gene in gene_list:
    gene_feature_cols.append(f"{gene}_Present")
    gene_feature_cols.append(f"{gene}_Variant")

# Use the "None" placeholder only in the label-encoded columns. Filling the
# whole frame would turn any numeric column containing a gap into an object
# column and break the binning / scaling steps below.
placeholder_columns = categorical_columns + variant_columns
genetic_df[placeholder_columns] = genetic_df[placeholder_columns].fillna("None")

# Numeric inputs cannot be imputed with a string; fail early with a clear message.
numeric_columns = ["Age", "BMI", "Obesity_Risk_Score"] + [f"{g}_Present" for g in gene_list]
missing_counts = genetic_df[numeric_columns].isna().sum()
if missing_counts.any():
    raise ValueError(
        "Missing values in numeric columns: "
        f"{missing_counts[missing_counts > 0].to_dict()}"
    )

# Create obesity risk category from Obesity_Risk_Score.
# include_lowest=True keeps a score of exactly 0 in the 'Low' bin instead of
# turning it into NaN (pd.cut intervals are open on the left by default).
genetic_df['Obesity_Risk_Category'] = pd.cut(
    genetic_df['Obesity_Risk_Score'],
    bins=[0, 0.5, 0.8, 1],
    labels=['Low', 'Medium', 'High'],
    include_lowest=True
)

# Create BMI category using cut
genetic_df['BMI_Category'] = pd.cut(
    genetic_df['BMI'],
    bins=[0, 18.5, 24.9, 29.9, np.inf],
    labels=['Underweight', 'Normal', 'Overweight', 'Obese']
)

# A value outside the bins becomes NaN, which LabelEncoder would encode as an
# additional class and push the target index past the model's output size.
for target_col in ("Obesity_Risk_Category", "BMI_Category"):
    if genetic_df[target_col].isna().any():
        raise ValueError(f"{target_col} contains values outside the configured bins.")

# Encode categorical non-gene features
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    genetic_df[col] = le.fit_transform(genetic_df[col])
    label_encoders[col] = le

# Encode gene variant columns (simulate gene expression values by encoding variant info)
for col in variant_columns:
    genetic_df[col] = genetic_df[col].astype(str)
    le = LabelEncoder()
    genetic_df[col] = le.fit_transform(genetic_df[col])
    label_encoders[col] = le

# Structured features and gene features are kept as two separate inputs
X_structured = genetic_df[structured_features].copy()
X_gene = genetic_df[gene_feature_cols].copy()

# Encode targets
target_le_risk = LabelEncoder()
y_risk = target_le_risk.fit_transform(genetic_df['Obesity_Risk_Category'])
target_le_bmi = LabelEncoder()
y_bmi = target_le_bmi.fit_transform(genetic_df['BMI_Category'])

#############################
# Train-test split indices
#############################
train_idx, test_idx = train_test_split(np.arange(len(genetic_df)), test_size=0.2, random_state=42)

#############################
# Scale structured features
#############################
# Fit the scaler on the training rows only so that test-set statistics do not
# leak into training; the fitted scaler is then applied to every row, so
# X_structured_scaled still covers the full dataset and is indexed by
# train_idx / test_idx downstream.
scaler = StandardScaler()
scaler.fit(X_structured.iloc[train_idx])
X_structured_scaled = scaler.transform(X_structured)

#############################
# Create a PyTorch Geometric Dataset
#############################

class _GeneGraphData(Data):
    """Per-individual gene graph that also carries structured features and labels.

    ``gene_data`` is a read-only alias for the object itself, so code written
    as ``sample.gene_data.x`` / ``batch.gene_data.edge_index`` keeps working
    even though the graph is stored flat rather than as a nested object.
    NOTE(review): for mini-batches this relies on the batch class inheriting
    from the sample class (PyG >= 2.0) -- confirm against the installed version.
    """

    @property
    def gene_data(self):
        return self


class GeneticDataset(InMemoryDataset):
    """In-memory dataset with one small gene graph per individual.

    Each sample is a single flat graph object with:
      * ``x``           -- [num_genes, 2] node features ([Present, Variant] per gene)
      * ``edge_index``  -- fully connected directed edges without self-loops
      * ``struct_feat`` -- [1, num_structured_features] scaled structured features
      * ``y_risk`` / ``y_bmi`` -- [1] class indices

    The graph used to be nested as a ``Data`` inside another ``Data``. PyG's
    collation only merges tensor attributes of the top-level object, so the
    nested graphs were never combined and no ``batch`` vector was produced.
    ``struct_feat`` is stored with a leading dimension of 1 so that
    concatenation along dim 0 yields ``[batch_size, num_features]``.

    Args:
        structured_data: 2-D array of scaled structured features (all rows).
        gene_data: DataFrame whose columns are ordered as
            ``[gene0_Present, gene0_Variant, gene1_Present, ...]`` (all rows).
        y_risk: array of encoded obesity-risk labels (all rows).
        y_bmi: array of encoded BMI-category labels (all rows).
        indices: row positions that belong to this split.
        transform: optional transform forwarded to ``InMemoryDataset``.
    """

    def __init__(self, structured_data, gene_data, y_risk, y_bmi, indices, transform=None):
        # Keep only the rows of this split; positions stay aligned across all four inputs.
        self.structured_data = structured_data[indices]
        self.gene_data = gene_data.iloc[indices].reset_index(drop=True)
        self.y_risk = y_risk[indices]
        self.y_bmi = y_bmi[indices]
        super(GeneticDataset, self).__init__('.', transform, None, None)
        self.data, self.slices = self.process_data()

    def process_data(self):
        """Build one graph per individual and collate them into (data, slices)."""
        gene_array = self.gene_data.to_numpy(dtype=np.float32)
        # Two feature columns per gene: [Present, Variant]
        num_genes = gene_array.shape[1] // 2

        # The connectivity is identical for every individual: every ordered
        # pair of distinct genes. reshape(-1, 2) keeps a valid [2, 0] shape
        # when there is only a single gene and therefore no edges.
        pairs = [(src, dst)
                 for src in range(num_genes)
                 for dst in range(num_genes)
                 if src != dst]
        edge_index = torch.tensor(pairs, dtype=torch.long).reshape(-1, 2).t().contiguous()

        data_list = []
        for struct_row, gene_row, risk, bmi in zip(
                self.structured_data, gene_array, self.y_risk, self.y_bmi):
            data_list.append(_GeneGraphData(
                x=torch.tensor(gene_row.reshape(-1, 2), dtype=torch.float),
                edge_index=edge_index.clone(),
                # Leading dim of 1 so batching stacks rows instead of flattening them.
                struct_feat=torch.tensor(struct_row, dtype=torch.float).unsqueeze(0),
                y_risk=torch.tensor([int(risk)], dtype=torch.long),
                y_bmi=torch.tensor([int(bmi)], dtype=torch.long),
            ))
        return self.collate(data_list)

    def get(self, idx):
        """Return the sample at position ``idx`` (delegates to ``InMemoryDataset``)."""
        return super(GeneticDataset, self).get(idx)

# Both splits are views onto the same full-length arrays; only the row indices differ.
_dataset_sources = (X_structured_scaled, X_gene, y_risk, y_bmi)
train_dataset = GeneticDataset(*_dataset_sources, train_idx)
test_dataset = GeneticDataset(*_dataset_sources, test_idx)

# Mini-batch loaders: shuffle only the training split.
BATCH_SIZE = 32
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

#############################
# Define the Integrated Model
#############################
class IntegratedModel(nn.Module):
    """Two-branch multi-task classifier over structured features and a gene graph.

    The structured branch is a two-layer MLP. The graph branch is a single
    multi-head GAT layer followed by mean pooling over the nodes of each
    graph. Both outputs are concatenated, passed through two shared linear
    layers, and fed to two linear heads (obesity risk and BMI category).

    Args:
        structured_input_dim: number of structured input features.
        struct_hidden_dim: width of the structured branch.
        gat_in_dim: number of features per gene node.
        gat_hidden_dim: output channels per attention head of the GAT layer.
        gat_out_dim: accepted for interface compatibility; not used by any layer.
        combined_hidden_dim: width of the shared layers after concatenation.
        risk_out_dim: number of obesity-risk classes.
        bmi_out_dim: number of BMI classes.
        gat_heads: number of attention heads (concatenated). Defaults to 2,
            the value that was previously hard-coded.
    """

    def __init__(self, structured_input_dim, struct_hidden_dim,
                 gat_in_dim, gat_hidden_dim, gat_out_dim,
                 combined_hidden_dim, risk_out_dim, bmi_out_dim,
                 gat_heads=2):
        super(IntegratedModel, self).__init__()
        # Structured branch
        self.fc_struct1 = nn.Linear(structured_input_dim, struct_hidden_dim)
        self.fc_struct2 = nn.Linear(struct_hidden_dim, struct_hidden_dim)

        # Graph branch (for gene data): one GAT layer with concatenated heads
        self.gat_conv = GATConv(gat_in_dim, gat_hidden_dim, heads=gat_heads, concat=True)

        # With concat=True the GAT output width is heads * out_channels; derive
        # it from the same head count so the two can never drift apart.
        gene_readout_dim = gat_hidden_dim * gat_heads

        # Combined branch: structured output concatenated with the pooled gene readout
        self.fc_comb1 = nn.Linear(struct_hidden_dim + gene_readout_dim, combined_hidden_dim)
        self.fc_comb2 = nn.Linear(combined_hidden_dim, combined_hidden_dim)

        # Multi-task heads
        self.fc_risk = nn.Linear(combined_hidden_dim, risk_out_dim)
        self.fc_bmi = nn.Linear(combined_hidden_dim, bmi_out_dim)

    def forward(self, struct_feat, gene_data, batch):
        """Compute logits for both tasks.

        Args:
            struct_feat: structured features, one row per graph.
            gene_data: object exposing ``x`` (node features) and ``edge_index``.
            batch: node-to-graph assignment vector used for mean pooling.

        Returns:
            Tuple ``(out_risk, out_bmi)`` of unnormalised class scores.
        """
        # Structured branch
        x_struct = F.relu(self.fc_struct1(struct_feat))
        x_struct = F.relu(self.fc_struct2(x_struct))

        # Graph branch: attention layer, non-linearity, then one vector per graph
        x_gene = self.gat_conv(gene_data.x, gene_data.edge_index)
        x_gene = F.elu(x_gene)
        x_gene = global_mean_pool(x_gene, batch)

        # Fuse both branches
        combined = torch.cat([x_struct, x_gene], dim=1)
        combined = F.relu(self.fc_comb1(combined))
        combined = F.relu(self.fc_comb2(combined))

        # Two task outputs
        out_risk = self.fc_risk(combined)
        out_bmi = self.fc_bmi(combined)
        return out_risk, out_bmi

# Model hyperparameters
structured_input_dim = X_structured_scaled.shape[-1]   # number of structured feature columns
struct_hidden_dim = 32
gat_in_dim = 2        # per gene node: [Present, Variant]
gat_hidden_dim = 8
gat_out_dim = 16      # passed through to the model, which does not use it
combined_hidden_dim = 32
risk_out_dim = 3      # Low / Medium / High
bmi_out_dim = 4       # Underweight / Normal / Overweight / Obese

# Pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = IntegratedModel(
    structured_input_dim=structured_input_dim,
    struct_hidden_dim=struct_hidden_dim,
    gat_in_dim=gat_in_dim,
    gat_hidden_dim=gat_hidden_dim,
    gat_out_dim=gat_out_dim,
    combined_hidden_dim=combined_hidden_dim,
    risk_out_dim=risk_out_dim,
    bmi_out_dim=bmi_out_dim,
).to(device)

#############################
# Training Setup
#############################
# One cross-entropy criterion is shared by both classification heads
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

def train_epoch(model, loader):
    """Run one optimisation pass over ``loader``.

    Uses the module-level ``optimizer``, ``criterion`` and ``device``.

    Returns:
        The summed risk + BMI loss, averaged over ``len(loader.dataset)``.
    """
    model.train()
    running_loss = 0
    for batch in loader:
        optimizer.zero_grad()

        # Move the inputs and both label vectors to the compute device
        features = batch.struct_feat.to(device)
        graph = batch.gene_data.to(device)
        node_to_graph = batch.batch.to(device)  # consumed by the pooling step in the model
        risk_target = batch.y_risk.to(device)
        bmi_target = batch.y_bmi.to(device)

        risk_logits, bmi_logits = model(features, graph, node_to_graph)
        # The two task losses are summed with equal weight
        loss = criterion(risk_logits, risk_target) + criterion(bmi_logits, bmi_target)
        loss.backward()
        optimizer.step()

        # Weight the batch mean by its size so the epoch average is per sample
        running_loss += loss.item() * batch.num_graphs
    return running_loss / len(loader.dataset)

def test_epoch(model, loader):
    """Evaluate ``model`` on ``loader`` without updating any weights.

    Uses the module-level ``criterion`` and ``device``.

    Returns:
        Tuple ``(avg_loss, acc_risk, acc_bmi)``, each normalised by
        ``len(loader.dataset)``.
    """
    model.eval()
    running_loss = 0
    hits_risk = 0
    hits_bmi = 0
    with torch.no_grad():
        for batch in loader:
            features = batch.struct_feat.to(device)
            graph = batch.gene_data.to(device)
            node_to_graph = batch.batch.to(device)
            risk_target = batch.y_risk.to(device)
            bmi_target = batch.y_bmi.to(device)

            risk_logits, bmi_logits = model(features, graph, node_to_graph)
            loss = criterion(risk_logits, risk_target) + criterion(bmi_logits, bmi_target)
            running_loss += loss.item() * batch.num_graphs

            # The predicted class is the index of the largest logit
            hits_risk += (risk_logits.argmax(dim=1) == risk_target).sum().item()
            hits_bmi += (bmi_logits.argmax(dim=1) == bmi_target).sum().item()

    num_samples = len(loader.dataset)
    return running_loss / num_samples, hits_risk / num_samples, hits_bmi / num_samples

#############################
# Training Loop
#############################
num_epochs = 50
for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_loader)
    test_loss, test_acc_risk, test_acc_bmi = test_epoch(model, test_loader)
    # Report every fifth epoch only
    if epoch % 5 != 0:
        continue
    print(f"Epoch {epoch}: Train Loss {train_loss:.4f} | Test Loss {test_loss:.4f} | "
          f"Risk Acc {test_acc_risk:.4f} | BMI Acc {test_acc_bmi:.4f}")

#############################
# Save the Integrated Model and Artifacts
#############################
save_dir = r"C:\Users\trejan\Desktop\GNN\Saved models"
os.makedirs(save_dir, exist_ok=True)

# Model weights only (state dict), not the full module object
torch.save(model.state_dict(), os.path.join(save_dir, "integrated_model.pt"))

# Preprocessing objects needed to reproduce the feature and label encodings
artifacts = {
    "label_encoders.pkl": label_encoders,
    "target_encoders.pkl": {'risk': target_le_risk, 'bmi': target_le_bmi},
    "scaler.pkl": scaler,
}
for artifact_name, artifact in artifacts.items():
    with open(os.path.join(save_dir, artifact_name), "wb") as f:
        pickle.dump(artifact, f)

print("Integrated model and preprocessing artifacts saved successfully.")
