<a href="https://colab.research.google.com/github/taral0101/DeepGenePredictor/blob/main/GeneExpress.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Importing Libraries**

In [1]:
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler

**Loading and Displaying the Dataset**

In [3]:
file_path = '/content/GSE234080_H1299_kdEIF3G_TPM0.xlsx'
data = pd.read_excel(file_path)

# Display columns to understand the structure of the dataset
print(data.columns)


Index(['gene_symbol', 'tpm_HC5_1', 'tpm_HC5_2', 'tpm_HC5_3', 'tpm_HC5_input_1',
       'tpm_HC5_input_2', 'tpm_HC5_input_3', 'tpm_HNC_1', 'tpm_HNC_2',
       'tpm_HNC_3', 'tpm_HNC_input_1', 'tpm_HNC_input_2', 'tpm_HNC_input_3'],
      dtype='object')


**Preprocessing the Data**

In [4]:
# Assume the last column is the target variable for gene expression
target_column = data.columns[-1]

# Separate features and target, EXCLUDING the first column (gene names)
X = data.iloc[:, 1:].drop(columns=[target_column]).values  # Start from the second column
y = data[target_column].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Normalize the target variable
y_mean = y_train.mean()
y_std = y_train.std()
y_train = (y_train - y_mean) / y_std
y_test = (y_test - y_mean) / y_std

# Convert the data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)


**Creating DataLoaders**

In [5]:
# Create TensorDatasets
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


**Defining the Linear Regression Model**

In [6]:
class LinearRegressionModel(nn.Module):
    def __init__(self, input_dim):
        super(LinearRegressionModel, self).__init__()
        self.linear = nn.Linear(input_dim, 1)

    def forward(self, x):
        return self.linear(x)

input_dim = X_train.shape[1]
model = LinearRegressionModel(input_dim)


**Setting Loss Function and Optimizer**

In [7]:
# Define the loss function and optimizer with weight decay for regularization
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.01)


**Training the Model**

In [8]:
# Training loop
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    for X_batch, y_batch in train_loader:
        # Forward pass
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Print loss every 10 epochs
    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


Epoch [10/100], Loss: 0.0003
Epoch [20/100], Loss: 0.0002
Epoch [30/100], Loss: 0.0013
Epoch [40/100], Loss: 0.0006
Epoch [50/100], Loss: 0.1306
Epoch [60/100], Loss: 0.0002
Epoch [70/100], Loss: 0.0005
Epoch [80/100], Loss: 0.0008
Epoch [90/100], Loss: 0.0062
Epoch [100/100], Loss: 0.0002


**Evaluating the Model**

In [9]:
# Evaluate the model on the test set
model.eval()
with torch.no_grad():
    y_pred = model(X_test_tensor)
    test_loss = criterion(y_pred, y_test_tensor)
    print(f'Test Loss: {test_loss.item():.4f}')


Test Loss: 0.0064


**Implementing K-Fold Cross-Validation**

In [10]:
# K-Fold Cross-Validation
kf = KFold(n_splits=5)
fold_results = []

for fold, (train_index, val_index) in enumerate(kf.split(X)):
    print(f'Fold {fold+1}/{kf.get_n_splits()}')

    X_train_fold, X_val_fold = X[train_index], X[val_index]
    y_train_fold, y_val_fold = y[train_index], y[val_index]

    # Standardize the features
    scaler = StandardScaler()
    X_train_fold = scaler.fit_transform(X_train_fold)
    X_val_fold = scaler.transform(X_val_fold)

    # Normalize the target variable
    y_mean = y_train_fold.mean()
    y_std = y_train_fold.std()
    y_train_fold = (y_train_fold - y_mean) / y_std
    y_val_fold = (y_val_fold - y_mean) / y_std

    # Convert to tensors
    X_train_tensor = torch.tensor(X_train_fold, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train_fold, dtype=torch.float32).view(-1, 1)
    X_val_tensor = torch.tensor(X_val_fold, dtype=torch.float32)
    y_val_tensor = torch.tensor(y_val_fold, dtype=torch.float32).view(-1, 1)

    # Create TensorDatasets and DataLoaders
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

    # Define the model
    model = LinearRegressionModel(input_dim)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.01)

    # Training loop for the fold
    for epoch in range(num_epochs):
        model.train()
        for X_batch, y_batch in train_loader:
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Validation
        model.eval()
        with torch.no_grad():
            y_val_pred = model(X_val_tensor)
            val_loss = criterion(y_val_pred, y_val_tensor)

        # Print loss every 10 epochs
        if (epoch+1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {loss.item():.4f}, Val Loss: {val_loss.item():.4f}')

    fold_results.append(val_loss.item())

# Average validation loss across folds
average_val_loss = sum(fold_results) / len(fold_results)
print(f'Average Validation Loss: {average_val_loss:.4f}')


Fold 1/5
Epoch [10/100], Train Loss: 0.0017, Val Loss: 0.0152
Epoch [20/100], Train Loss: 0.0010, Val Loss: 0.0125
Epoch [30/100], Train Loss: 0.0033, Val Loss: 0.0081
Epoch [40/100], Train Loss: 0.0001, Val Loss: 0.0093
Epoch [50/100], Train Loss: 0.0002, Val Loss: 0.0061
Epoch [60/100], Train Loss: 0.0011, Val Loss: 0.0120
Epoch [70/100], Train Loss: 0.0003, Val Loss: 0.0052
Epoch [80/100], Train Loss: 0.0004, Val Loss: 0.0075
Epoch [90/100], Train Loss: 0.0001, Val Loss: 0.0078
Epoch [100/100], Train Loss: 0.0002, Val Loss: 0.0056
Fold 2/5
Epoch [10/100], Train Loss: 0.0005, Val Loss: 0.0397
Epoch [20/100], Train Loss: 0.0002, Val Loss: 0.0291
Epoch [30/100], Train Loss: 0.0019, Val Loss: 0.0200
Epoch [40/100], Train Loss: 0.0022, Val Loss: 0.0180
Epoch [50/100], Train Loss: 0.0003, Val Loss: 0.0173
Epoch [60/100], Train Loss: 0.0027, Val Loss: 0.0249
Epoch [70/100], Train Loss: 0.0001, Val Loss: 0.0151
Epoch [80/100], Train Loss: 0.0007, Val Loss: 0.0153
Epoch [90/100], Train Loss:

**Final Evaluation on Test Set**

In [11]:
# Final evaluation on the test set after K-Fold cross-validation
model.eval()
with torch.no_grad():
    y_pred = model(X_test_tensor)
    test_loss = criterion(y_pred, y_test_tensor)
    print(f'Final Test Loss: {test_loss.item():.4f}')


Final Test Loss: 0.0096
