## Microarray Analysis: Machine Learning Analysis
* Date: April 27, 2022 
* Author: Zeal Jinwala (zsj24)
* Description: In this study, you will analyze a Breast Cancer dataset, GSE7390, and identify a gene signature for prediction of Breast Cancer relapse. 

Import packages

In [None]:

import GEOparse
import pandas as pd
import numpy as np

Download and parse data

In [None]:
# Download the GSE7390 series (SOFT family file) into the current directory and
# parse it with GEOparse.
# Probe names are used as-is (no translation to gene IDs), so no separate GPL
# platform file needs to be downloaded for this analysis.
GEO_ACCESSION = "GSE7390"
gse = GEOparse.get_GEO(geo=GEO_ACCESSION, destdir=".")

10-May-2022 09:15:18 DEBUG utils - Directory . already exists. Skipping.
10-May-2022 09:15:18 INFO GEOparse - Downloading ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE7nnn/GSE7390/soft/GSE7390_family.soft.gz to ./GSE7390_family.soft.gz
100%|██████████| 42.7M/42.7M [00:00<00:00, 46.5MB/s]
10-May-2022 09:15:19 DEBUG downloader - Size validation passed
10-May-2022 09:15:19 DEBUG downloader - Moving /tmp/tmpibvn8_q6 to /content/GSE7390_family.soft.gz
10-May-2022 09:15:19 DEBUG downloader - Successfully downloaded ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE7nnn/GSE7390/soft/GSE7390_family.soft.gz
10-May-2022 09:15:19 INFO GEOparse - Parsing ./GSE7390_family.soft.gz: 
10-May-2022 09:15:19 DEBUG GEOparse - DATABASE: GeoMiame
10-May-2022 09:15:19 DEBUG GEOparse - SERIES: GSE7390
10-May-2022 09:15:19 DEBUG GEOparse - PLATFORM: GPL96
10-May-2022 09:15:21 DEBUG GEOparse - SAMPLE: GSM177885
10-May-2022 09:15:21 DEBUG GEOparse - SAMPLE: GSM177886
10-May-2022 09:15:21 DEBUG GEOparse - SAMPLE: GSM177887
10

Get target and expression values

In [None]:
# generate pandas dataframes

# Relapse status label for every sample, read from the series phenotype table.
cancerRelapseStatus = gse.phenotype_data["characteristics_ch1.14.e.rfs"]

# One table per sample, with its VALUE column renamed to the sample's key in
# gse.gsms so the expression columns stay distinguishable after concatenation.
# (The previous wrapping in pd.concat([pd.DataFrame(), table]) was a no-op.)
sample_tables = [
    gsm.table.rename(columns={"VALUE": sample_name})
    for sample_name, gsm in gse.gsms.items()
]
gsedata = pd.concat(sample_tables, axis=1)

# Columns that every sample table shares now appear once per sample; keep only
# the first copy of each column name.
gsedata = gsedata.loc[:, ~gsedata.columns.duplicated()]

In [None]:
# generate numpy arrays

# Select the expression columns by sample name (the VALUE columns were renamed
# to the keys of gse.gsms) instead of slicing off "the first row" positionally,
# and cast to float so the matrix is numeric rather than object-typed.
sample_names = list(gse.gsms.keys())
dataset_np = gsedata.loc[:, sample_names].to_numpy(dtype=np.float64).T  # rows = samples
labels_np = cancerRelapseStatus.to_numpy(dtype=np.int32)

# Every sample needs exactly one label; fail here rather than deep inside training.
assert dataset_np.shape[0] == labels_np.shape[0], (
    f"{dataset_np.shape[0]} samples but {labels_np.shape[0]} labels"
)

In [None]:
# standardize the features
from sklearn.preprocessing import StandardScaler

# Fit the scaler and apply it to the sample matrix in a single step; `scaler`
# stays available (fitted) for later use.
# NOTE(review): the scaling statistics are estimated from every row of
# dataset_np; if a hold-out split is taken from the result, consider fitting on
# the training rows only.
scaler = StandardScaler()
standardized_dataset_np = scaler.fit_transform(dataset_np)

In [None]:
# train test split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the raw (unscaled) matrix first. The same test_size/random_state as
# before are used, so the train/test partition of the samples is unchanged.
X_train_raw, X_test_raw, Y_train, Y_test = train_test_split(
    dataset_np, labels_np, test_size=0.33, random_state=42
)

# Fit the scaler on the training rows only and reuse those statistics for the
# test rows, so no information from the held-out samples leaks into training.
split_scaler = StandardScaler().fit(X_train_raw)
X_train = split_scaler.transform(X_train_raw)
X_test = split_scaler.transform(X_test_raw)

In [None]:
# defining PyTorch datasets and dataloaders
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

class TrainData(Dataset):
  """Dataset pairing each feature row with its label.

  X_data and y_data are stored as given and indexed in parallel.
  """

  def __init__(self, X_data, y_data):
    self.X_data, self.y_data = X_data, y_data

  def __len__(self):
    """Number of samples, taken from the feature container."""
    n_samples = len(self.X_data)
    return n_samples

  def __getitem__(self, index):
    """Return the (features, label) pair stored at `index`."""
    features = self.X_data[index]
    label = self.y_data[index]
    return features, label

class TestData(Dataset):
  """Label-free dataset that serves feature rows only."""

  def __init__(self, X_data):
    self.X_data = X_data

  def __len__(self):
    """Number of samples held."""
    n_samples = len(self.X_data)
    return n_samples

  def __getitem__(self, index):
    """Return the feature row stored at `index`."""
    features = self.X_data[index]
    return features

# Convert the splits to float tensors and wrap them in the dataset classes.
train_features = torch.FloatTensor(X_train)
train_targets = torch.FloatTensor(Y_train)
test_features = torch.FloatTensor(X_test)

train_data = TrainData(train_features, train_targets)
test_data = TestData(test_features)

# Training batches are shuffled; test samples are served one at a time, in order.
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=1)


Define the MLP classifier (no 76-gene filter is applied in the cell below; the network takes all 22,283 input features)

In [None]:
import torch.nn as nn
from typing import Any

class MLP(nn.Module):
    """Fully connected binary classifier that outputs one raw logit per sample.

    Six hidden Linear+ReLU layers are followed by a single-unit output layer.
    Batch normalisation is applied after the 3rd, 5th and 6th hidden layers and
    dropout just before the output layer. No sigmoid is applied in `forward`.

    Args:
        in_features: number of input features per sample (default 22283).
        hidden_sizes: widths of the six hidden layers, in order.
        dropout: drop probability used before the output layer.

    Raises:
        ValueError: if `hidden_sizes` does not contain exactly six entries.
    """

    def __init__(self, in_features=22283,
                 hidden_sizes=(8192, 4096, 2048, 1024, 512, 256),
                 dropout=0.1):
        super(MLP, self).__init__()
        if len(hidden_sizes) != 6:
            raise ValueError(
                f"hidden_sizes must have exactly 6 entries, got {len(hidden_sizes)}"
            )
        h1, h2, h3, h4, h5, h6 = hidden_sizes

        self.layer_1 = nn.Linear(in_features, h1)
        self.layer_2 = nn.Linear(h1, h2)
        self.layer_3 = nn.Linear(h2, h3)
        self.layer_4 = nn.Linear(h3, h4)
        self.layer_5 = nn.Linear(h4, h5)
        self.layer_6 = nn.Linear(h5, h6)
        self.layer_out = nn.Linear(h6, 1)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout)
        # Each batch-norm width must match the layer whose output it normalises.
        self.batchnorm1 = nn.BatchNorm1d(h3)
        self.batchnorm2 = nn.BatchNorm1d(h5)
        self.batchnorm3 = nn.BatchNorm1d(h6)

    def forward(self, inputs):
        """Map a (batch, in_features) tensor to (batch, 1) logits."""
        x = self.relu(self.layer_1(inputs))
        x = self.relu(self.layer_2(x))
        x = self.relu(self.layer_3(x))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_4(x))
        x = self.relu(self.layer_5(x))
        x = self.batchnorm2(x)
        x = self.relu(self.layer_6(x))
        x = self.batchnorm3(x)
        x = self.dropout(x)
        x = self.layer_out(x)
        return x

In [None]:
# training setup
from torch import optim

# Seed PyTorch's global random generator before the model is created so that
# weight initialisation (and later draws from the same generator) are
# repeatable from one run to the next.
torch.manual_seed(42)

# Use the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = MLP()
model.to(device)

# The model emits raw logits, so the loss applies the sigmoid internally.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
# accuracy calculation
def binary_acc(y_pred, y_test):
  """Percentage of correct binary predictions, rounded to a whole number.

  `y_pred` holds raw logits; they are passed through a sigmoid and rounded to
  0/1 before being compared element-wise with `y_test`. The number of matches
  is divided by the size of the first dimension of `y_test`. Returns a 0-dim
  tensor.
  """
  predicted_labels = torch.sigmoid(y_pred).round()
  n_correct = predicted_labels.eq(y_test).sum().float()
  fraction_correct = n_correct / y_test.size(0)
  return (fraction_correct * 100).round()

In [None]:
# training
model.train()
for e in range(1, 101):    # 100 epochs
  # Running sums weighted by batch size, so a short final batch does not count
  # as much as a full one in the epoch averages.
  epoch_loss = 0.0
  epoch_acc = 0.0
  samples_seen = 0
  for X_batch, y_batch in train_loader:
    X_batch = X_batch.to(device)
    # Add a trailing dimension so the targets line up with the (batch, 1) logits.
    y_target = y_batch.to(device).unsqueeze(1)
    optimizer.zero_grad()

    y_pred = model(X_batch)

    loss = criterion(y_pred, y_target)
    acc = binary_acc(y_pred, y_target)

    loss.backward()
    optimizer.step()

    batch_size = X_batch.size(0)
    epoch_loss += loss.item() * batch_size
    epoch_acc += acc.item() * batch_size
    samples_seen += batch_size

  # Guard against an empty loader so the report never divides by zero.
  denom = max(samples_seen, 1)
  print(f'Epoch {e+0:03}: | Loss: {epoch_loss/denom:.5f} | Acc: {epoch_acc/denom:.3f}')

Epoch 001: | Loss: 0.79250 | Acc: 48.333
Epoch 002: | Loss: 0.80951 | Acc: 53.333
Epoch 003: | Loss: 0.67062 | Acc: 57.000
Epoch 004: | Loss: 0.73465 | Acc: 52.333
Epoch 005: | Loss: 0.65287 | Acc: 68.667
Epoch 006: | Loss: 0.58932 | Acc: 77.000
Epoch 007: | Loss: 0.63102 | Acc: 61.000
Epoch 008: | Loss: 0.60974 | Acc: 70.333
Epoch 009: | Loss: 0.64668 | Acc: 62.000
Epoch 010: | Loss: 0.65211 | Acc: 64.000
Epoch 011: | Loss: 0.58882 | Acc: 74.667
Epoch 012: | Loss: 0.64138 | Acc: 64.333
Epoch 013: | Loss: 0.52440 | Acc: 64.000
Epoch 014: | Loss: 0.64536 | Acc: 67.000
Epoch 015: | Loss: 0.54837 | Acc: 73.000
Epoch 016: | Loss: 0.62669 | Acc: 58.333
Epoch 017: | Loss: 0.64484 | Acc: 60.000
Epoch 018: | Loss: 0.78008 | Acc: 59.333
Epoch 019: | Loss: 0.63384 | Acc: 68.333
Epoch 020: | Loss: 0.54211 | Acc: 73.333
Epoch 021: | Loss: 0.52320 | Acc: 77.667
Epoch 022: | Loss: 0.54029 | Acc: 75.333
Epoch 023: | Loss: 0.66288 | Acc: 59.000
Epoch 024: | Loss: 0.51200 | Acc: 79.000
Epoch 025: | Los

In [None]:
# testing/validation setup

# Collect hard 0/1 predictions for the test set. eval() switches dropout and
# batch norm to inference behaviour; no_grad() skips gradient tracking.
batch_predictions = []
model.eval()
with torch.no_grad():
  for X_batch in test_loader:
    X_batch = X_batch.to(device)
    y_test_pred = model(X_batch)
    y_test_pred = torch.sigmoid(y_test_pred)
    y_pred_tag = torch.round(y_test_pred)
    batch_predictions.append(y_pred_tag.cpu())

# Concatenate along the batch dimension and flatten, so the result is a flat
# list with one prediction per test sample whatever batch size the loader uses.
y_pred_list = torch.cat(batch_predictions).flatten().tolist() if batch_predictions else []

In [None]:
# confusion matrix
from sklearn.metrics import confusion_matrix

# Flatten the matrix and unpack its four cells in the order TN, FP, FN, TP.
cm_counts = confusion_matrix(Y_test, y_pred_list).ravel()
tn, fp, fn, tp = cm_counts

for cell_label, cell_count in (
    ("True Negatives:", tn),
    ("False Positives:", fp),
    ("False Negatives:", fn),
    ("True Positives:", tp),
):
  print(cell_label, cell_count)

True Negatives: 22
False Positives: 11
False Negatives: 19
True Positives: 14


In [None]:
# classification report
from sklearn.metrics import classification_report

# Build the per-class precision/recall/F1 summary as text, then show it.
report_text = classification_report(Y_test, y_pred_list)
print(report_text)

              precision    recall  f1-score   support

           0       0.54      0.67      0.59        33
           1       0.56      0.42      0.48        33

    accuracy                           0.55        66
   macro avg       0.55      0.55      0.54        66
weighted avg       0.55      0.55      0.54        66

