In [179]:
import numpy as np
import json  
import pandas as pd 

#import matplotlib.pyplot as plt
#import seaborn as sns

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
# Custom Collate Function with Padding and Masking
from torch.nn.utils.rnn import pad_sequence

from sklearn.model_selection import train_test_split 
from sklearn.utils import resample
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score, confusion_matrix, roc_curve, precision_recall_curve

In [180]:
# Load your labels
df_labels = pd.read_csv('data.info.txt', delimiter=',')

labels = df_labels['label'].tolist()
labels = list(df_labels['label'])

gene_id = list(df_labels['gene_id'])
transcript_id = list(df_labels['transcript_id'])
transcript_pos = list(df_labels['transcript_position'])

In [181]:
# Load your data
data_list = []
with open('dataset0.json', 'r') as file:
    for line in file:
        try:
            data = json.loads(line) 
            data_list.append(data)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON on line: {line}")
            print(f"Error message: {e}")

In [182]:
# Build bags
bags = [] 
for dat in (data_list):
    for _, pos in dat.items():  
        for _, seq in pos.items():  
            for _, measurements in seq.items(): 
                bag = []
                for read in measurements:
                    instance = np.array(read).reshape(9,) 
                    bag.append(instance)
                if len(bag) == 0:
                    print(bag) 
                bags.append(bag)

In [183]:
print(len(bags))
print(len(bags[0]))
print(len(bags[0][0]))

121838
185
9


In [73]:
df = pd.DataFrame({
    'gene_id': gene_id,
    'transcript_id': transcript_id,
    'transcript_position': transcript_pos,
    'bags': bags,
    'label': labels
})

In [184]:
def sample_with_ratio(group):
    label_1 = group[group['label'] == 1]
    label_0 = group[group['label'] == 0]

    if len(label_0) == 0:
        sample_label_1 = label_1.sample(n=min(len(label_1), 5), random_state=42)
        sample_label_0 = pd.DataFrame()  # No label_0 data
    else:
        max_label_1 = 5 * len(label_0)
        sample_label_1 = label_1.sample(n=min(len(label_1), max_label_1), random_state=42)
        sample_label_0 = label_0.sample(n=min(len(label_0), 2), random_state=42)

    combined_sample = pd.concat([sample_label_1, sample_label_0])
    if len(combined_sample) > 10:
        combined_sample = combined_sample.sample(n=10, random_state=42)
    
    sampled_indices = combined_sample.index
    remaining_data = group.drop(sampled_indices)
    return combined_sample, remaining_data

In [185]:
print("data loaded")
print(len(bags))
print(len(labels))
print(len(df))
labels_series = pd.Series(labels)
print(labels_series.value_counts())

data loaded
121838
121838
121838
0    116363
1      5475
Name: count, dtype: int64


In [186]:
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

Using device: cpu


In [195]:
from torch.nn.utils.rnn import pad_sequence

def mil_collate_fn(batch):
    bags = [item[0] for item in batch]  # List of tensors
    labels = torch.tensor([item[1] for item in batch], dtype=torch.float32)
    
    # Pad the sequences
    padded_bags = pad_sequence(bags, batch_first=True)  # Shape: [batch_size, max_length, input_dim]
    
    # Create masks
    lengths = torch.tensor([bag.size(0) for bag in bags])
    max_length = padded_bags.size(1)
    masks = torch.arange(max_length).expand(len(bags), max_length) < lengths.unsqueeze(1)
    masks = masks.float()
    
    return padded_bags, masks


In [196]:
# Data Split
bags_train, bags_temp, labels_train, labels_temp = train_test_split(
    bags, labels, test_size=0.4, random_state=42)
bags_val, bags_test, labels_val, labels_test = train_test_split(
    bags_temp, labels_temp, test_size=0.5, random_state=42)

bags_train_pos = []
labels_train_pos = []
bags_train_neg = []
labels_train_neg = []

for bag, label in zip(bags_train, labels_train):
    if label == 1:
        bags_train_pos.append(bag)
        labels_train_pos.append(label)
    else:
        bags_train_neg.append(bag)
        labels_train_neg.append(label)

num_pos = len(labels_train_pos)
num_neg = len(labels_train_neg)
print(f"Number of positive samples in training data: {num_pos}")
print(f"Number of negative samples in training data: {num_neg}")

Number of positive samples in training data: 3300
Number of negative samples in training data: 69802


In [197]:
desired_num_pos = 15000 
desired_num_neg = 40000

# Oversample positive samples
bags_train_pos_resampled, labels_train_pos_resampled = resample(
    bags_train_pos,
    labels_train_pos,
    replace=True,  # Sample with replacement
    n_samples=desired_num_pos,
    random_state=42
)
# Undersample neg examples
bags_train_neg_resampled, labels_train_neg_resampled = resample(
    bags_train_neg,
    labels_train_neg,
    replace=False,  # Sample without replacement
    n_samples=desired_num_neg,
    random_state=42
)

bags_train_resampled = bags_train_pos_resampled + bags_train_neg_resampled
labels_train_resampled = labels_train_pos_resampled + labels_train_neg_resampled


In [198]:
print(len(bags_train_pos_resampled))
print(len(bags_train_neg_resampled))

15000
40000


In [199]:
# DataLoaders with multiple workers
from dataloader import MILDataset

batch_size = 1 # because we want the output tensor to only process one bag at a time 
train_dataset = MILDataset(bags_train, labels_train)
val_dataset = MILDataset(bags_val, labels_val)
test_dataset = MILDataset(bags_test, labels_test)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                          collate_fn=mil_collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False,
                        collate_fn=mil_collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False,
                         collate_fn=mil_collate_fn)

In [201]:
print(next(iter(test_loader))[0].shape)

torch.Size([1, 50, 9])


In [82]:
from model import MILModel

In [202]:
# Compute class weights
from sklearn.utils.class_weight import compute_class_weight
labels_train_array = np.array(labels_train)
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(labels_train_array), y=labels_train_array)
class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)
pos_weight = class_weights[1] / class_weights[0]

In [203]:
# Define loss function and move model to device
class FocalLoss(nn.Module):
    def __init__(self, alpha, gamma):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        BCE_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
        pt = torch.exp(-BCE_loss)
        F_loss = self.alpha * (1 - pt) ** self.gamma * BCE_loss
        return F_loss.mean()

In [24]:
class WeightedFocalLoss(nn.Module):
    "Non weighted version of Focal Loss"
    def __init__(self, alpha=.25, gamma=2):
        super(WeightedFocalLoss, self).__init__()
        self.alpha = torch.tensor([alpha, 1-alpha])
        self.gamma = gamma

    def forward(self, inputs, targets):
        BCE_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
        targets = targets.type(torch.long)
        at = self.alpha.gather(0, targets.data.view(-1))
        pt = torch.exp(-BCE_loss)
        F_loss = at*(1-pt)**self.gamma * BCE_loss
        return F_loss.mean()

In [204]:
import torch.optim as optim

# Initialize Model, Loss, Optimizer
input_dim = 9
hidden_dim = 128 # og was 64
learning_rate = 0.001

num_epochs = 1
threshold = 0.5

weight_decay = 10e-5
alpha = 0.25
gamma = 2

#criterion = FocalLoss(alpha=alpha, gamma=gamma)
criterion = FocalLoss(alpha=alpha, gamma=gamma)
# criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([pos_weight]).to(device))
model = MILModel(input_dim, hidden_dim).to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate, betas=(0.9, 0.999), weight_decay=weight_decay)

In [87]:
# training 
for epoch in range(num_epochs):
    model.train()
    train_losses = []

    for bags, labels, masks in train_loader:
        bags, labels, masks = bags.to(device), labels.to(device), masks.to(device)  # Move to device
        # reset gradients
        optimizer.zero_grad()
        outputs = model(bags, masks)
        #print("BAGS", bags)
        #print("OUTPUTS", outputs)
        # calculate loss and metrics
        loss = criterion(outputs, labels.float())
        # backward pass
        loss.backward()
        # step
        optimizer.step()
        train_losses.append(loss.item())
    
    # Validation
    model.eval()
    with torch.no_grad():
        all_labels = []
        all_outputs = []
        val_losses = []
        for bags, labels, masks in val_loader:
            bags, labels, masks = bags.to(device), labels.to(device), masks.to(device) 
            outputs = model(bags, masks)
            outputs = torch.sigmoid(outputs)

            loss = criterion(outputs, labels.float())
            val_losses.append(loss.item())

            all_labels.extend(labels.cpu().numpy())
            all_outputs.extend(outputs.cpu().numpy())
        
        print(all_labels)
        print(all_outputs)

        roc_auc = roc_auc_score(all_labels, all_outputs)
        pr_auc = average_precision_score(all_labels, all_outputs)
        acc = accuracy_score(all_labels, (np.array(all_outputs) > threshold).astype(int))
        print(f"Epoch {epoch+1}/{num_epochs}, "
              f"Train Loss: {np.mean(train_losses):.4f}, "
              f"Val Loss: {np.mean(val_losses):.4f}, "
              f"Val ROC-AUC: {roc_auc:.4f}, "
              f"Val PR-AUC: {pr_auc:.4f}, "
              f"Val Accuracy: {acc:.4f}")

torch.save(model.state_dict(), 'model_weights_yune.pth')


[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0,

In [88]:
# yune's trained weights

model.load_state_dict(torch.load('model_weights_yune.pth', map_location=torch.device('cpu')))

# Test Evaluation
model.eval()
with torch.no_grad():
    all_labels = []
    all_outputs = []
    for bags, labels, masks in test_loader:
        bags, labels, masks = bags.to(device), labels.to(device), masks.to(device) 
        outputs = model(bags, masks)
        outputs = torch.sigmoid(outputs)
        
        all_labels.extend(labels.cpu().numpy())
        all_outputs.extend(outputs.cpu().numpy())

    roc_auc = roc_auc_score(all_labels, all_outputs)
    pr_auc = average_precision_score(all_labels, all_outputs)
    acc = accuracy_score(all_labels, (np.array(all_outputs) > threshold).astype(int))
    cm = confusion_matrix(all_labels, (np.array(all_outputs) > threshold).astype(int))
    print(f"\nTest Set Results:\n"
          f"ROC-AUC: {roc_auc:.4f}\n"
          f"PR-AUC: {pr_auc:.4f}\n"
          f"Accuracy: {acc:.4f}\n"
          f"Confusion Matrix:\n{cm}")


Test Set Results:
ROC-AUC: 0.5633
PR-AUC: 0.0478
Accuracy: 0.9547
Confusion Matrix:
[[23264     0]
 [ 1104     0]]


In [206]:
# yucai's trained weights

model.load_state_dict(torch.load('model_weights.pth', map_location=torch.device('cpu')))

# Test Evaluation
model.eval()
with torch.no_grad():
    all_labels = []
    all_outputs = []
    for bags_batch, masks_batch in test_loader:
        bags_batch = bags_batch.to(device)
        #print(bags_batch.shape)
        #labels_batch = labels_batch.to(device)
        masks_batch = masks_batch.to(device)
        outputs = model(bags_batch, masks_batch)
        outputs = torch.sigmoid(outputs)
        #all_labels.extend(labels_batch.cpu().numpy())
        all_outputs.extend(outputs.cpu().numpy())

    #roc_auc = roc_auc_score(all_labels, all_outputs)
    #pr_auc = average_precision_score(all_labels, all_outputs)
    #acc = accuracy_score(all_labels, (np.array(all_outputs) > threshold).astype(int))
    #cm = confusion_matrix(all_labels, (np.array(all_outputs) > threshold).astype(int))
    #print(f"\nTest Set Results:\n"
    #      f"ROC-AUC: {roc_auc:.4f}\n"
    #      f"PR-AUC: {pr_auc:.4f}\n"
    #      f"Accuracy: {acc:.4f}\n"
    #      f"Confusion Matrix:\n{cm}")

In [207]:
print(all_outputs[:10])

[0.22935867, 0.364185, 0.1765395, 0.5554328, 0.47340724, 0.29211795, 0.24166879, 0.3989616, 0.14482383, 0.1925824]


DATASET1.json

In [35]:
# Load your data
import json

data_list = []
with open('dataset2.json', 'r') as file:
    for line in file:
        try:
            data = json.loads(line) 
            data_list.append(data)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON on line: {line}")
            print(f"Error message: {e}")

In [36]:
# Build bags
import numpy as np 

transcript_ids = []
transcript_positions = []
bags = [] 

for dat in (data_list):
    for transcript_id, pos in dat.items(): 
        transcript_ids.append(transcript_id) 
        for transcript_pos, seq in pos.items():
            transcript_positions.append(transcript_pos)
            for _, measurements in seq.items(): 
                bag = []
                for read in measurements:
                    instance = np.array(read).reshape(9,) 
                    bag.append(instance)
                if len(bag) == 0:
                    print(bag) 
                bags.append(bag)

In [37]:
print(len(transcript_ids))
print(len(bags))
print(len(bags[0]))
print(len(bags[0][0]))

1323
1323
885
9


In [38]:
from torch.nn.utils.rnn import pad_sequence

def mil_collate_fn(batch):
    bags = [item for item in batch]  # List of tensors
    #labels = torch.tensor([item[1] for item in batch], dtype=torch.float32)
    
    # Pad the sequences
    padded_bags = pad_sequence(bags, batch_first=True)  # Shape: [batch_size, max_length, input_dim]
    
    # Create masks
    lengths = torch.tensor([bag.size(0) for bag in bags])
    max_length = padded_bags.size(1)
    masks = torch.arange(max_length).expand(len(bags), max_length) < lengths.unsqueeze(1)
    masks = masks.float()
    
    return padded_bags, masks


In [39]:
from dataloader import MILDataset
from torch.utils.data import DataLoader

batchsize = 1

new_test_dataset = MILDataset(bags, None)

new_test_loader = DataLoader(new_test_dataset, batch_size=1, shuffle=False,
                             collate_fn=mil_collate_fn)

In [40]:
import torch 

print(next(iter(new_test_loader))[0].shape)

torch.Size([1, 885, 9])


In [41]:
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

Using device: cpu


In [42]:
import torch.nn as nn

class FocalLoss(nn.Module):
    def __init__(self, alpha, gamma):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        BCE_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
        pt = torch.exp(-BCE_loss)
        F_loss = self.alpha * (1 - pt) ** self.gamma * BCE_loss
        return F_loss.mean()

In [43]:
import torch.optim as optim
from model import MILModel

# Initialize Model, Loss, Optimizer
input_dim = 9
hidden_dim = 128 # og was 64
learning_rate = 0.001

num_epochs = 1
threshold = 0.5

weight_decay = 10e-5
alpha = 0.25
gamma = 2

#criterion = FocalLoss(alpha=alpha, gamma=gamma)
criterion = FocalLoss(alpha=alpha, gamma=gamma)
# criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([pos_weight]).to(device))
model = MILModel(input_dim, hidden_dim).to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate, betas=(0.9, 0.999), weight_decay=weight_decay)

In [46]:
model.load_state_dict(torch.load('model_weights.pth', map_location=torch.device('cpu')))

# Test Evaluation
model.eval()
with torch.no_grad():
    #all_labels = []
    all_outputs = []
    for bags_batch, masks_batch in new_test_loader:
        bags_batch = bags_batch.to(device)
        masks_batch = masks_batch.to(device)

        outputs = model(bags_batch, masks_batch)
        outputs = torch.sigmoid(outputs)
        
        #all_labels.extend(labels_batch.cpu().numpy())
        all_outputs.extend(outputs.cpu().numpy())

In [47]:
import pandas as pd 

df_dataset = pd.DataFrame({
    'transcript_id': transcript_ids,
    'transcript_position': transcript_positions,
    'score': all_outputs,
})

In [176]:
all_outputs[10:20]

[0.26891035,
 0.21195394,
 0.512911,
 0.20363413,
 0.12462013,
 0.24722315,
 0.58704615,
 0.49325323,
 0.45112586,
 0.47347116]

In [48]:
df_dataset.head()

Unnamed: 0,transcript_id,transcript_position,score
0,tx_id_0,0,0.296838
1,tx_id_0,10,0.626811
2,tx_id_0,20,0.712879
3,tx_id_0,30,0.607105
4,tx_id_0,40,0.611411


In [49]:
filename = 'dataset2_nn.csv'

In [50]:
df_dataset.to_csv(filename, index=False)

Check csv score

In [55]:
import pandas as pd

# Load the CSV file into a DataFrame
filename = 'dataset0_nn.csv'  # Replace with your actual file path
df = pd.read_csv(filename)

print(len(df['score']))

# Calculate the average of the 'score' column
average_score = df['score'].mean()

# Print the average score
print(f"The average score is: {average_score}")


121838
The average score is: 0.29065197365745704


In [65]:
import pandas as pd

# Load the CSV file into a DataFrame
#filename = "../Team Project/intermediate_submissions/output_xgb.csv"  # Replace with your actual file path


filename = "../Team Project/intermediate_submissions/transcript_scores_dataset2_xgb.csv"
df = pd.read_csv(filename)

len(df)

1323

In [56]:
import pandas as pd

# Load the CSV file into a DataFrame
filename = 'dataset1_nn.csv'  # Replace with your actual file path
df = pd.read_csv(filename)

print(len(df['score']))

# Calculate the average of the 'score' column
average_score = df['score'].mean()

# Print the average score
print(f"The average score is: {average_score}")

90810
The average score is: 0.30473912906053713


In [64]:
import pandas as pd

# Load the CSV file into a DataFrame
filename = "../Team Project/intermediate_submissions/dataset2_nn.csv"  # Replace with your actual file path
df = pd.read_csv(filename)

print(len(df['score']))

# Calculate the average of the 'score' column
average_score = df['score'].mean()

# Print the average score
print(f"The average score is: {average_score}")

1323
The average score is: 0.3739245165349584
