This script trains and saves the best model we can get, using all of the available data.
Since there is no need to evaluate the final model's performance here,
the dataset is split into training and validation data only; no test set is held out.

In [1]:
import random
import torch
import torch.nn as nn

import utils
import modeling_utils
from importlib import reload

In [2]:
# Local working directory that holds the truth-set tables.
local_folder = "./data/"
truthset_positive = f"{local_folder}/truthset_positives.240227.tsv"
truthset_negative = f"{local_folder}/truthset_negatives.240227.tsv"
# Score columns requested from each truth-set table.
label_columns = ["score.(iPSC)", "score.(MB1)", "score.(Gaertner)"]

# The same dict is handed to read_labels for both tables (presumably filled in
# place): negatives first, then positives.
labels = {}
for truthset_path in (truthset_negative, truthset_positive):
    utils.read_labels(data_path=truthset_path, labels=labels, label_columns=label_columns)

('chr4', '164877331', '164879008', '+', '164877331-164877407|164878919-164879008')


In [3]:
group_names = ["iPSC", "Gaertner", "MB1"]

# Each "<group>.txt" file lists one experiment name per line.
experiment_groups = {}
for group_name in group_names:
    # Use a context manager so the file handle is closed as soon as it is read
    # (a bare open() in the loop header leaves it open until garbage collection).
    with open(f"{local_folder}/{group_name}.txt") as group_file:
        experiment_groups[group_name] = [line.strip() for line in group_file]

# For every experiment: fetch its feature CSV, read it into the group's
# feature list, then delete the local copy again.
features = {}
for group_name, experiment_name_list in experiment_groups.items():
    features[group_name] = []
    for experiment_name in experiment_name_list:
        utils.download_feature_set(
            experiment_name=experiment_name,
            local_folder=local_folder)
        utils.read_features(
            data_path=f"{local_folder}/{experiment_name}_orf_features.csv",
            features=features[group_name],
            labels=labels,
            label_name=f"score.({group_name})")
        utils.remove_local_copy(experiment_name, local_folder)

download: s3://velia-piperuns-dev/VPR_orfcalling_20240302075301_iPSC-rep1_SRR9113064/output/VPR_orfcalling_20240302075301_iPSC-rep1_SRR9113064_orf_features.csv to data/VPR_orfcalling_20240302075301_iPSC-rep1_SRR9113064_orf_features.csv
206 2998
download: s3://velia-piperuns-dev/VPR_orfcalling_20240302075301_iPSC-rep2_SRR9113065/output/VPR_orfcalling_20240302075301_iPSC-rep2_SRR9113065_orf_features.csv to data/VPR_orfcalling_20240302075301_iPSC-rep2_SRR9113065_orf_features.csv
206 2992
download: s3://velia-piperuns-dev/VPR_orfcalling_20240302075301_iPSC-rep3_SRR9113066/output/VPR_orfcalling_20240302075301_iPSC-rep3_SRR9113066_orf_features.csv to data/VPR_orfcalling_20240302075301_iPSC-rep3_SRR9113066_orf_features.csv
204 2942
download: s3://velia-piperuns-dev/VPR_orfcalling_20240302173017_SRX7666669/output/VPR_orfcalling_20240302173017_SRX7666669_orf_features.csv to data/VPR_orfcalling_20240302173017_SRX7666669_orf_features.csv
211 3025
download: s3://velia-piperuns-dev/VPR_orfcalling_2

In [4]:
def run(feature_list):
    """Train a model on ``feature_list`` with a fixed validation hold-out.

    ``utils.get_dataset`` is asked to hold out chr1, chr7, chr14 and chr21 for
    validation and is given an empty list of test chromosomes; the test part
    of its result is ignored.

    Returns whatever ``modeling_utils.train_model`` returns for the training
    and validation splits.
    """
    held_out_for_validation = ["chr1", "chr7", "chr14", "chr21"]
    # No test hold-out in this notebook (an earlier variant used
    # chr2, chr8, chr15 and chr22 here).
    held_out_for_test = []

    splits = utils.get_dataset(feature_list, held_out_for_validation, held_out_for_test)
    train_x, train_y, val_x, val_y, _test_x, _test_y = splits
    return modeling_utils.train_model(train_x, train_y, val_x, val_y)

In [5]:
# Seed the RNGs so the shuffle below is repeatable and torch's default
# generator starts from a known state on every Restart & Run All.
SEED = 0
random.seed(SEED)
torch.manual_seed(SEED)

# Pool the features of all three experiment groups into one list.
feature_list = features["Gaertner"] + features["iPSC"] + features["MB1"]
random.shuffle(feature_list)
best_model = run(feature_list)
# NOTE: this serializes the whole returned object, not just a state_dict.
# Assuming best_model is an nn.Module, loading it later requires the model's
# class definition to be importable.
torch.save(best_model, "./data/best_model")

input_size: 27
torch.Size([94106, 27]) torch.Size([94106])
Epoch 1, Training Loss: 0.18365041149323041
Validation Loss: 0.1518147548714413
Validation ROC-AUC: 0.9127676403476364
Validation PR-AUC: 0.6070134791361987
Epoch 2, Training Loss: 0.1253520643482913
Validation Loss: 0.1835520079804296
Validation ROC-AUC: 0.9135605302024232
Validation PR-AUC: 0.6010802059799562
Epoch 3, Training Loss: 0.11900042667529115
Validation Loss: 0.1604579210897365
Validation ROC-AUC: 0.9183856347959423
Validation PR-AUC: 0.6081489561764604
Epoch 4, Training Loss: 0.11661083084663451
Validation Loss: 0.1743557488550664
Validation ROC-AUC: 0.9118069580466789
Validation PR-AUC: 0.5831892940078999
Epoch 5, Training Loss: 0.11220083722083707
Validation Loss: 0.16815828632690463
Validation ROC-AUC: 0.9152971033336648
Validation PR-AUC: 0.5927200760374696
Epoch 6, Training Loss: 0.11053271947477968
Validation Loss: 0.1673997686271436
Validation ROC-AUC: 0.9159406649660224
Validation PR-AUC: 0.6096021466370519