This is a script where we generates multiple versions of models and compare their perfromance.
To generate and save the final model, use another script named "run_modeling_main.ipynb"

In [1]:
import random
import torch.nn as nn

import utils
import modeling_utils
from importlib import reload

In [2]:
local_folder = "./data/"
truthset_positive = f"{local_folder}/truthset_positives.240227.tsv"
truthset_negative = f"{local_folder}/truthset_negatives.240227.tsv"
label_columns = ["score.(iPSC)", "score.(MB1)", "score.(Gaertner)"]

labels = {}
utils.read_labels(data_path=truthset_negative, labels=labels, label_columns=label_columns)
utils.read_labels(data_path=truthset_positive, labels=labels, label_columns=label_columns)

('chr4', '164877331', '164879008', '+', '164877331-164877407|164878919-164879008')


In [3]:
group_names = ["iPSC", "Gaertner", "MB1"]
experiment_groups = {}
for group_name in group_names:
    experiment_groups[group_name] = []
    for line in open(f"{local_folder}/{group_name}.txt"):
        experiment_groups[group_name].append(line.strip())

features = {}
for group_name, experiment_name_list in experiment_groups.items():
    features[group_name] = []
    for experiment_name in experiment_name_list:
        utils.download_feature_set(
            experiment_name=experiment_name,
            local_folder="./data/")
        utils.read_features(
            data_path=f"{local_folder}/{experiment_name}_orf_features.csv",
            features = features[group_name],
            labels=labels,
            label_name=f"score.({group_name})")
        utils.remove_local_copy(experiment_name, local_folder)

download: s3://velia-piperuns-dev/VPR_orfcalling_20240302075301_iPSC-rep1_SRR9113064/output/VPR_orfcalling_20240302075301_iPSC-rep1_SRR9113064_orf_features.csv to data/VPR_orfcalling_20240302075301_iPSC-rep1_SRR9113064_orf_features.csv
206 2998
download: s3://velia-piperuns-dev/VPR_orfcalling_20240302075301_iPSC-rep2_SRR9113065/output/VPR_orfcalling_20240302075301_iPSC-rep2_SRR9113065_orf_features.csv to data/VPR_orfcalling_20240302075301_iPSC-rep2_SRR9113065_orf_features.csv
206 2992
download: s3://velia-piperuns-dev/VPR_orfcalling_20240302075301_iPSC-rep3_SRR9113066/output/VPR_orfcalling_20240302075301_iPSC-rep3_SRR9113066_orf_features.csv to data/VPR_orfcalling_20240302075301_iPSC-rep3_SRR9113066_orf_features.csv
204 2942
download: s3://velia-piperuns-dev/VPR_orfcalling_20240302173017_SRX7666669/output/VPR_orfcalling_20240302173017_SRX7666669_orf_features.csv to data/VPR_orfcalling_20240302173017_SRX7666669_orf_features.csv
211 3025
download: s3://velia-piperuns-dev/VPR_orfcalling_2

In [4]:
def run(feature_list):
    validation_chroms = ["chr1", "chr7", "chr14", "chr21"]
    test_chroms = ["chr2", "chr8", "chr15", "chr22"]
    
    (training_data_x, training_data_y,
     validation_data_x, validation_data_y,
     test_data_x, test_data_y) = utils.get_dataset(feature_list, validation_chroms, test_chroms)
    model = modeling.train_model(
        training_data_x, training_data_y, validation_data_x, validation_data_y)
    return model

def evaluate(model, feature_list, experiment_name):
    validation_chroms = ["chr1", "chr7", "chr14", "chr21"]
    test_chroms = ["chr2", "chr8", "chr15", "chr22"]
    
    (_, _, _, _, test_data_x, test_data_y) = utils.get_dataset(feature_list, validation_chroms, test_chroms)
    modeling.evaluate_model(nn.BCELoss(), model, test_data_x, test_data_y,
                            plot=True, plot_folder="./plots", experiment_name=experiment_name)

# generate models with different training data combinations

In [5]:
feature_list = features["Gaertner"] + features["iPSC"]
random.shuffle(feature_list)
model_Gaertner_iPSC = run(feature_list)

NameError: name 'modeling' is not defined

In [None]:
feature_list = features["Gaertner"]
random.shuffle(feature_list)
model_Gaertner = run(feature_list)

In [None]:
feature_list = features["iPSC"]
random.shuffle(feature_list)
model_iPSC = run(feature_list)

# Evaluate different models on the same test data

In [None]:
#test_loss, roc_auc, pr_auc = modeling.evaluate_model(nn.BCELoss(), best_model, test_data_x, test_data_y)

In [None]:
evaluate(model=model_Gaertner_iPSC,
         feature_list=features["MB1"],
         experiment_name="HCT116_model_pancreatic_progenitors_iPSC")

In [None]:
evaluate(model=model_Gaertner,
         feature_list=features["MB1"],
         experiment_name="HCT116_model_pancreatic_progenitors")

In [None]:
evaluate(model=model_iPSC,
         feature_list=features["MB1"],
         experiment_name="HCT116_model_iPSC")