In [2]:
# check distribution of training set before and after biased downsampling
import pandas as pd
import numpy as npr
import seaborn as sns
import matplotlib.pyplot as plt

In [14]:
def _load_counts(path):
    """Read a gzip-compressed, tab-separated counts file as a DNA/RNA frame.

    The first line of the file is skipped (``skiprows=1``) and the columns
    of the remaining rows are labelled ``'DNA'`` and ``'RNA'``.
    """
    counts = pd.read_csv(path, compression='gzip', header=None, sep='\t', skiprows=1)
    counts.columns = ['DNA', 'RNA']
    return counts


def _positive_fraction(counts):
    """Return the share of rows whose 'RNA' value is a whole number.

    This is the quantity the report below prints as the percentage of
    positive samples.
    """
    is_whole = counts['RNA'].apply(lambda x: float(x).is_integer())
    return is_whole.sum() / len(counts)


# Every file read by this cell lives in the same directory.
data_dir = '/datacommons/igvf-pm/A549/GR-AP1/simulated-seq/data/0.01-positive/'

print('percentage of positive samples in training set:')
all_train = _load_counts(data_dir + 'all-train-0.01-counts.txt.gz')
frac = _positive_fraction(all_train)
print(f"before downsampling: {frac:.2%}")

unbiased_train = _load_counts(data_dir + 'train-unbiased-counts.txt.gz')
frac = _positive_fraction(unbiased_train)
print(f"after unbiased downsampling: {frac:.2%}")

# Five biased samples, numbered 1..5; `biased_train` keeps the last one loaded.
for i in range(1,6):
    biased_train = _load_counts(data_dir + 'train-lognormal10-'+str(i)+'-counts.txt.gz')
    frac = _positive_fraction(biased_train)
    print(f"after biased downsampling by acceptance function lognormal^10, in sample {i}: {frac:.4%}")
print('')
print('models were trained for 40 epochs, no earlystop')

percentage of positive samples in training set:
before downsampling: 1.00%
after unbiased downsampling: 1.00%
after biased downsampling by acceptance function lognormal^10, in sample 1: 3.5652%
after biased downsampling by acceptance function lognormal^10, in sample 2: 3.5661%
after biased downsampling by acceptance function lognormal^10, in sample 3: 3.5655%
after biased downsampling by acceptance function lognormal^10, in sample 4: 3.5661%
after biased downsampling by acceptance function lognormal^10, in sample 5: 3.5673%

models were trained for 40 epochs, no earlystop


In [15]:
def _load_counts(path):
    """Read a gzip-compressed, tab-separated counts file as a DNA/RNA frame.

    The first line of the file is skipped (``skiprows=1``) and the columns
    of the remaining rows are labelled ``'DNA'`` and ``'RNA'``.
    """
    counts = pd.read_csv(path, compression='gzip', header=None, sep='\t', skiprows=1)
    counts.columns = ['DNA', 'RNA']
    return counts


def _positive_fraction(counts):
    """Return the share of rows whose 'RNA' value is a whole number.

    This is the quantity the report below prints as the percentage of
    positive samples.
    """
    is_whole = counts['RNA'].apply(lambda x: float(x).is_integer())
    return is_whole.sum() / len(counts)


# Every file read by this cell lives in the same directory.
data_dir = '/datacommons/igvf-pm/A549/GR-AP1/simulated-seq/data/0.003-positive/'

print('percentage of positive samples in training set:')

all_train = _load_counts(data_dir + 'all-train-0.003-counts.txt.gz')
frac = _positive_fraction(all_train)
print(f"before downsampling: {frac:.2%}")

unbiased_train = _load_counts(data_dir + 'train-unbiased-counts.txt.gz')
frac = _positive_fraction(unbiased_train)
print(f"after unbiased downsampling: {frac:.2%}")

# Five biased samples, numbered 1..5; `biased_train` keeps the last one loaded.
for i in range(1,6):
    biased_train = _load_counts(data_dir + 'train-lognormal10-'+str(i)+'-counts.txt.gz')
    frac = _positive_fraction(biased_train)
    print(f"after biased downsampling by acceptance function lognormal^10, in sample {i}: {frac:.4%}")
print('')
print('models were trained for 30 epochs, no earlystop')

percentage of positive samples in training set:
before downsampling: 0.30%
after unbiased downsampling: 0.30%
after biased downsampling by acceptance function lognormal^10, in sample 1: 1.0734%
after biased downsampling by acceptance function lognormal^10, in sample 2: 1.0730%
after biased downsampling by acceptance function lognormal^10, in sample 3: 1.0728%
after biased downsampling by acceptance function lognormal^10, in sample 4: 1.0725%
after biased downsampling by acceptance function lognormal^10, in sample 5: 1.0726%

models were trained for 30 epochs, no earlystop


In [14]:
# Number of rows in the currently loaded `biased_train` frame whose RNA value
# is a whole number (the raw count behind the percentages printed above).
rna_is_whole = biased_train['RNA'].apply(lambda value: float(value).is_integer())
rna_is_whole.sum()

18247

In [16]:
def loadFasta(fasta_path, as_dict=False,uppercase=False, stop_at=None,
              revcomp=False):
    """Parse a FASTA file (plain text or gzip-compressed) into records.

    Parameters
    ----------
    fasta_path : str
        Path of the file to read. Paths ending in ".gz" are opened with
        ``gzip.open``; anything else with the built-in ``open``.
    as_dict : bool, default False
        If True, return ``{header: sequence}`` instead of a DataFrame.
    uppercase : bool, default False
        If True, upper-case every sequence line as it is read.
    stop_at : int or None, default None
        If given, stop reading once this many records have been collected.
    revcomp : bool, default False
        If True, each sequence becomes
        ``sequence + "N" * 20 + generate_complementary_sequence(sequence)``.
        ``generate_complementary_sequence`` is not defined in this function
        and must be available in the enclosing namespace.

    Returns
    -------
    dict or pandas.DataFrame
        With ``as_dict`` a dict keyed by the full header line (without ">").
        Otherwise a DataFrame with columns ``'location'`` (the header text
        after its last "=") and ``'sequence'``. A file containing no header
        line yields an empty dict / empty DataFrame.
    """
    opener = gzip.open if fasta_path.endswith(".gz") else open
    fastas = []
    seq = None
    header = None
    # The context manager closes the handle even when we stop early.
    with opener(fasta_path) as handle:
        for r in handle:
            # gzip.open defaults to binary mode and therefore yields bytes.
            if isinstance(r, bytes):
                r = r.decode("utf-8")
            r = r.strip()
            if r.startswith(">"):
                # A new header closes the record collected so far.
                if seq is not None and header is not None:
                    fastas.append([header, seq])
                    if stop_at is not None and len(fastas) >= stop_at:
                        break
                seq = ""
                header = r[1:]
            else:
                if uppercase:
                    r = r.upper()
                seq = r if seq is None else seq + r
    # Flush the trailing record. Skip it when no header was ever seen (empty
    # or header-less file) or when the stop_at limit has already been reached.
    if header is not None and (stop_at is None or len(fastas) < stop_at):
        fastas.append([header, seq])
    # Apply revcomp before choosing the return shape so that it is honoured
    # for the dict output as well.
    if revcomp:
        for rec in fastas:
            rc = generate_complementary_sequence(rec[1])
            rec[1] = rec[1] + "NNNNNNNNNNNNNNNNNNNN" + rc
    if as_dict:
        return {h: s for h, s in fastas}
    return pd.DataFrame({'location': [e[0].split('=')[-1] for e in fastas],
                         'sequence': [e[1] for e in fastas]})

In [21]:
import gzip

# Parse the first biased training sample and report how many distinct
# 'location' values it contains.
fasta_path = '/datacommons/igvf-pm/A549/GR-AP1/simulated-seq/data/0.003-positive/train-lognormal10-1.fasta.gz'
fasta = loadFasta(fasta_path)
locations = fasta['location']
locations.nunique()

1700000