In [11]:
import json
filename = "fastq-run-info-with-extra.json"
with open(filename) as f:
    data = json.load(f)

# Tag every run record with a (drug, subject, timepoint) key built from the
# text around the first underscore of 'sample_title' and 'host subject id'.
for item in data:
    # The unused remainder is bound to `_rest` rather than `_`: at notebook
    # top level `_` is IPython's "last output" variable, and assigning to it
    # shadows that for the rest of the session.
    drug, _rest = item['sample_title'].split('_', 1)
    subject, timepoint = item['host subject id'].split('_', 1)
    item['combo_name'] = (drug, subject, timepoint)

In [12]:
# Index run accessions by their (drug, subject, timepoint) key; if two records
# share a key, the later record wins.
combo_to_accession = dict(
    (record['combo_name'], record['run_accession']) for record in data
)
combo_to_accession


{('MOLNUPIRAVIR', '10', 'T0'): 'ERR10442739',
 ('MOLNUPIRAVIR', '13', 'T0'): 'ERR10442746',
 ('MOLNUPIRAVIR', '8', 'T0'): 'ERR10442747',
 ('MOLNUPIRAVIR', '14', 'T0'): 'ERR10442752',
 ('PAXLOVID', '4', 'T1'): 'ERR10442753',
 ('PAXLOVID', '3', 'T1'): 'ERR10442754',
 ('PAXLOVID', '7', 'T0'): 'ERR10442755',
 ('PAXLOVID', '2', 'T1'): 'ERR10442756',
 ('PAXLOVID', '5', 'T1'): 'ERR10442757',
 ('MOLNUPIRAVIR', '15', 'T1'): 'ERR10442758',
 ('MOLNUPIRAVIR', '11', 'T1'): 'ERR10442759',
 ('MOLNUPIRAVIR', '12', 'T1'): 'ERR10442760',
 ('MOLNUPIRAVIR', '9', 'T1'): 'ERR10442761',
 ('MOLNUPIRAVIR', '10', 'T1'): 'ERR10442762',
 ('MOLNUPIRAVIR', '13', 'T1'): 'ERR10442763',
 ('MOLNUPIRAVIR', '14', 'T1'): 'ERR10442764',
 ('PAXLOVID', '4', 'T0'): 'ERR10442765',
 ('PAXLOVID', '7', 'T2'): 'ERR10442766',
 ('PAXLOVID', '4', 'T2'): 'ERR10442767',
 ('PAXLOVID', '3', 'T2'): 'ERR10442768',
 ('PAXLOVID', '2', 'T2'): 'ERR10442769',
 ('PAXLOVID', '5', 'T2'): 'ERR10442770',
 ('MOLNUPIRAVIR', '15', 'T2'): 'ERR10442771',

In [13]:
# Unique subject ids: element 1 of every combo key.
all_subjects = {combo[1] for combo in combo_to_accession}

In [29]:
# For each subject, take the T0 sample and work out the starting nucleotide
# at each position.

def get_ref(subject, accessions=None, results_dir="results_tsv", min_fold=5, min_depth=10):
    """Call a per-position reference nucleotide from a subject's T0 sample.

    Reads ``{results_dir}/{accession}.sorted.bam.tsv`` for the subject's T0
    run. Every non-blank line must contain tab-separated integers: a position
    followed by counts that are paired, in order, with A, C, G, T (the column
    order of the file itself is assumed, not checked).

    A position is called as its most frequent nucleotide only when that count
    is at least ``min_fold`` times the runner-up count and at least
    ``min_depth``; otherwise the position is recorded as 'N'.

    Args:
        subject: Subject id, matched against element 1 of each combo key.
        accessions: Mapping of (drug, subject, timepoint) -> run accession.
            Defaults to the notebook-level ``combo_to_accession``.
        results_dir: Directory holding the per-accession TSV files.
        min_fold: Required ratio between the top and second-highest count.
        min_depth: Required minimum count for the top nucleotide.

    Returns:
        dict mapping int position -> 'A', 'C', 'G', 'T' or 'N'.

    Raises:
        IndexError: if ``accessions`` has no T0 entry for ``subject``.
    """
    if accessions is None:
        accessions = combo_to_accession

    t0_keys = [key for key in accessions if key[1] == subject and key[2] == 'T0']
    if not t0_keys:
        # Same exception type as the bare `[...][0]` lookup, but with context.
        raise IndexError("no T0 sample found for subject {!r}".format(subject))
    t0_accession = accessions[t0_keys[0]]

    nucs = ['A', 'C', 'G', 'T']
    refs = {}
    with open("{}/{}.sorted.bam.tsv".format(results_dir, t0_accession)) as f:
        for line in f:
            if not line.strip():
                # Skip blank lines instead of crashing on int('').
                continue
            position, *counts = [int(x) for x in line.split("\t")]
            # Stable sort: ties keep A, C, G, T order.
            ranked = sorted(zip(nucs, counts), key=lambda pair: pair[1], reverse=True)
            top_nuc, top_count = ranked[0]
            second_count = ranked[1][1]
            if top_count < min_fold * second_count or top_count < min_depth:
                refs[position] = 'N'
            else:
                refs[position] = top_nuc
    return refs




In [30]:
# Per-subject reference calls: subject id -> result of get_ref() for that subject.
references = dict((subject_id, get_ref(subject_id)) for subject_id in all_subjects)

In [31]:
# Display the full `references` mapping as this cell's output.
references

{'4': {2: 'T',
  3: 'A',
  4: 'A',
  5: 'A',
  6: 'G',
  7: 'G',
  8: 'T',
  9: 'T',
  10: 'T',
  11: 'A',
  12: 'T',
  13: 'A',
  14: 'C',
  15: 'C',
  16: 'T',
  17: 'T',
  18: 'C',
  19: 'C',
  20: 'C',
  21: 'A',
  22: 'G',
  23: 'G',
  24: 'T',
  25: 'A',
  26: 'A',
  27: 'C',
  28: 'A',
  29: 'A',
  30: 'A',
  31: 'C',
  32: 'C',
  33: 'A',
  34: 'A',
  35: 'C',
  36: 'C',
  37: 'A',
  38: 'A',
  39: 'C',
  40: 'T',
  41: 'T',
  42: 'T',
  43: 'C',
  44: 'G',
  45: 'A',
  46: 'T',
  47: 'C',
  48: 'T',
  49: 'C',
  50: 'T',
  51: 'T',
  52: 'G',
  53: 'T',
  54: 'A',
  55: 'G',
  56: 'A',
  57: 'T',
  58: 'C',
  59: 'T',
  60: 'G',
  61: 'T',
  62: 'T',
  63: 'C',
  64: 'T',
  65: 'C',
  66: 'T',
  67: 'A',
  68: 'A',
  69: 'A',
  70: 'C',
  71: 'G',
  72: 'A',
  73: 'A',
  74: 'C',
  75: 'T',
  76: 'T',
  77: 'T',
  78: 'A',
  79: 'A',
  80: 'A',
  81: 'A',
  82: 'T',
  83: 'C',
  84: 'T',
  85: 'G',
  86: 'T',
  87: 'G',
  88: 'T',
  89: 'G',
  90: 'G',
  91: 'C',
  92: 'T',
  

In [40]:
def process(subject, accessions=None, refs=None, in_dir="results_tsv", out_dir="refs_results"):
    """Annotate a subject's non-T0 count tables with the reference nucleotide.

    For every combo key of ``subject`` whose timepoint is not 'T0', reads
    ``{in_dir}/{accession}.sorted.bam.tsv`` and writes
    ``{out_dir}/{drug}_{subject}_{timepoint}.tsv``. Each output row is the
    position, the reference call for that position ('N' when the subject or
    the position has no call), then the original count columns unchanged.

    Args:
        subject: Subject id, matched against element 1 of each combo key.
        accessions: Mapping of (drug, subject, timepoint) -> run accession.
            Defaults to the notebook-level ``combo_to_accession``.
        refs: Mapping of subject -> {int position: nucleotide}.
            Defaults to the notebook-level ``references``.
        in_dir: Directory holding the per-accession input TSV files.
        out_dir: Directory for the annotated output; created if missing.

    Returns:
        None. The result is the files written under ``out_dir``.
    """
    import os

    if accessions is None:
        accessions = combo_to_accession
    if refs is None:
        refs = references

    # A subject with no reference calls gets 'N' at every position.
    subject_refs = refs.get(subject, {})
    os.makedirs(out_dir, exist_ok=True)

    later_timepoints = [key for key in accessions if key[1] == subject and key[2] != 'T0']
    for combo_name in later_timepoints:
        accession = accessions[combo_name]
        in_file = "{}/{}.sorted.bam.tsv".format(in_dir, accession)
        out_file = "{}/{}.tsv".format(out_dir, "_".join(combo_name))
        with open(in_file) as f, open(out_file, 'w') as g:
            for line in f:
                line = line.strip()
                if not line:
                    # Skip blank lines instead of crashing on int('').
                    continue
                position, *counts = line.split("\t")
                position = int(position)
                ref = subject_refs.get(position, 'N')
                g.write("{}\t{}\t{}\n".format(position, ref, "\t".join(counts)))

# Run process() once for every subject id in `all_subjects`.
for subject in all_subjects:
    process(subject)


In [36]:
# Display the set of subject ids as this cell's output.
all_subjects

{'1',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '17',
 '18',
 '19',
 '2',
 '20',
 '21',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9'}