In [1]:
from Bio import SeqIO
file = "./consensus/aligned.fa"
# Map each record id to its aligned sequence, upper-cased so that the
# case-sensitive base comparisons made later are not affected by lower-case
# input. If an id occurs more than once, the last record wins.
records = {r.id: r.seq.upper() for r in SeqIO.parse(file, "fasta")}
ref_file = "ref.fa"
# Only the first record of the reference FASTA is used.
ref = list(SeqIO.parse(ref_file, "fasta"))[0]
# Upper-case the reference too: it is compared character by character against
# the (upper-cased) sequences in `records`, so both must use the same case.
ref = ref.seq.upper()

In [2]:
import pandas as pd
file = "./sra_metadata.csv"
cols = ["id","biosample","sample_name","drug"]
# The CSV has no header row: read only the first four columns and name them.
df = pd.read_csv(file, header=None, usecols=[0,1,2,3], names=cols)

# sample_name has the form study_name/detailed_sample_name/country.
name_parts = df["sample_name"].str.split("/")
# The middle part has the form individualid_day; it is only needed to derive
# those two columns, so it is never stored on the frame.
detail_parts = name_parts.str[1].str.split("_")

df["study_name"] = name_parts.str[0]
df["country"] = name_parts.str[2]
df["individualid"] = detail_parts.str[0]
df["day"] = detail_parts.str[1]


In [3]:
def find_mutations(start_seq, end_seq, reference=None):
    """Find substitutions between two aligned sequences.

    Positions where either sequence has "N" or "-" are never reported.
    Matching is case-sensitive, so callers should pass upper-case sequences.

    Parameters
    ----------
    start_seq, end_seq : indexable sequences of single characters
        The earlier and the later sequence. Every index of ``start_seq`` is
        also read from ``end_seq`` and from the reference.
    reference : sequence, optional
        Reference used for the trinucleotide context and for the
        "mutation to reference" check. Defaults to the notebook-level ``ref``.

    Returns
    -------
    list of (position, start_base, end_base, context) tuples, where
    ``position`` is 0-based and ``context`` is the reference slice covering
    the position and its immediate neighbours (shorter at the sequence ends).
    An empty list is returned, and a message printed, when at least one change
    goes to the reference base and such changes are at least as numerous as
    the others.
    """
    if reference is None:
        reference = ref  # notebook-level reference sequence
    ignored = ("N", "-")
    mutations = []
    to_ref = 0
    not_to_ref = 0
    for i in range(len(start_seq)):
        before = start_seq[i]
        after = end_seq[i]
        if before == after or before in ignored or after in ignored:
            continue
        # Clamp the lower bound: at i == 0 a start of -1 would make the slice
        # wrap around and come back empty.
        context = reference[max(i - 1, 0):i + 2]
        mutations.append((i, before, after, context))
        # Tally whether the change moves towards the reference base.
        if after == reference[i]:
            to_ref += 1
        else:
            not_to_ref += 1
    if to_ref > 0 and to_ref >= not_to_ref:
        print("many mutations to reference, ignoring pair")
        return []
    return mutations

In [7]:
def transition_or_tranvsversion(start,end):
    """Classify a base change as "transition" or "transversion".

    A<->G and C<->T changes are transitions. Every other (start, end) pair,
    including identical bases or non-ACGT characters, is reported as a
    transversion.
    """
    transitions = (("A", "G"), ("G", "A"), ("C", "T"), ("T", "C"))
    if (start, end) in transitions:
        return "transition"
    return "transversion"

by_individual = df.groupby("individualid")
from collections import Counter, defaultdict
# position -> number of compared pairs that carry a mutation at that position
site_counts = Counter()
# drug -> list of (position, start_base, end_base, context) tuples
muts_by_drug = defaultdict(list)

# drug -> number of individuals for which a pair of sequences was compared
drug_counts = Counter()

for individual, group in by_individual:
    # Both a d1 and a d5 sample are required for a comparison.
    d1 = group[group["day"] == "d1"]
    d5 = group[group["day"] == "d5"]
    if len(d1) == 0 or len(d5) == 0:
        continue

    d3 = group[group["day"] == "d3"]
    # Check the sequences are in records, otherwise print the problem and skip.
    if d1["id"].values[0] not in records:
        print("d1 not in records", d1["id"].values[0])
        # Fall back to d3 only if this individual actually has a d3 sample;
        # indexing an empty selection would raise IndexError.
        if len(d3) > 0 and d3["id"].values[0] in records:
            d1 = d3
            print("using d3 instead")
        else:
            continue
    if d5["id"].values[0] not in records:
        print("d5 not in records", d5["id"].values[0])
        continue
    drug = d1["drug"].values[0]
    drug_counts[drug] += 1
    # Compare the start sample with the d5 sample.
    mutations = find_mutations(records[d1["id"].values[0]], records[d5["id"].values[0]])
    print(individual, mutations, drug)
    for m in mutations:
        site_counts[m[0]] += 1
        muts_by_drug[drug].append(m)

# print the top 10 sites
print(site_counts.most_common(10))

# Positions that were seen mutated in more than one pair are removed from the
# per-drug lists.
multiple_times = {k for k,v in site_counts.items() if v > 1}
for drug, muts in muts_by_drug.items():
    muts_by_drug[drug] = [m for m in muts if m[0] not in multiple_times]




201 [] molnupiravir
202 [] molnupiravir
203 [] placebo
204 [] placebo
205 [] molnupiravir
206 [] placebo
207 [(13501, 'C', 'T', Seq('CCG'))] molnupiravir
208 [] placebo
209 [] placebo
d5 not in records SRR19915106
211 [(10424, 'G', 'A', Seq('GGT')), (16522, 'C', 'T', Seq('ACA')), (18815, 'G', 'A', Seq('TGT')), (23120, 'C', 'T', Seq('GCA')), (28115, 'G', 'A', Seq('CGA'))] molnupiravir
212 [] placebo
213 [] placebo
214 [] molnupiravir
215 [] molnupiravir
216 [] placebo
217 [] molnupiravir
218 [] placebo
219 [] placebo
220 [(18651, 'C', 'T', Seq('GCG'))] molnupiravir
221 [] molnupiravir
222 [] placebo
d5 not in records SRR19914624
224 [] placebo
225 [] placebo
226 [(1230, 'G', 'T', Seq('AGT')), (1417, 'G', 'T', Seq('TGC')), (12356, 'T', 'C', Seq('ACA')), (14211, 'G', 'A', Seq('TGA')), (15445, 'G', 'A', Seq('TGT')), (20828, 'A', 'T', Seq('TAA')), (28094, 'T', 'A', Seq('TAA')), (28870, 'T', 'G', Seq('AGG'))] molnupiravir
227 [] placebo
228 [(750, 'C', 'T', Seq('ACA')), (1018, 'C', 'T', Seq(

In [25]:


# Count each mutation type ("X>Y") per drug and plot the totals as grouped bars.
# (The stray backslash in the original banner was an invalid escape sequence.)
print("\nTOTALS ANALYSIS")
by_mut_type = {k: Counter([f"{m[1]}>{m[2]}" for m in muts]).most_common() for k, muts in muts_by_drug.items()}
from plotnine import *
import pandas as pd
# Build every row first and construct the frame once: DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row is quadratic anyway.
# NOTE(review): this rebinds `df`, which an earlier cell uses for the sample
# metadata; that cell must be re-run before the per-individual loop is re-run.
df = pd.DataFrame(
    [
        {"drug": drug, "mutation": mutation, "count": count}
        for drug, muts in by_mut_type.items()
        for mutation, count in muts
    ],
    columns=["drug", "mutation", "count"],
)
# Add the (drug, mutation) combinations that were never observed, with count 0,
# so that every drug gets a bar for every mutation type.
from itertools import product
df = df.merge(pd.DataFrame(list(product(df["drug"].unique(), df["mutation"].unique())), columns=["drug", "mutation"]), how="outer")
df["count"] = df["count"].fillna(0)

# plot the mutations by drug with bar
p = (ggplot(df, aes(fill="drug", y="count", x="mutation"))
    + geom_bar(stat="identity", position="dodge")
    + theme_classic() + labs(x="Mutation type", y="Times observed", fill = "Drug"))

p.save("mutations_by_drug.pdf", width=5, height=5, units="in", dpi=300)



\TOTALS ANALYSIS




In [None]:

def is_mut_of_interest(start,end, context):
    """Return True for G>A changes whose context contains a "T", or for C>T
    changes whose context contains an "A"; False for everything else.

    NOTE(review): the original comments described these as "g>a in the
    context of tgt" and "c>t in the context of aca", but the check only tests
    whether the base occurs anywhere in ``context`` -- confirm which is
    intended.
    """
    change = (start, end)
    if change == ("G", "A"):
        return "T" in context
    if change == ("C", "T"):
        return "A" in context
    return False


# For each drug, print three tallies over its mutations: the (start, end) base
# pairs, the contexts, and whether each mutation is "of interest".
for drug, muts in muts_by_drug.items():
    pair_tally = Counter(m[1:3] for m in muts)
    context_tally = Counter(m[3] for m in muts)
    interest_tally = Counter(is_mut_of_interest(m[1], m[2], m[3]) for m in muts)
    print(drug)
    print(pair_tally.most_common())
    print(context_tally.most_common())
    print(interest_tally.most_common())


# print the number of samples for each drug
print(drug_counts)

In [15]:
# Standard genetic code (NCBI translation table 1). The amino-acid string is
# ordered by first, then second, then third base, each running over T, C, A, G,
# so the codons must be generated in exactly that order for the zip to line up.
_BASES = "TCAG"
_AMINO_ACIDS = "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"
_CODON_TABLE = dict(zip((a + b + c for a in _BASES for b in _BASES for c in _BASES), _AMINO_ACIDS))


def translate_codon(codon):
    """Translate one DNA codon into its one-letter amino acid ("*" for stop).

    ``codon`` may be a string or any object whose ``str()`` is the three-letter
    codon; lower-case input is accepted. Raises ``KeyError`` for anything that
    is not a three-letter codon over A, C, G and T.
    """
    return _CODON_TABLE[str(codon).upper()]
from collections import Counter, defaultdict
import tqdm
file = "hu1.gb"
from Bio import SeqIO
# NOTE(review): this rebinds `records`, which the first cell uses for the
# id -> sequence dict; re-run that cell before re-running the mutation loop.
records = list(SeqIO.parse(file, "genbank"))
main = records[0]
# find CDS features
cds = [f for f in main.features if f.type == "CDS"]

# context -> Counter of "syn" / "nonsyn" over every G in the genome, classifying
# the effect a G>A change at that position would have.
context_to_type = defaultdict(Counter)

for i in tqdm.tqdm(range(len(main.seq))):
    my_val = main.seq[i]
    # Find the first CDS whose span contains this position. Location ends are
    # exclusive (Python-style), so the upper comparison must be strict.
    # NOTE(review): for a CDS with a compound (join) location the span covers
    # all parts and the codon frame below is taken from the overall start --
    # confirm this is acceptable for features containing a frameshift.
    this_cds = None
    for f in cds:
        if int(f.location.start) <= i < int(f.location.end):
            this_cds = f
            break
    if this_cds is not None:
        cds_start = int(this_cds.location.start)
        offset = i - cds_start
        codon = offset // 3
        codon_string = str(main.seq[cds_start + codon*3:cds_start + codon*3+3])
        my_position = offset % 3
    else:
        codon = None
        codon_string = None
        my_position = None
    if my_val == "G":
        # Clamp the lower bound: at i == 0 a start of -1 would make the slice
        # wrap around and come back empty.
        context = str(main.seq[max(i-1, 0):i+2])
        if codon is None:
            # positions outside every CDS are counted as synonymous
            effect = "syn"
        else:
            mutated_string = list(codon_string)
            mutated_string[my_position] = "A"
            mutated_string = "".join(mutated_string)
            if translate_codon(mutated_string) == translate_codon(codon_string):
                effect = "syn"
            else:
                effect = "nonsyn"
        context_to_type[context][effect] += 1

# For each context, print the fraction of its G positions classified as
# synonymous and as non-synonymous, accumulating the overall number of Gs.
grand_total = 0
for context, counts in context_to_type.items():
    total = sum(counts.values())
    grand_total = grand_total + total
    syn_fraction = counts["syn"]/total
    nonsyn_fraction = counts["nonsyn"]/total
    print(context, syn_fraction, nonsyn_fraction)

# Sanity check: the tally must equal a direct count of G in the sequence.
print(grand_total, "total G mutations")
print(sum(1 for base in main.seq if base == "G"), "total Gs calculated another way")

# Share of all synonymous positions contributed by each context, printed next
# to that context's raw non-synonymous count.
total_syn = sum(counts["syn"] for counts in context_to_type.values())
for context, counts in context_to_type.items():
    syn_share = counts["syn"]/total_syn
    print(context, syn_share , counts["nonsyn"])

100%|██████████| 29903/29903 [00:00<00:00, 151984.44it/s]


AGG 0.3890577507598784 0.6109422492401215
GGT 0.16079295154185022 0.8392070484581498
CGA 0.3157894736842105 0.6842105263157895
TGT 0.2867132867132867 0.7132867132867133
AGA 0.21818181818181817 0.7818181818181819
TGG 0.17870036101083034 0.8212996389891697
GGC 0.17937219730941703 0.820627802690583
CGG 0.2894736842105263 0.7105263157894737
TGC 0.1809872029250457 0.8190127970749543
AGT 0.35700197238658776 0.6429980276134122
CGC 0.29896907216494845 0.7010309278350515
CGT 0.2807017543859649 0.7192982456140351
TGA 0.18571428571428572 0.8142857142857143
GGA 0.20921985815602837 0.7907801418439716
AGC 0.23920265780730898 0.760797342192691
GGG 0.373134328358209 0.6268656716417911
5863 total G mutations
5863 total Gs calculated another way
AGG 0.08982456140350877 201
GGT 0.051228070175438595 381
CGA 0.021052631578947368 65
TGT 0.1726315789473684 612
AGA 0.09263157894736843 473
TGG 0.06947368421052631 455
GGC 0.028070175438596492 183
CGG 0.015438596491228071 54
TGC 0.06947368421052631 448
AGT 0.127