In [72]:
# Parse every per-sample count table found in the "bam_tsv" directory into a
# flat list of records, one per non-blank line: sample id, position, and the
# A/C/G/T counts read from the tab-separated columns.

path = "bam_tsv"
import os
files = os.listdir(path)
files = [os.path.join(path, f) for f in files]

rows = []

for file in files:
    # The sample id is the file name up to its first dot. It is computed once
    # per file, and os.path.basename works with either path separator.
    sample_id = os.path.basename(file).split(".")[0]
    # The context manager closes each handle as soon as its file is parsed.
    with open(file, "rt") as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # Skip blank lines (e.g. a trailing newline) rather than
                # failing on int("").
                continue
            columns = line.split("\t")
            pos = int(columns[0])
            counts = [int(c) for c in columns[1:]]
            row = {"id": sample_id, "pos": pos, "A": counts[0], "C": counts[1], "G": counts[2], "T": counts[3]}
            rows.append(row)

# Collect the parsed records into a single table, one row per parsed line.
import pandas as pd
df = pd.DataFrame(rows)

# Read the reference from ref.fa (FASTA format) and keep its sequence as a
# plain str so it can be sliced by position.
from Bio import SeqIO
ref = SeqIO.read("ref.fa", "fasta")
ref_string = str(ref.seq)

def get_context(pos, sequence=None):
    """Return the bases at indices ``pos - 1``, ``pos`` and ``pos + 1``.

    ``pos`` is used directly as a 0-based index into the sequence, so the base
    at ``pos`` is the middle character of the returned string.

    Args:
        pos: Index of the central base.
        sequence: String to slice. Defaults to the module-level ``ref_string``.

    Returns:
        A three-character string when the whole window lies inside the
        sequence. A window running past the end is truncated by slicing.
        When there is no base before ``pos`` (``pos < 1``) an empty string is
        returned.
    """
    if sequence is None:
        sequence = ref_string
    if pos < 1:
        # A negative slice start would wrap around and silently return bases
        # from the far end of the sequence; report "no context" instead.
        return ""
    return sequence[pos - 1:pos + 2]

# Annotate every row with the reference bases surrounding its position.
df["context"] = df["pos"].apply(get_context)

df


Unnamed: 0,id,pos,A,C,G,T,context
0,SRR19914810,1881,204,0,1186,0,AGG
1,SRR19914810,29274,2451,0,0,317,CAG
2,SRR19914810,29275,310,0,2470,1,AGG
3,SRR19914827,1881,187,0,1600,0,AGG
4,SRR19914827,26490,0,427,0,2404,TTA
...,...,...,...,...,...,...,...
2136,SRR19914738,29274,2616,0,2,397,CAG
2137,SRR19914738,29275,387,0,2635,1,AGG
2138,SRR19914756,6793,378,2442,0,0,GCT
2139,SRR19914761,4783,0,207,0,41,ACT


In [73]:
# Sample metadata: a headerless CSV; only its first four columns are read and
# they are given the names in `cols`.
file = "./sra_metadata.csv"
cols = ["id","biosample","sample_name","drug"]
meta = pd.read_csv(file, header=None, usecols=[0,1,2,3], names=cols)

# sample_name has the form study_name/detailed_sample_name/country, and
# detailed_sample_name has the form individualid_day. Split each once and
# reuse the pieces.
name_parts = meta["sample_name"].str.split("/")
detail_parts = name_parts.str[1].str.split("_")

# Only the leaf fields become columns; the intermediate detailed sample name
# is never stored, so there is nothing to drop afterwards.
meta = meta.assign(
    study_name=name_parts.str[0],
    country=name_parts.str[2],
    individualid=detail_parts.str[0],
    day=detail_parts.str[1],
)

# Attach the metadata to the per-position rows by sample id.
df = df.merge(meta, on="id")

In [74]:

# find positions that occur more than once and remove them
# NOTE(review): the frame returned by filter() below is neither assigned nor
# displayed, so `df` is left unchanged by this statement. The predicate also
# keeps positions that have up to two rows, which does not match "more than
# once" above -- confirm the intended threshold before assigning the result.

df.groupby("pos").filter(lambda x: len(x) <= 2)

def create_string_from_row(row, min_fraction=0.1):
    """Name the bases that make up more than ``min_fraction`` of a row's counts.

    Args:
        row: Mapping-like object (DataFrame row, Series or dict) with integer
            counts under the keys "A", "C", "G" and "T".
        min_fraction: A base is reported only when its share of the four
            counts is strictly greater than this value.

    Returns:
        The qualifying base letters concatenated in A, C, G, T order, or an
        empty string when all four counts are zero.
    """
    bases = ("A", "C", "G", "T")
    counts = {b: row[b] for b in bases}
    total = sum(counts.values())
    if total == 0:
        # No counts at all: no base can exceed the threshold, and dividing by
        # zero is avoided.
        return ""
    return "".join(b for b in bases if counts[b] / total > min_fraction)

# Summarise each row as the string of bases that pass create_string_from_row's threshold.
df["bases"] = df.apply(create_string_from_row, axis=1)



        

In [75]:
# Restrict to the rows whose day is "d5".
d5 = df[df['day']=='d5']
# aggregate "bases" by "drug"
# NOTE: this is not the last expression of the cell and is not assigned, so
# the tally is computed but not shown.
d5.groupby("drug")["bases"].value_counts()
# subset to d5 rows where bases == "AG" and the second character of context is "G"
# (d5 already contains only day == "d5" rows, so that condition is not repeated).
# .copy() makes `selection` an independent frame, so adding a column below
# does not raise SettingWithCopyWarning.
selection = d5[(d5["bases"] == "AG") & (d5["context"].str[1] == "G")].copy()
selection['context_starts_with_T'] = selection['context'].str[0] == 'T'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selection['context_starts_with_T'] = selection['context'].str[0] == 'T'


In [76]:
# Display the day-5 subset.
d5

Unnamed: 0,id,pos,A,C,G,T,context,biosample,sample_name,drug,study_name,country,individualid,day,bases
7,SRR19914991,1881,386,1,2812,6,AGG,SAMN29453041,AGL-CST2/213_d5/UK,placebo,AGL-CST2,UK,213,d5,AG
8,SRR19914991,18633,0,0,4050,541,TGT,SAMN29453041,AGL-CST2/213_d5/UK,placebo,AGL-CST2,UK,213,d5,GT
9,SRR19914991,25405,9,2,1177,6709,ATG,SAMN29453041,AGL-CST2/213_d5/UK,placebo,AGL-CST2,UK,213,d5,GT
10,SRR19914849,26941,0,183,0,227,TCG,SAMN29453035,AGL-CST2/211_d5/UK,molnupiravir,AGL-CST2,UK,211,d5,CT
20,SRR19915047,240,0,18,0,107,TCG,SAMN29453416,AGL-CST2/339_d5/UK,placebo,AGL-CST2,UK,339,d5,CT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2133,SRR19914738,19365,0,278,0,31,ACC,SAMN29453008,AGL-CST2/202_d5/UK,molnupiravir,AGL-CST2,UK,202,d5,CT
2134,SRR19914738,22897,401,0,3094,0,TGG,SAMN29453008,AGL-CST2/202_d5/UK,molnupiravir,AGL-CST2,UK,202,d5,AG
2135,SRR19914738,25686,676,0,4204,0,TGC,SAMN29453008,AGL-CST2/202_d5/UK,molnupiravir,AGL-CST2,UK,202,d5,AG
2136,SRR19914738,29274,2616,0,2,397,CAG,SAMN29453008,AGL-CST2/202_d5/UK,molnupiravir,AGL-CST2,UK,202,d5,AT


In [91]:
# Count True/False values of context_starts_with_T within each drug.
values = selection.groupby("drug")["context_starts_with_T"].value_counts()
# Divide every count by its drug's total. transform("sum") returns the group
# totals on the same index as `values`, so the result keeps that index and no
# groupby.apply call (the source of the group_keys warning) is needed.
proportions = values / values.groupby(level=0).transform("sum")
proportions

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  proportions  = values.groupby(level=0).apply(lambda x: x / float(x.sum()))


drug          context_starts_with_T
molnupiravir  True                     0.546584
              False                    0.453416
placebo       False                    0.800000
              True                     0.200000
Name: context_starts_with_T, dtype: float64

In [78]:
# Display the selected rows together with the context_starts_with_T flag.
selection

Unnamed: 0,id,pos,A,C,G,T,context,biosample,sample_name,drug,study_name,country,individualid,day,bases,context_starts_with_T
7,SRR19914991,1881,386,1,2812,6,AGG,SAMN29453041,AGL-CST2/213_d5/UK,placebo,AGL-CST2,UK,213,d5,AG,False
21,SRR19915047,1881,119,0,516,0,AGG,SAMN29453416,AGL-CST2/339_d5/UK,placebo,AGL-CST2,UK,339,d5,AG,False
23,SRR19915047,29275,165,0,1360,1,AGG,SAMN29453416,AGL-CST2/339_d5/UK,placebo,AGL-CST2,UK,339,d5,AG,False
33,SRR19914674,409,25,0,114,0,TGG,SAMN29453368,AGL-CST2/323_d5/UK,molnupiravir,AGL-CST2,UK,323,d5,AG,True
34,SRR19914674,532,269,0,2072,1,AGC,SAMN29453368,AGL-CST2/323_d5/UK,molnupiravir,AGL-CST2,UK,323,d5,AG,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2127,SRR19914738,8218,48,0,389,0,TGT,SAMN29453008,AGL-CST2/202_d5/UK,molnupiravir,AGL-CST2,UK,202,d5,AG,True
2132,SRR19914738,18987,1343,0,5836,0,TGT,SAMN29453008,AGL-CST2/202_d5/UK,molnupiravir,AGL-CST2,UK,202,d5,AG,True
2134,SRR19914738,22897,401,0,3094,0,TGG,SAMN29453008,AGL-CST2/202_d5/UK,molnupiravir,AGL-CST2,UK,202,d5,AG,True
2135,SRR19914738,25686,676,0,4204,0,TGC,SAMN29453008,AGL-CST2/202_d5/UK,molnupiravir,AGL-CST2,UK,202,d5,AG,True


In [89]:
from collections import Counter

# Tally, for every G in the reference, the window made of the preceding base,
# the G itself and the following base.
# Indexing starts at 1: a G at index 0 has no preceding base, and the slice
# ref_string[-1:2] would wrap around and record an empty-string context.
# A G at the very end still contributes a two-character window, because
# slicing truncates at the end of the string.
context_counts = Counter(
    ref_string[i - 1:i + 2]
    for i in range(1, len(ref_string))
    if ref_string[i] == "G"
)

In [83]:
# Display the selected rows again (same frame as shown earlier).
selection

Unnamed: 0,id,pos,A,C,G,T,context,biosample,sample_name,drug,study_name,country,individualid,day,bases,context_starts_with_T
7,SRR19914991,1881,386,1,2812,6,AGG,SAMN29453041,AGL-CST2/213_d5/UK,placebo,AGL-CST2,UK,213,d5,AG,False
21,SRR19915047,1881,119,0,516,0,AGG,SAMN29453416,AGL-CST2/339_d5/UK,placebo,AGL-CST2,UK,339,d5,AG,False
23,SRR19915047,29275,165,0,1360,1,AGG,SAMN29453416,AGL-CST2/339_d5/UK,placebo,AGL-CST2,UK,339,d5,AG,False
33,SRR19914674,409,25,0,114,0,TGG,SAMN29453368,AGL-CST2/323_d5/UK,molnupiravir,AGL-CST2,UK,323,d5,AG,True
34,SRR19914674,532,269,0,2072,1,AGC,SAMN29453368,AGL-CST2/323_d5/UK,molnupiravir,AGL-CST2,UK,323,d5,AG,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2127,SRR19914738,8218,48,0,389,0,TGT,SAMN29453008,AGL-CST2/202_d5/UK,molnupiravir,AGL-CST2,UK,202,d5,AG,True
2132,SRR19914738,18987,1343,0,5836,0,TGT,SAMN29453008,AGL-CST2/202_d5/UK,molnupiravir,AGL-CST2,UK,202,d5,AG,True
2134,SRR19914738,22897,401,0,3094,0,TGG,SAMN29453008,AGL-CST2/202_d5/UK,molnupiravir,AGL-CST2,UK,202,d5,AG,True
2135,SRR19914738,25686,676,0,4204,0,TGC,SAMN29453008,AGL-CST2/202_d5/UK,molnupiravir,AGL-CST2,UK,202,d5,AG,True


In [90]:
# get the proportion of each context in descending order
# dict() accepts both a Counter and the list of (context, proportion) pairs
# this cell leaves behind, so re-running the cell does not fail on .values().
counts_by_context = dict(context_counts)
# Compute the grand total once instead of once per context.
total_count = sum(counts_by_context.values())
context_counts = sorted(
    ((k, v / total_count) for k, v in counts_by_context.items()),
    key=lambda x: x[1],
    reverse=True,
)
# NOTE: not the last expression of the cell, so this list is not displayed.
context_counts

# Combined proportion of the contexts whose first character is "T".
# startswith() is used so an empty context key cannot raise IndexError.
starts_with_t = sum([p for ctx, p in context_counts if ctx.startswith("T")])
starts_with_t



0.44158280743646594