In [None]:
import pandas as pd

# Tri-Codon Extraction

In [None]:
# Hotspot motifs: the 4-base strings that are treated as hotspots.
# Kept as a list, in the original order.
motifs = [
    "AGCA", "GGCA", "TGCA", "AGTA",
    "GGTA", "TGTA", "AGCT", "GGCT",
    "TGCT", "AGTT", "GGTT", "TGTT",
    "AACT", "TACT", "AACC", "TACC",
    "AGCC", "TGCC", "AACA", "TACA",
]

In [None]:
# Load the csv to analyze; the path is relative to the notebook's working directory.
hotspot_csv = './Copy of IGHV_Human_Hotspot_Pyrimidine Analysis.csv'
df = pd.read_csv(hotspot_csv)

# preview the first rows
df.head()

In [None]:
# note: this chunk uses getCSC() function defined in getCSC_ALL.ipynb
# (getCSC must already be defined in the running kernel before this cell is executed)
#
# For every hotspot motif found at index >= 9 of a sequence, record the 9-base
# window around/upstream of it together with its C/T counts and getCSC() value.

# Window rule, keyed by the position of the hotspot's first base within its
# reading frame (base % 3). Each entry is:
#   (how many bases before the hotspot the 9-base window starts,
#    "Y"/"N" - does the window contain the hotspot's first base,
#    1-based position of that base inside the window, or "-" if not contained)
#   frame 0 -> seq[base-9:base]    window ends right before the hotspot
#   frame 1 -> seq[base-7:base+2]  hotspot start is the 8th base of the window
#   frame 2 -> seq[base-8:base+1]  hotspot start is the 9th base of the window
TRIPLET_WINDOW_BY_FRAME = {
    0: (9, "N", "-"),
    1: (7, "Y", "8"),
    2: (8, "Y", "9"),
}
TRIPLET_LENGTH = 9

gene_name_list = []
ID_name_list = []
motif_list = []
base_list = []
upstream_codons = []
c = []
t = []
ct = []
start_yn = []
start_y_pos = []
csc = []

#for each row of the csv file
for index, row in df.iterrows():
    gene_name = row['gene name']
    seq = row['sequence']
    ID = row['ID_name']

    # Start at index 9 (equivalent to the `base > 8` filter) so that the
    # window never reaches before the start of the sequence; stop at the last
    # position where a full 4-base motif still fits.
    for base in range(9, len(seq)-3):
        tetramer = seq[base:base+4]
        if tetramer not in motifs:
            continue

        lookback, contains_start, start_position = TRIPLET_WINDOW_BY_FRAME[base % 3]
        window_start = base - lookback
        codon = seq[window_start:window_start+TRIPLET_LENGTH]

        n_c = codon.count('C')
        n_t = codon.count('T')

        gene_name_list.append(gene_name)
        ID_name_list.append(ID)
        motif_list.append(tetramer)
        base_list.append(base)
        upstream_codons.append(codon)
        c.append(n_c)
        t.append(n_t)
        ct.append(n_c+n_t)
        start_yn.append(contains_start)
        start_y_pos.append(start_position)
        csc.append(getCSC(codon,0))

# Assemble the result table; dict order fixes the column order.
df_new_columns = {
    "IGHV gene": gene_name_list,
    "ID": ID_name_list,
    "Hotspot DNA": motif_list,
    "start base": base_list,
    "Upstream Codon Triplet": upstream_codons,
    "No. T": t,
    "No. C": c,
    "No. T+C": ct,
    "CSC Avg of Upstream Codon Triplet": csc,
    "Triplet Contains Hotspot Start?": start_yn,
    "If yes, at what position?": start_y_pos,
}

df_new = pd.DataFrame()
for column_name, values in df_new_columns.items():
    df_new[column_name] = values

df_new.head()

# SHM-Affected Codon Extraction

In [None]:
# Load the csv file to analyze (output of random_mutations.ipynb).
# NOTE: this rebinds `df`; any table previously held under that name is replaced.
random_variants_csv = './Random/Tonsil_withCSC_new_seqs_vals_COPY.csv'
df = pd.read_csv(random_variants_csv)

# preview the first rows
df.head()

In [None]:
# One entry is added to both lists for every codon of a mutated sequence
# that differs from the germline codon at the same position.
ighv_list = []
base_list = []

# walk through each row of the csv file
for _, record in df.iterrows():
    v_gene = record['V1']
    germline = record['Germline']
    # codon start positions are driven by the germline length
    codon_starts = range(0, len(germline), 3)
    # 'RandomVariants' holds the mutated sequences as one comma-separated string
    variants = record['RandomVariants'].split(',')

    # compare each randomly-mutated sequence against the germline, codon by codon
    for variant in variants:
        for start in codon_starts:
            variant_codon = variant[start:start+3]

            # skip anything that is not a full 3-base codon in the mutated sequence
            if len(variant_codon) != 3:
                continue

            # a differing codon means a mutation: keep the gene name and
            # the codon's starting base position
            if variant_codon != germline[start:start+3]:
                ighv_list.append(v_gene)
                base_list.append(start)


output = pd.DataFrame()
output["IGHV Gene"] = ighv_list
output["start base"] = base_list
output

# Determine the region in which the affected codon/mutation occurs

In [None]:
# Load the regional base cutoff values (values come from IgBlast).
vdb = pd.read_csv('./FR1_Based_FR-CDR_Cutoffs_CLEAN_25Oct2024.csv')
vdb.head()  # mid-cell expression: evaluated, but not the cell's displayed output

# Convert to a nested dictionary keyed by the 'Vgene' column:
# {Vgene value: {remaining column name: value}}
cutoffs_by_gene = vdb.set_index('Vgene')
vdb_dict = cutoffs_by_gene.T.to_dict('dict')

In [None]:
# Label every row of `output` with the subregion its start base falls in.
# Exactly one label is appended per row so `location` always lines up with
# `output` when it is assigned as a column below.

# (label, lower-bound key, upper-bound key), checked in this order.
# Cutoff values are shifted by -1 before comparison with the 0-based start base.
REGION_BOUNDS = [
    ('FWR1', 'FR1-From', 'FR1-To'),
    ('CDR1', 'CDR1-From', 'CDR1-To'),
    ('FWR2', 'FR2-From', 'FR2-To'),
    ('CDR2', 'CDR2-From', 'CDR2-To'),
    ('FWR3', 'FR3-From', 'FR3-To'),
]
# Placeholder for rows that cannot be assigned: gene missing from vdb_dict,
# or a base that lies outside every listed range (e.g. before FR1-From,
# in a gap between regions, or when a cutoff value is missing).
NO_REGION = ' - '

location = []

#for each row of the resulting dataframe from previous step
for index, row in output.iterrows():
    # column name must match the one created with `output` ("IGHV Gene")
    ighv_gene = row['IGHV Gene']
    base = row['start base']

    region = None
    current_dict = vdb_dict.get(ighv_gene)

    if current_dict is not None:
        for label, from_key, to_key in REGION_BOUNDS:
            if current_dict[from_key]-1 <= base <= current_dict[to_key]-1:
                region = label
                break

        # anything past the end of FR3 is reported as CDR3/FR4
        if region is None and base > current_dict['FR3-To']-1:
            region = 'CDR3/FR4'

    location.append(region if region is not None else NO_REGION)

output["codon subregion"] = location
output.head()

In [None]:
# Summary: number of affected codons in each region label.
loc_df = pd.DataFrame(location)
region_counts = loc_df[0].value_counts()
print(region_counts)