In [None]:
import os
from io import StringIO

import pandas as pd

In [None]:
%run ../modules/utils.ipynb
%run ../modules/homology_search.ipynb
%run ../modules/ladder_complementation.ipynb
%run ../modules/mass_sum.ipynb

# Control

In [None]:
# Collect every control-sample file in a deterministic (sorted) order.
path = '../samples/total_tRNA/Control/'
dir_list = os.listdir(path)
control_files = sorted(path + f for f in dir_list)

# Load each run and keep only rows with Mass > 23000.
# .copy() gives every run its own independent frame, so the rescaling below does
# not assign into a filtered slice (which raises SettingWithCopyWarning).
dfs_ctrl = [load_data(f) for f in control_files]
dfs_ctrl = [dftmp[dftmp.Mass > 23000].copy() for dftmp in dfs_ctrl]

# Largest RT / Vol observed across all runs; each run is stretched to these maxima.
max_rt = max(dftmp.RT.max() for dftmp in dfs_ctrl)
max_vol = max(dftmp.Vol.max() for dftmp in dfs_ctrl)

dfs_updated = list()
for dftmp in dfs_ctrl:
    # Scale so that this run's own maximum RT / Vol equals the global maximum.
    dftmp['RT'] = dftmp['RT'] * (max_rt / dftmp['RT'].max())
    dftmp['Vol'] = dftmp['Vol'] * (max_vol / dftmp['Vol'].max())
    dfs_updated.append(dftmp)

# Merge the rescaled runs and remove duplicates with the project helper.
df_ctrl = pd.concat(dfs_updated)
df_ctrl = drop_dups(df_ctrl)
df_ctrl.shape

In [None]:
# Plot the merged control data with the plotly_zone helper, passing y='Vol'.
plotly_zone(df_ctrl, y='Vol')

In [None]:
# Read the tRNA sequence table and give its mass columns short aliases.
f = '../statics/total_tRNA_seqs_base.xlsx'
mass_column_aliases = {
    'Monoisotopic Mass (full length)': 'Mass1',
    'Monoisotopic Mass (CC-tailed)': 'Mass2',
    'Monoisotopic Mass (C-tailed)': 'Mass3',
    'AminoAcid & CCA': 'Mass4',
    'AminoAcid & CC': 'Mass5',
}
df_seqbase = pd.read_excel(f).rename(columns=mass_column_aliases)
df_seqbase.head()

In [None]:
# dftmp.iloc[:, 3:9]
# Match the control data against several mass columns of the sequence table.
# For each selected column position, a copy of the table gets that column as
# 'Mass' and is handed to match_dfs together with the control data.
dfms = list()
dftmps = list()
df_sample = df_ctrl  # alias, not a copy: df_sample and df_ctrl are the same object
for i in [3, 4, 5, 7]: # positional column indices into df_seqbase — TODO confirm which mass columns these are
    dftmp = df_seqbase.copy()
    dftmp['Mass'] = dftmp.iloc[:, i]
    # NOTE(review): inplace=True presumably lets match_dfs write its results into
    # one of the frames passed in — confirm which one against the helper.
    dfm = match_dfs(dftmp, df_sample, ppm=50, inplace=True)
    dftmps.append(dftmp)
    dfm = dfm.sort_values('Mass')
    dfms.append(dfm.iloc[:, :-1])  # keep every column of the match result except the last

# Stack the per-column matches, de-duplicate them, and show the three shapes
# (control data, all matches, unique matches) for comparison.
df_dfms = pd.concat(dfms)
dfms_unique = drop_dups(df_dfms)
df_sample.shape, df_dfms.shape, dfms_unique.shape

In [None]:
# Plot the full control set together with the de-duplicated matches (y='Vol'),
# then show both shapes so the number of matched rows can be compared with the total.
plotly_zones(df_ctrl, dfms_unique, y='Vol')
df_ctrl.shape, dfms_unique.shape

In [None]:
# Static figure of the control data vs. the matched subset, saved as an SVG.
# NOTE(review): `plt` is not imported in the visible cells — presumably it is
# provided by one of the %run'd module notebooks; confirm.
figsize=(5, 3)
fig = plt.figure(figsize=figsize)

# Pass copies to the plotting helper so df_ctrl / dfms_unique themselves are not handed over.
dfa, dfb = df_ctrl.copy(), dfms_unique.copy()
vol = dfa.Vol.sum()  # only referenced by the commented-out relative-intensity variant below
# dfa['Relative Intensity'] = dfa.Vol/vol
# dfb['Relative Intensity'] = dfb.Vol/vol
plot_zones(dfa, dfb, y='Vol', ylabel='Intensity', figsize=figsize)
# plot_zones(dfa, dfb, y='Relative Intensity', ylabel='Relative Intensity', figsize=figsize)

# Tighten the layout of `fig`, then save pyplot's current figure with a transparent background.
fig.tight_layout()
svg_fpath = '../outputs/matched_ctrl.svg'
plt.savefig(svg_fpath, dpi=300, transparent=True)

In [None]:
# Load the three isoform-specific sheets ('CCA', 'CC', 'C') of the sequence workbook.
f = '../statics/total_tRNA_seqs_base.xlsx'
df_seqbase_cca, df_seqbase_cc, df_seqbase_c = (
    pd.read_excel(f, sheet_name=sheet) for sheet in ('CCA', 'CC', 'C')
)

In [None]:
# Annotate df_ctrl with the tRNA isoform it matches, for a series of mass shifts.
# The libraries are tried in the order C -> CCA -> CC for every shift, so a row
# flagged by a later match overwrites the 'Isoform' label set by an earlier one.
isoform_libraries = [('C', df_seqbase_c), ('CCA', df_seqbase_cca), ('CC', df_seqbase_cc)]

for s in [2*M, M, Na, K, -2*M, -1*M, 0]:
    shift_matches = list()
    for isoform, df_seqs in isoform_libraries:
        # NOTE(review): with inplace=True, match_dfs_v2 presumably refreshes the
        # boolean df_ctrl.Match column for this library/shift — confirm in the helper.
        dfm_iso = match_dfs_v2(df_seqs, df_ctrl, shift=s, inplace=True, ppm=30, copy_cols=['tRNA'])
        idx = df_ctrl[df_ctrl.Match].index
        df_ctrl.loc[idx, 'Isoform'] = isoform
        shift_matches.append(dfm_iso)
    # Keep the original per-library names (C, CCA, CC) for the current shift.
    dfm1, dfm2, dfm3 = shift_matches

# Shapes of the matches from the last shift. This expression used to sit inside
# the loop body, where it was evaluated and discarded without being displayed.
dfm1.shape, dfm2.shape, dfm3.shape

In [None]:
# Export the annotated control table, sorted by tRNA and numbered from 1.
# 'Isoform' was previously listed twice, which wrote a duplicated column to the sheet.
export_cols = ['Mass', 'RT', 'Vol', 'MatchedMass', 'tRNA', 'PPM', 'Isoform', 'Sft']

# sort_values returns a new frame, so df_ctrl itself is left untouched.
dft = df_ctrl.sort_values('tRNA').reset_index(drop=True)
dft.index += 1  # 1-based row numbers in the exported sheet
dft[export_cols].to_excel('../outputs/matched_ctrl.xlsx')

In [None]:
# Keep an independent copy of the control rows whose MatchedMass is positive.
has_matched_mass = df_ctrl.MatchedMass > 0
df_matched = df_ctrl.loc[has_matched_mass].copy()
df_matched.head()

In [None]:

# Run the homology search on the matched control rows and plot the result.
# NOTE(review): `dft` is assigned here but not read anywhere else in this cell.
dft = dfms_unique.copy()
bcr = homology_search(df_matched, names=['Me'], ppm=10)

figsize=(5, 3)
fig = plt.figure(figsize=figsize)

# Static plot from the first two elements of the search result, followed by the
# plotly variant, which receives the whole result unpacked as positional arguments.
plot_basecalling(bcr[0], bcr[1], y='Vol', annotate=False, ylabel='Intensity', plt=plt, figsize=figsize)
plotly_basecalling(*bcr, y='Vol')
# Tighten the layout of `fig`, then save pyplot's current figure with a transparent background.
fig.tight_layout()
svg_fpath = '../outputs/homology.svg'
plt.savefig(svg_fpath, dpi=300, transparent=True)

# Acid-Deg

In [None]:
# Load every acid-degradation sample file (sorted by path) and stack them into one frame.
path = '../samples/total_tRNA/AcidDeg/'
dir_list = os.listdir(path)
ad_files = sorted(path + f for f in dir_list)

dfs = list(map(load_data, ad_files))
df_ad = pd.concat(dfs)
df_ad.shape

In [None]:
def load_tRNA_library(f_lib='../statics/total_tRNA_mass_ladders.xlsx'):
    """Read the tRNA mass-ladder library workbook into a DataFrame.

    f_lib: path to the Excel file. The first row of the sheet is skipped
    (``skiprows=1``) before the header is read.
    """
    return pd.read_excel(f_lib, skiprows=1)


def match_library_extended_ext(df, df_lib, idx, ori=5, ppm=30, col_shifts=None):
    """Match ``df`` against every mass ladder stored in one row of the library.

    For the row ``df_lib.loc[idx]``, each of the columns '5prim_df', '3prim_df',
    '3prim_CCtailed_df' and '3prim_Ctailed_df' is parsed as a tab-separated table
    and matched against ``df`` with ``match_dfs``: once unshifted, then once for
    every extra shift configured for that column.

    Parameters
    ----------
    df : DataFrame
        Data passed to ``match_dfs`` as its second argument.
    df_lib : DataFrame
        Library whose ladder columns hold tab-separated text.
    idx : index label
        Label of the library row to use.
    ori : int, optional
        Unused; kept so existing callers keep working.
    ppm : number, optional
        Tolerance forwarded to ``match_dfs``.
    col_shifts : dict, optional
        Extra shifts per ladder column, e.g.
        ``{<idx>: [(K, 'K'), (Na, 'Na'), (H2O, 'Dehydration')]}``, where ``<idx>``
        is the position of the column in
        ['5prim_df', '3prim_df', '3prim_CCtailed_df', '3prim_Ctailed_df'].
        ``None`` (the default) or an empty dict means no extra shifts.

    Returns
    -------
    list
        One inner list per ladder column, in the order above. Each inner list
        holds ``(match_result, label)`` tuples; the first one is always the
        unshifted match, labelled 'Regular'.
    """
    # A None default avoids sharing one mutable dict between calls.
    col_shifts = col_shifts or {}
    row = df_lib.loc[idx]
    cols = ['5prim_df', '3prim_df', '3prim_CCtailed_df', '3prim_Ctailed_df']

    dfms = list()
    for col_idx, col in enumerate(cols):
        df_ladder = pd.read_csv(StringIO(row[col]), sep="\t")

        # Unshifted match first, then the optional (shift, label) extras for this column.
        labelled_shifts = [(0, 'Regular')]
        ext_shifts = col_shifts.get(col_idx)
        if ext_shifts:
            labelled_shifts.extend((entry[0], entry[1]) for entry in ext_shifts)

        dfm_grp = [
            (match_dfs(df_ladder, df, ppm=ppm, shift=shift), label)
            for shift, label in labelled_shifts
        ]
        dfms.append(dfm_grp)
    return dfms

In [None]:
from io import StringIO

# Per-tRNA ladder coverage: for every library row, match the acid-degradation data
# against its ladders and report which positions were hit.
# NOTE(review): `rates`, `rows`, `depth_sites` and `pie_rates` are assigned but not
# read in the lines shown here.
rates = list()
df_lib = load_tRNA_library()
for idx, row in df_lib.iterrows():
    row = df_lib.loc[idx]
    # Positional slice of the row (entries 1..92), dropping '-' placeholders.
    data_row = [c for c in row[1:93] if c != '-']
    
    # The first remaining entry is the tRNA type; the rest is kept as data_row.
    tRNA_type, *data_row = data_row
    rows = [data_row]
    
    # One group per ladder column; each group is a list of (match frame, label) pairs.
    dfm_grps = match_library_extended_ext(df_ad, df_lib, idx) 
    combined_sites = list()
    for grp_id, dfm_grp in enumerate(dfm_grps): # [5´, 3´, 3´-CC, 3´-C]
        for dfm_idx, (dfm, dfm_label) in enumerate(dfm_grp):
            dfm_tmp = dfm.copy()
            dfm_tmp = drop_dups(dfm_tmp)
            # For the three 3´ groups, replace Mass by (reference mass of this row - Mass).
            # NOTE(review): presumably this maps 3´ ladder masses onto the 5´ scale — confirm.
            if grp_id == 1:
                dfm_tmp.Mass = row['MassCCA'] - dfm_tmp.Mass
            if grp_id == 2:
                dfm_tmp.Mass = row['MassCC'] - dfm_tmp.Mass
            if grp_id == 3:
                dfm_tmp.Mass = row['MassC'] - dfm_tmp.Mass
            # Integer-divide the mass by 320 to get a site number.
            # NOTE(review): 320 presumably approximates one nucleotide's mass — confirm.
            sites = dfm_tmp.Mass//320
            combined_sites.extend(sites)
            # Unconditional break: only the first entry of each group is processed.
            break
    
    # Keep the raw list (with repeats) before reducing to distinct, positive site numbers.
    depth_sites = combined_sites.copy()
    combined_sites = set(combined_sites)
    combined_sites = [i for i in combined_sites if i > 0]
    # Number of distinct positive sites divided by the number of entries in data_row.
    combined_rate = len(combined_sites)/len(data_row)
    
    #  Ladder Coverage
    # Row count of the first match frame in each group, relative to len(data_row).
    ladder_hits = [len(dfm_grp[0][0]) for dfm_grp in dfm_grps]
    ladder_hits_rate = [h/len(data_row) for h in ladder_hits]
    # [1 - rate, rate] pairs, one per ladder group.
    pie_rates = [[1-r, r] for r in ladder_hits_rate]
    print(idx+1, tRNA_type, f"{combined_rate:.2f}", combined_sites)