In [None]:
import pandas as pd
import numpy as np

In [None]:
def preprocess_ThermoFisher_data(mfe_src):
    """Clean an Excel export and save the result next to the source file.

    Reads the workbook at ``mfe_src``, coerces the 'Vol' column to numeric
    (entries that cannot be parsed become NaN), drops every row whose 'Mass'
    or 'Vol' is missing, and writes the remaining rows to
    ``<mfe_src without its extension>_nona.xlsx``. The frame shape is printed
    before and after the clean-up.

    Parameters
    ----------
    mfe_src : str or os.PathLike
        Path of the Excel file to read.

    Returns
    -------
    None
        The cleaned frame is only written to disk.
    """
    import os  # local import: keeps this cell runnable on its own

    df = pd.read_excel(mfe_src)
    print('before process', df.shape)
    # errors='coerce' turns invalid 'Vol' entries into NaN so dropna removes them.
    df['Vol'] = pd.to_numeric(df.Vol, downcast='integer', errors='coerce')
    df = df.dropna(subset=['Mass', 'Vol'])
    print('after process', df.shape)
    # Strip only the final extension. Splitting on the first '.' would mangle
    # paths such as './data/run.xlsx' or 'sample.v2.xlsx'.
    mfe_dst = '{}_nona.xlsx'.format(os.path.splitext(mfe_src)[0])
    df.to_excel(mfe_dst)

In [None]:
def base_calling_random(df, silence=False, homology=False, acid_labile=False, methyl=False, ppm=10):
    """Find pairs of masses whose difference equals a reference mass.

    For every entry of the active reference table (``base_dict``), each pair
    ``(mass_x[i], mass_y[j])`` is tested with
    ``|mass_x[i] - mass_y[j] - base| * 1e6 / (mass_y[j] + base) < ppm``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a 'Mass' column, plus a boolean 'isAD' column when
        ``acid_labile`` is set. The frame is sorted by index first.
    silence : bool
        When False, print the shapes of the two mass vectors.
    homology, acid_labile, methyl : bool
        Select an alternative reference table. If several are set, the last
        one in the order homology -> acid_labile -> methyl wins.
        ``acid_labile`` also compares rows with ``isAD == False`` (x) against
        rows with ``isAD == True`` (y) instead of all rows against all rows.
    ppm : float
        Matching tolerance in parts per million (default 10).

    Returns
    -------
    (pandas.DataFrame, list)
        ``df_base_calling``: the rows of ``df`` taking part in at least one
        hit, in ``df`` order. ``mass_pairs``: ``(mass_x, mass_y, name)``
        tuples, one per hit, grouped by reference entry in table order.
    """
    df = df.sort_index()
    if acid_labile:
        mass_x = np.array(df[df.isAD == False].Mass)
        mass_y = np.array(df[df.isAD == True].Mass)
    else:
        mass_x = np.array(df.Mass)
        mass_y = np.array(df.Mass)
    if not silence:
        print("mass_x {} mass_y {}".format(mass_x.shape, mass_y.shape))

    # Alternative reference tables kept for reference:
#     base_dict = {'C': 305.0413, 'A': 329.0525, 'G': 345.0474, 'U': 306.0253, 'g': 359.0631, 'D': 308.041, 'c': 319.0569, 'T': 320.041, 'G^': 373.0787, 'a': 343.0682, 'y': 212.0086, 'Y': 570.1475, 'P': 557.2251, 'x': 688.1156, 'z': 625.0844, 'I': 330.03654, 'O': 344.052}
#     base_dict = {'g': 359.0631, 'T': 320.041, 'mnm5s2U': 365.04466, 'X': 449.08299, 's4U': 322.00246, 'ms2io6A': 459.09776} #387.1273+61.9557
    base_dict = {'C': 305.0413, 'A': 329.0525, 'G': 345.0474, 'U': 306.0253, 'g': 359.0631, 'D': 308.041, 'c': 319.0569, 'T': 320.041, 'a': 343.0682, 'y': 212.0086, 'mnm5s2U': 365.04466, 'X': 449.08299, 's4U': 322.00246, 'ms2io6A': 459.09776}
    if homology:
        # NOTE(review): the '2Me (14.01)' label carries mass 28.0212; the label
        # is left as-is because it is emitted in the returned pairs.
        base_dict = {'C (305.04)': 305.0413, 'A (329.05)': 329.0525, 'endA (249.09)': 249.0862, 'endC (225.08)': 225.075, 'Me (14.01)': 14.0106, '2Me (14.01)': 28.0212} #, 'Udiff': 43
#         base_dict = {'C': 305.0413, 'A': 329.0525, 'endA': 249.0862, 'endC': 225.075, 'Ox': 16.0, 'Methyl': 14.0106, '2Methyl': 28.0212} #, 'Udiff': 43
#         base_dict = {'Y (358.16)': 358.1599, 'C (305.04)': 305.0413, 'A (329.05)': 329.0525, 'endA (249.09)': 249.0862, 'endC (225.08)': 225.075, 'Methyl (14.01)': 14.0106}
    if acid_labile:
#         base_dict = {'Y': 358.1599, 'm6t6A': 276.09568, 'Gr(p)': 345.04602, 'cnm5U': 133.0262, 'I': 118.02654, 'g6A': 218.05381, 'o2yW': 390.12737, 'ms2t6A': 308.06775, 'acp3U/cmnm5Um': 195.06298, 'mcmo5U': 182.03135}
        base_dict = {'Y': 358.1599}
    if methyl:
        base_dict = {'Me': 14.0106} #, '2Me': 28.0212

    pairs = []    # (row position in mass_x, column position in mass_y, name)
    idxs = set()  # every position that takes part in a hit
    for name, base_mass in base_dict.items():
        ppm_matrix = np.abs((mass_x[:, np.newaxis] - mass_y - base_mass) * 1E6 / (mass_y + base_mass))
        # argwhere lists the (row, col) hits in row-major order. It does not
        # depend on DataFrame.stack() dropping NaN, which the newer pandas
        # stack implementation no longer does.
        hits = np.argwhere(ppm_matrix < ppm)
        pairs.extend((int(i), int(j), name) for i, j in hits)
        idxs.update(int(pos) for pos in hits.ravel())

    # mass_x / mass_y already hold the masses the positions refer to, so there
    # is no need to re-filter the frame for every pair.
    mass_pairs = [(mass_x[i], mass_y[j], name) for i, j, name in pairs]
    mass_pairs = [mp for mp in mass_pairs if mp[0] != mp[1]] # remove item which contains duplicated values

    if acid_labile:
        # Positions refer to two different subsets here, so rows are selected
        # by mass value rather than by position.
        masses = {mass for mp in mass_pairs for mass in mp[:2]}
        df_base_calling = df[df.Mass.isin(list(masses))]
    else:
        # Sorted so the row order is deterministic and follows df.
        df_base_calling = df.iloc[sorted(idxs)]
    return df_base_calling, mass_pairs

In [1]:
def zone_selection(df, on_selection=None, on_click=None):
    """Build an interactive Mass-vs-RT scatter widget with optional callbacks.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide 'Mass' (x axis) and 'RT' (y axis) columns.
    on_selection : callable, optional
        Registered directly as the trace's selection callback.
    on_click : callable, optional
        Called as ``on_click(trace, points, selector)`` after the clicked
        points have been recoloured.

    Returns
    -------
    plotly.graph_objects.FigureWidget
        The widget holding a single markers trace.
    """
    import plotly.graph_objects as go

    trace = go.Scatter(x=df.Mass, y=df.RT, mode='markers')
    f = go.FigureWidget(trace)
    f.update_layout(autosize=False, width=1024, height=700, paper_bgcolor="LightSteelBlue",
                    margin=dict(l=20, r=20, b=20, t=20, pad=4))
    scatter = f.data[0]
    # One explicit colour per point so single points can be recoloured later.
    scatter.marker.color = ['#6371f2'] * df.shape[0]

    def onclick_callback(trace, points, selector):
        # Recolour every clicked point, then push the new colours to the
        # widget once per click instead of once per point.
        if points.point_inds:
            c = list(scatter.marker.color)
            for i in points.point_inds:
                c[i] = '#87a14a'
            with f.batch_update():
                scatter.marker.color = c

        on_click(trace, points, selector)

    if on_selection:
        scatter.on_selection(on_selection)
    if on_click:
        scatter.on_click(onclick_callback)
    return f

In [None]:
def match_dfs(df_src, df_dst, ppm=10):
    """Return the rows of ``df_dst`` whose Mass matches any Mass in ``df_src``.

    A destination mass ``m_dst`` matches a source mass ``m_src`` when
    ``|m_dst - m_src| * 1e6 / m_src < ppm``.

    Parameters
    ----------
    df_src : pandas.DataFrame
        Frame providing the reference 'Mass' values. It is not modified.
    df_dst : pandas.DataFrame
        Frame with a 'Mass' column to search in.
    ppm : float
        Matching tolerance in parts per million (default 10).

    Returns
    -------
    pandas.DataFrame
        A copy of the matching ``df_dst`` rows, in their original order and
        with their original columns.
    """
    if df_src.empty or df_dst.empty:
        return df_dst.iloc[0:0].copy()

    dst_mass = np.asarray(df_dst.Mass, dtype=float)
    # A positional mask keeps only the rows that really matched, even when
    # df_dst carries duplicate index labels.
    matched = np.zeros(dst_mass.shape[0], dtype=bool)
    # A zero or NaN source mass yields inf/NaN, which never compares below ppm.
    with np.errstate(divide='ignore', invalid='ignore'):
        for mass in df_src.Mass:
            # The tolerance is applied directly. A fixed +/-1 Da pre-filter
            # would silently cap it for large masses or large ppm values.
            matched |= np.abs(1E6 * (dst_mass - mass)) / mass < ppm
    return df_dst[matched].copy()