In [None]:
import pandas as pd
import numpy as np

In [1]:
def base_calling_random(df, silence=False, homology=False, acid_labile=False, methyl=False, base_only=False):
    """Find pairs of dots whose mass difference equals a known mass within 10 ppm.

    Every mass in ``mass_x`` is compared against every mass in ``mass_y``; a
    pair ``(x, y)`` is kept when ``x - y`` equals one of the reference masses
    in ``base_dict`` within ``PPM`` parts-per-million of ``y + reference``.

    :param df: DataFrame with a ``Mass`` column (and an ``isAD`` column when
        ``acid_labile`` is set). It is sorted by index before use.
    :param silence: when False, print the shapes of the two mass arrays.
    :param homology: use the homology reference set instead of the default one.
    :param acid_labile: compare rows with ``isAD == False`` (x) against rows
        with ``isAD == True`` (y) using the acid-labile reference set.
    :param methyl: use only the methyl mass as reference.
    :param base_only: use only the four canonical bases as reference.
        The flags are applied in the order homology, acid_labile, methyl,
        base_only; the last one set wins.
    :return: ``(df_base_calling, mass_pairs)`` where ``mass_pairs`` is a list
        of ``(mass_x, mass_y, name)`` tuples and ``df_base_calling`` holds the
        rows of ``df`` taking part in at least one pair, in ``df`` order.
    """
    df = df.sort_index()
    if acid_labile:
        mass_x = np.array(df[df.isAD == False].Mass)
        mass_y = np.array(df[df.isAD == True].Mass)
    else:
        mass_x = mass_y = np.array(df.Mass)
    if not silence:
        print("mass_x {} mass_y {}".format(mass_x.shape, mass_y.shape))

    # Default reference set. Candidates that are currently disabled:
    #   'y': 212.0086, 'mnm5s2U': 365.04466, 'X': 449.08299,
    #   's4U': 322.00246, 'ms2io6A': 459.09776, 'I': 330.03654, 'O': 344.052,
    #   'Y': 570.1475, 'x': 688.1156, 'z': 625.0844, 'G^': 373.0787
    base_dict = {'C': 305.0413, 'A': 329.0525, 'G': 345.0474, 'U': 306.0253,
                 'mG': 359.0631, 'D': 308.041, 'mC': 319.0569,
                 'mU': 320.041, 'mA': 343.0682, 'm22C': 333.0625,
                 'mcm5U': 378.0464, 'Ψ*': 557.2251}
    if homology:
        # Disabled candidate: 'Udiff': 43, 'Y': 358.1599
        base_dict = {'C': 305.0413, 'A': 329.0525, 'Ai': 249.0862, 'Ci': 225.075,
                     'Ox': 16.0, 'Methyl': 14.0106, '2Methyl': 28.0212}
    if acid_labile:
        # Disabled candidates: 'm6t6A': 276.09568, 'Gr(p)': 345.04602,
        #   'cnm5U': 133.0262, 'I': 118.02654, 'g6A': 218.05381,
        #   'o2yW': 390.12737, 'ms2t6A': 308.06775,
        #   'acp3U/cmnm5Um': 195.06298, 'mcmo5U': 182.03135
        base_dict = {'Y': 358.1599}
    if methyl:
        base_dict = {'Me': 14.0106}  # disabled candidate: '2Me': 28.0212
    if base_only:
        base_dict = {'C': 305.0413, 'A': 329.0525, 'G': 345.0474, 'U': 306.0253}

    PPM = 10
    pairs = []              # (position in mass_x, position in mass_y, name)
    hit_positions = set()   # every position that takes part in a pair
    for name, base_mass in base_dict.items():
        ppm_matrix = np.abs((mass_x[:, np.newaxis] - mass_y - base_mass) * 1E6 / (mass_y + base_mass))
        # Row-major positions of all cells below the tolerance; NaN cells
        # compare False and are therefore ignored.
        rows, cols = np.nonzero(ppm_matrix < PPM)
        rows, cols = rows.tolist(), cols.tolist()
        pairs.extend((r, c, name) for r, c in zip(rows, cols))
        hit_positions.update(rows)
        hit_positions.update(cols)

    # mass_x / mass_y already hold the (possibly isAD-filtered) masses in the
    # same order the positions refer to, so no per-pair re-filtering is needed.
    mass_pairs = [(mass_x[r], mass_y[c], name) for r, c, name in pairs]
    mass_pairs = [mp for mp in mass_pairs if mp[0] != mp[1]]  # drop pairs of identical masses

    if acid_labile:
        # Positions refer to the two isAD subsets, so select rows by mass value.
        masses = {m for mp in mass_pairs for m in mp[:2]}
        df_base_calling = df[df.Mass.isin(list(masses))]
    else:
        # Sorted positions keep the rows in df order and make the result deterministic.
        df_base_calling = df.iloc[sorted(hit_positions)]
    return df_base_calling, mass_pairs

In [1]:
def basecalling_groups(df, homology=False):
    """Run the homology search on ``df`` and group the matched dots.

    The mass pairs returned by ``base_calling_random`` are used as edges of a
    graph whose nodes are mass values; every connected component becomes one
    group. (Per the original note: with ~25k area dots each group represents
    a tRNA species and its isoforms.)

    NOTE(review): ``nx`` is not imported in the visible part of this notebook;
    it is presumably ``import networkx as nx`` from another cell — confirm.

    :param df: DataFrame with a ``Mass`` column.
    :param homology: forwarded to ``base_calling_random``.
    :return: list of ``(df_nodes, edges)`` tuples, one per component, where
        ``df_nodes`` are the rows of ``df`` whose Mass is a node of the
        component and ``edges`` is a list of ``(mass_a, mass_b, name)`` sorted
        by ``mass_a``.
    """
    def create_graph(bcr):
        # Undirected graph; components ordered by size, largest first.
        G = nx.Graph()
        for x in bcr[1]:
            G.add_edge(x[0], x[1], name=x[2])

        subgraphs = [G.subgraph(c) for c in nx.connected_components(G)]
        subgraphs.sort(key=len, reverse=True)
        return subgraphs

    def create_graph3(bcr):
        # Same as create_graph, followed by a (stable) sort on the number of
        # truthy nodes. list.sort() only accepts keyword arguments, so the
        # key function must be passed as key=.
        G = nx.Graph()
        for x in bcr[1]:
            G.add_edge(x[0], x[1], name=x[2])

        subgraphs = [G.subgraph(c) for c in nx.connected_components(G)]
        subgraphs.sort(key=len, reverse=True)
        subgraphs.sort(key=lambda subgraph: len([node for node in subgraph if node]),
                       reverse=True)
        return subgraphs

    def create_graph2(bcr):
        # Directed variant (edges point from the smaller to the bigger mass);
        # components ordered by the number of nodes that have both an
        # incoming and an outgoing edge. connected_components() is not
        # defined for directed graphs, hence weakly_connected_components().
        G = nx.DiGraph()
        for x in bcr[1]:
            small = min(x[:2])
            big = max(x[:2])
            G.add_edge(small, big, name=x[2])

        subgraphs = [G.subgraph(c) for c in nx.weakly_connected_components(G)]
        subgraphs.sort(key=lambda subgraph: len([node for node in subgraph
                                                 if subgraph.in_degree(node) > 0
                                                 and subgraph.out_degree(node) > 0]),
                       reverse=True)
        return subgraphs

    bcr = base_calling_random(df, homology=homology)
    subgraphs = create_graph3(bcr)

    node_edge_pairs = list()
    for sg in subgraphs:
        edges = [(a, b, attrs.get('name')) for a, b, attrs in sg.edges(data=True)]
        edges = sorted(edges, key=lambda edge: edge[0])
        df_nodes = df[df.Mass.isin(list(sg.nodes))]
        node_edge_pairs.append((df_nodes, edges))

    return node_edge_pairs

In [1]:
def zone_selection(df, on_selection=None, on_click=None):
    """Build an interactive Mass/RT scatter plot of ``df``.

    :param df: DataFrame with ``Mass`` (x axis) and ``RT`` (y axis) columns.
    :param on_selection: optional callback registered on the scatter trace via
        ``on_selection``.
    :param on_click: optional callback ``(trace, points, selector)``; when
        given, clicked points are recoloured before the callback is invoked.
    :return: the plotly ``FigureWidget`` holding the scatter trace.
    """
    import plotly.graph_objects as go

    trace = go.Scatter(x=df.Mass, y=df.RT, mode='markers')
    f = go.FigureWidget(trace)
    scatter = f.data[0]
    # One explicit colour per point so single points can be recoloured later.
    scatter.marker.color = ['#6371f2'] * df.shape[0]

    def onclick_callback(trace, points, selector):
        if points.point_inds:
            c = list(scatter.marker.color)
            for i in points.point_inds:
                c[i] = '#87a14a'
            # Push the new colours once, after all clicked points are marked.
            with f.batch_update():
                scatter.marker.color = c

        on_click(trace, points, selector)

    if on_selection:
        scatter.on_selection(on_selection)
    if on_click:
        scatter.on_click(onclick_callback)
    return f

In [3]:
def match_dfs(df_src, df_dst, ppm=10, shift=0):
    """Find the rows of df_dst whose Mass also occurs in df_src.

    A row of ``df_dst`` matches when its Mass lies strictly within 1 Da of
    some ``df_src`` Mass (after adding ``shift``) and the deviation, relative
    to that shifted source mass, is below ``ppm`` parts-per-million.

    Neither input frame is modified.

    :param df_src: DataFrame with a ``Mass`` column providing the reference masses.
    :param df_dst: DataFrame with a ``Mass`` column to search in.
    :param ppm: tolerance in parts-per-million.
    :param shift: value added to every ``df_src`` mass before matching.
    :return: The subset of df_dst (a copy) whose index labels were matched.
    """
    dst_mass = df_dst.Mass
    matched = np.zeros(len(df_dst), dtype=bool)
    for mass in df_src.Mass.to_numpy() + shift:
        in_window = (dst_mass < mass + 1) & (dst_mass > mass - 1)
        in_ppm = abs(1E6 * (dst_mass - mass)) / mass < ppm
        matched |= (in_window & in_ppm).to_numpy()

    # Select by label (not by position) so that rows sharing an index label
    # with a matched row are returned as well, as before.
    matched_labels = df_dst.index[matched]
    return df_dst[df_dst.index.isin(matched_labels)].copy()

def peer_dfs(df_src, df_dst, ppm=10, shift=0):
    """Match two frames against each other across a mass shift.

    :return: ``(src_side, dst_side)`` — the rows of df_src matching a df_dst
        mass moved by ``-shift``, and the rows of df_dst matching a df_src
        mass moved by ``+shift`` (both as returned by ``match_dfs``).
    """
    return (
        match_dfs(df_dst, df_src, ppm, -1 * shift),
        match_dfs(df_src, df_dst, ppm, shift),
    )
    
def comm_dfs(df_src, df_dst, ppm=10, shift=0):
    """Find two subsets contains common mass values.

    Two rows are "common" when ``df_src.Mass + shift`` equals ``df_dst.Mass``
    within ``ppm``. The lookup from the df_dst side therefore has to apply
    the opposite shift (``-shift``), the same convention ``peer_dfs`` uses;
    with the default ``shift=0`` both directions are identical.

    :return: Subsets from df_src and df_dst respectively.
    """
    dfm_comm_dst = match_dfs(df_src, df_dst, ppm, shift)
    # Reverse direction: df_src rows that equal a df_dst mass minus the shift.
    dfm_comm_src = match_dfs(df_dst, df_src, ppm, -shift)
    return dfm_comm_src, dfm_comm_dst

def diff_dfs(df_src, df_dst, ppm=10, shift=0):
    """Return what is left of each frame once the common masses are removed.

    The common rows are determined by ``comm_dfs`` and dropped by index label.

    :return: ``(only_in_src, only_in_dst)``.
    """
    shared_src, shared_dst = comm_dfs(df_src, df_dst, ppm, shift)
    return df_src.drop(index=shared_src.index), df_dst.drop(index=shared_dst.index)

In [5]:
def remove_salts(df_sample, salts=[21.9819, 37.9558, 34.9694, 27.9949, 56.92]):
    """Remove salt adducts from df_sample.

    A row counts as a salt adduct when its Mass equals the Mass of some row
    of the same frame plus one of the ``salts`` shifts (matching is delegated
    to ``match_dfs`` with its default tolerance).

    Rows are tracked by index label rather than de-duplicated by value, so
    two distinct rows with identical values are both removed, and an empty
    ``salts`` list simply removes nothing.

    :param df_sample: DataFrame with a ``Mass`` column; not modified.
    :param salts: iterable of mass shifts to test.
    :return: ``(df, df_salts)`` — the rows that are kept and the rows
        identified as adducts, each in the original row order. Two empty
        DataFrames when ``df_sample`` is empty.
    """
    if df_sample.empty:
        return pd.DataFrame(), pd.DataFrame()

    df = df_sample.copy()
    is_salt = np.zeros(len(df), dtype=bool)
    for salt in salts:
        df_salt = match_dfs(df, df, shift=salt)
        is_salt |= df.index.isin(df_salt.index)
    return df[~is_salt].copy(), df[is_salt].copy()

def combinated_salts(limit=35, salts=[21.9819, 37.9558]):
    """Enumerate the total masses of all admissible salt combinations.

    Every salt may occur 0..limit times. A combination is admissible when at
    most two different salts are present and the total count does not exceed
    ``limit``. The all-zero combination (mass 0) is part of the result.

    :return: list with the summed mass of each admissible combination, in
        ``itertools.product`` order.
    """
    from itertools import product

    def is_admissible(counts):
        salts_present = sum(1 for count in counts if count > 0)
        return salts_present <= 2 and sum(counts) <= limit

    return [
        np.dot(counts, salts)
        for counts in product(range(limit + 1), repeat=len(salts))
        if is_admissible(counts)
    ]


def combinated_salts_v2(limit=35, salts=[21.9819, 37.9558]):
    """Enumerate admissible salt combinations together with their counts.

    Every salt may occur 0..limit times. A combination is admissible when at
    most three different salts are present and the total count does not
    exceed ``limit``. The all-zero combination is part of the result.

    :return: list of ``(mass, counts)`` tuples in ``itertools.product`` order,
        where ``counts`` is the tuple of per-salt multiplicities and ``mass``
        their weighted sum.
    """
    from itertools import product

    merge_salts = []
    for counts in product(range(limit + 1), repeat=len(salts)):
        salts_present = sum(1 for count in counts if count > 0)
        if salts_present > 3 or sum(counts) > limit:
            continue
        merge_salts.append((np.dot(counts, salts), counts))
    return merge_salts
    
def remove_combinated_salts(df_sample, limit=35, salts=[21.9819, 37.9558]):
    """Remove salt adducts, including combinations of several salts.

    The candidate shifts come from ``combinated_salts``. That list contains
    the all-zero combination (shift 0); with a zero shift every row matches
    itself and the whole table would be removed, so only strictly positive
    shifts are passed on to ``remove_salts``.

    :return: ``(df, df_salts)`` as returned by ``remove_salts``.
    """
    merge_salts = [shift for shift in combinated_salts(limit, salts) if shift > 0]
    if not merge_salts:
        # Nothing to test against: keep every row, report no salts.
        return df_sample.copy(), df_sample.iloc[0:0].copy()
    return remove_salts(df_sample, merge_salts)

# Na 21.9819, K 37.9558, Cl 35.9694, FA 46.0060, CO 27.9949, H2O 18.0106
# NaCl 57.9513, KCl 73.9252  , 57.9513, 73.9252
def detect_combined_salts(df, mass, limit=10, ppm=10, salts=[], amino=0):
    """Detect salt adducts (single or combined) of one target mass in df.

    For every combination produced by ``combinated_salts_v2`` the rows of
    ``df`` whose Mass equals ``mass + combination`` (within ``ppm``) are
    collected. The all-zero combination is included, so rows matching the
    target mass itself are reported with an empty composition.

    Side effect: ``df`` is annotated in place with two columns named after the
    integer part of the target mass, ``'<m>'`` (ppm deviation) and
    ``'<m>_composition'`` (e.g. ``'2Na 1K '``).

    :param df: DataFrame with a ``Mass`` column to search in.
    :param mass: target mass.
    :param limit: maximum total number of salt units per combination.
    :param ppm: matching tolerance in parts-per-million.
    :param salts: salt masses to combine; defaults to Na and K when empty.
        Masses without a known name are labelled with their value.
    :param amino: when > 0, the annotation columns are named after
        ``int(mass - amino + 18.0106)`` instead of ``int(mass)``.
    :return: DataFrame of the matching rows with extra ``AdductsMass`` and
        ``composit`` columns; an empty DataFrame when ``df`` is empty or
        nothing matches.
    """
    if df.empty:
        return pd.DataFrame()

    if not salts:
        salts = [21.9819, 37.9558]
    # Names are resolved per mass so that caller-supplied salts work too.
    known_names = {21.9819: 'Na', 37.9558: 'K', 57.9513: 'NaCl', 73.9252: 'KCl'}
    salt_names = [known_names.get(round(s, 4), '({})'.format(s)) for s in salts]

    merge_salts_v2 = combinated_salts_v2(limit, salts)
    merge_salts_v2.sort(key=lambda i: i[0], reverse=False)

    # Single-row frame holding the target mass; match_dfs only reads Mass.
    df_target = pd.DataFrame({'Mass': [mass]}, index=[1])

    df_salts = list()
    for salt, composit in merge_salts_v2:
        df_salt = match_dfs(df_target, df, shift=salt, ppm=ppm)
        if df_salt.empty:
            continue
        df_salt['AdductsMass'] = salt
        df_salt['composit'] = ''.join(
            f'{i}{salt_names[idx]} ' for idx, i in enumerate(composit) if i > 0)
        df_salts.append(df_salt)
    if not df_salts:
        print('df_salts empty')
        return pd.DataFrame(columns=['Mass', 'RT', 'Vol'])
    df_salts = pd.concat(df_salts).drop_duplicates()

    if amino > 0:
        rounded_mass = int(mass - amino + 18.0106)
    else:
        rounded_mass = int(mass)
    col = '{}'.format(rounded_mass)
    col_composit = '{}_composition'.format(rounded_mass)
    # A row matched by several combinations appears more than once; annotate
    # it with its first (smallest-adduct) match so the label-aligned
    # assignment never sees duplicate labels.
    annot = df_salts[~df_salts.index.duplicated(keep='first')]
    df.loc[annot.index, col] = abs(1E6 * (annot.Mass - annot.AdductsMass - mass)) / mass
    df.loc[annot.index, col_composit] = annot.composit
    return df_salts

In [None]:
def local_top(df_src, win=320, step=100, top=3):
    """Pick the highest-volume dots of every sliding mass window.

    Thin wrapper around ``local_top_range`` that selects ranks ``0..top-1``.

    Param win: Window size, default 320 Da.
    Param step: step size, default 100 Da.
    Param top: the number of dots to keep per window, ranked by Vol.
    """
    rank_range = (0, top)
    return local_top_range(df_src, win, step, range=rank_range)

def local_top_range(df_src, win=320, step=100, range=(0,3)):
    """Using a sliding window to find the top dots.

    Windows ``[start, start + win)`` are placed at ``low + step``,
    ``low + 2*step``, ... for as long as the start is below the highest mass.
    In every window the rows are ranked by ``Vol`` (descending) and the ranks
    ``range[0]:range[1]`` are kept; duplicates from overlapping windows are
    dropped.

    NOTE(review): the first window starts at ``low + step``, so dots with
    Mass in ``[low, low + step)`` are never scanned. This placement is kept
    unchanged here — confirm whether it is intended.

    Param win: Window size, default 320 Da.
    Param step: step size, default 100 Da.
    Param range: the range of dots, those have the top Volumes of the window
    Returns: DataFrame with the chosen rows; an empty DataFrame when df_src is
        empty or its mass span is too small to place a window.
    Raises: ValueError when step is not positive (the scan would never end).
    """
    if step <= 0:
        raise ValueError("step must be positive")
    if df_src.empty:
        return pd.DataFrame()

    low, high = df_src.Mass.min(), df_src.Mass.max()
    if high - low < step:
        return pd.DataFrame()

    df_chosen = list()
    cur = low
    while high - cur > step:
        cur += step
        df = df_src[(df_src.Mass >= cur) & (df_src.Mass < cur + win)]
        df_top = df.sort_values('Vol', ascending=False).iloc[range[0]:range[1]]
        df_chosen.append(df_top)

    # high - low == step places no window at all; pd.concat([]) would raise.
    if not df_chosen:
        return pd.DataFrame()
    return pd.concat(df_chosen).drop_duplicates()