In [1]:
import numpy as np
import pandas as pd

In [None]:
def homology_search(df):
    """Run the homology-mode base calling on the heavy compounds of ``df``.

    Only rows whose ``Mass`` is strictly greater than 23000 are kept; that
    subset is then handed to ``base_calling_random`` with ``homology=True``.

    :param df: pandas DataFrame, contains 'Mass' column.
    :return: tuple (basecallings_dataframe, basecallings), exactly as
    returned by ``base_calling_random``.
    """
    above_threshold = df.Mass > 23000
    intact_rows = df[above_threshold]
    return base_calling_random(intact_rows, homology=True)

In [None]:
def base_calling_random(df, homology=False, acid_labile=False, methyl=False, ppm=10):
    """Find all pairs of compounds whose mass difference matches a known base.

    For every candidate mass shift ``b`` a pair (x, y) is reported when
    ``|x - y - b| * 1e6 / (y + b)`` is below ``ppm``, i.e. when x is heavier
    than y by ``b`` within the given tolerance.

    The set of candidate shifts depends on the flags. When several flags are
    set the last one in the order homology < acid_labile < methyl wins.

    :param df: pandas DataFrame, contains at least column 'Mass'. When
    ``acid_labile`` is set it must also contain a column 'isAD'.
    :param homology: bool, whether doing homology search.
    :param acid_labile: bool, whether finding Acid Labile. In this mode the
    heavier member of a pair is taken from rows with ``isAD == False`` and the
    lighter member from rows with ``isAD == True``.
    :param methyl: bool, whether finding Methylations only.
    :param ppm: number, tolerance in parts per million (exclusive upper
    bound). Defaults to 10.
    :return: tuple (basecallings_dataframe, basecallings), the first item is a
    DataFrame, contains all the compounds that have at least one basecalling
    with other compounds (rows in index-sorted order); the second item is a
    list of tuple [(Mass, Mass, Base)] with the heavier mass first.
    """
    df = df.sort_index()
    if acid_labile:
        # Elementwise comparisons are kept on purpose (rather than ``~``) so
        # that rows where isAD is neither True nor False land in neither group.
        mass_x = np.array(df[df.isAD == False].Mass)
        mass_y = np.array(df[df.isAD == True].Mass)
    else:
        mass_x = mass_y = np.array(df.Mass)

    # Later flags override earlier ones: methyl > acid_labile > homology.
    if methyl:
        base_dict = {'Me': 14.0106}
    elif acid_labile:
        base_dict = {'Y': 358.1599}
    elif homology:
        base_dict = {'C': 305.0413, 'A': 329.0525, 'Ox': 16.0,
                     'Methyl': 14.0106, '2Methyl': 28.0212}
    else:
        base_dict = {'C': 305.0413, 'A': 329.0525, 'G': 345.0474,
                     'U': 306.0253, 'mG': 359.0631, 'D': 308.041,
                     'mC': 319.0569, 'mU': 320.041, 'mA': 343.0682,
                     'm22C': 333.0625, 'mcm5U': 378.0464}

    pairs = []             # (position in mass_x, position in mass_y, base name)
    hit_positions = set()  # every position that took part in at least one hit
    for base_name, base_mass in base_dict.items():
        ppm_matrix = np.abs((mass_x[:, np.newaxis] - mass_y - base_mass)
                            * 1E6 / (mass_y + base_mass))
        # Read the hits straight off the boolean matrix. This yields the same
        # row-major (i, j) order as stacking a masked DataFrame, without
        # relying on DataFrame.stack() dropping NaN entries.
        rows, cols = np.nonzero(ppm_matrix < ppm)
        rows = rows.tolist()
        cols = cols.tolist()
        pairs.extend((i, j, base_name) for i, j in zip(rows, cols))
        hit_positions.update(rows)
        hit_positions.update(cols)

    # mass_x / mass_y hold exactly the values the positions refer to, so no
    # per-pair DataFrame filtering is needed. Pairs of identical masses are
    # dropped from the reported list.
    mass_pairs = [(mass_x[i], mass_y[j], base_name)
                  for i, j, base_name in pairs
                  if mass_x[i] != mass_y[j]]

    if acid_labile:
        # Positions refer to two different subsets of df here, so the rows
        # are selected by mass value instead of by position.
        paired_masses = {mass for heavy, light, _ in mass_pairs
                         for mass in (heavy, light)}
        df_base_calling = df[df.Mass.isin(list(paired_masses))]
    else:
        # Sorted so the row order of the result is deterministic.
        df_base_calling = df.iloc[sorted(hit_positions)]

    return df_base_calling, mass_pairs