In [1]:
import pandas as pd
import numpy as np

In [None]:
# Mass constants used by mass_sum() when it has to derive the expected
# 3' + 5' fragment mass sum from the intact mass.
# NOTE(review): the values look like monoisotopic masses in Da -- confirm the
# units and the source of these numbers.
MASS_P = 79.96633  # phosphate term; value matches HPO3 -- TODO confirm
MASS_H2O = 18.0106  # water

In [None]:
def mass_sum(df3p, df5p, full_mass, error=0.1, has_tag=False, tag=826.3184):
    """Implementation of the MassSum algorithm.

    Every mass in ``df3p`` is added to every mass in ``df5p``; a row is kept
    when at least one of its pairwise sums (rounded to one decimal) falls
    inside ``[sum_value - error, sum_value + error]``, where ``sum_value`` is

    * ``round(full_mass + tag + MASS_P - MASS_H2O, 1)`` if ``has_tag``,
    * ``round(full_mass + MASS_H2O, 1)`` otherwise.

    One diagnostic line with ``full_mass`` and ``sum_value`` is printed.

    :param df3p: pandas DataFrame with a ``'Mass'`` column, the first dataset.
    :param df5p: pandas DataFrame with a ``'Mass'`` column, the second dataset.
    :param full_mass: float, the intact mass.
    :param error: normal margin of error, default 0.1.
    :param has_tag: whether the biological sample was processed using a TAG.
    :param tag: mass of the TAG, only used when ``has_tag`` is true.
    :return: a tuple ``(df3p_selected, df5p_selected)`` with the rows of each
        input that take part in at least one matching pair. Rows are selected
        by position, appear at most once and keep their original relative
        order. Both results are empty when nothing matches.

    df3p and df5p can be the same DataFrame, if so, the two results datasets
    would be identical to each other.
    """
    masses_3p = np.asarray(df3p['Mass'])
    masses_5p = np.asarray(df5p['Mass'])

    # Outer sum: pair_sums[i, j] == masses_3p[i] + masses_5p[j], one decimal.
    pair_sums = np.round(masses_3p[:, np.newaxis] + masses_5p, 1)

    if has_tag:
        sum_value = round(full_mass + tag + MASS_P - MASS_H2O, 1)
    else:
        sum_value = round(full_mass + MASS_H2O, 1)
    print('full_mass {} sum_value {}'.format(full_mass, sum_value))

    # Plain boolean mask instead of a NaN-masked DataFrame + stack(): this
    # does not depend on stack() dropping NaN entries. NaN sums compare False
    # and are therefore never selected.
    in_window = (pair_sums >= sum_value - error) & (pair_sums <= sum_value + error)

    # flatnonzero yields ascending, duplicate-free positions, so the selected
    # rows come back in input order (a set would give hash-table order).
    rows_3p = np.flatnonzero(in_window.any(axis=1))
    rows_5p = np.flatnonzero(in_window.any(axis=0))

    return df3p.iloc[rows_3p], df5p.iloc[rows_5p]