In [1]:
import os

import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px

In [211]:
%run ../modules/utils.ipynb
%run ../modules/ladder_separation.ipynb
%run ../modules/homology_search.ipynb
%run ../modules/mass_sum.ipynb
%run ../modules/gap_fill.ipynb
%run ../modules/ladder_complementation.ipynb

## Load Control Sample
<br>
We have two types of samples: a control sample and a degraded sample. We find isoforms from the control sample and obtain ladder fragments from the degraded sample.

In [1]:
# Path of the control-sample spreadsheet.
# replace ctrl_sample to switch to other control samples.
ctrl_sample = '../samples/Phe/tRNA_Phe_Control.xlsx'

# load_data / plotly_zone are not defined in this notebook; presumably they come
# from the modules executed by the %run cell above.
df_ctrl = load_data(ctrl_sample)
plotly_zone(df_ctrl)

## Homology Search
<br>
This method looks for data points in the ~24k Mass area that have A/C/Ox/Me connections, where A stands for nucleotide A, C stands for nucleotide C and Ox stands for Oxygen.
If more connections are desired, go to base_dict in base_calling_random() in modules/homology_search.ipynb to add more.
<br>
Homology Search will result in several data points, which we call isoforms. In the later steps, we will try to get ladders for each isoform. At the end, we align those ladders horizontally and perform ladder complementation.

In [2]:
# Homology search on the control sample. The result is unpacked positionally into
# plotly_basecalling here, and its first element is read as a DataFrame in the next cell.
bcr = homology_search(df_ctrl)
plotly_basecalling(*bcr, y='Vol')

In [3]:
"""Show the most intense isoform candidates found by the homology search.

The top data points (isoforms) are the ones we use in our MLC-Seq paper.
head(10) keeps the first 10 compounds of the list sorted by 'Vol' (intensity)
in descending order; change 10 to display more or fewer compounds.
"""
df_homology = bcr[0]
df_homology_by_vol = df_homology.sort_values('Vol', ascending=False)
df_homology_by_vol.head(10)

## Load Degraded Sample

In [4]:
# Two degraded-sample files are loaded; only sample 1 (`df`) is plotted here.
# NOTE(review): `df2` is not referenced again in the visible part of this notebook.
df = load_data('../samples/Phe/tRNA_Phe_Deg_1.xlsx') # sample 1
df2 = load_data('../samples/Phe/tRNA_Phe_Deg_2.xlsx') # sample 2
plotly_zone(df)

## Homology Search on Degraded Sample
<br>
By comparing the Homology Search results before and after acid-degradation, the acid-labile nucleotide, if there is any, can be identified.
Here, in the tRNA-Phe sample, we do observe an acid-labile Y that changed to Y´ after acid treatment. 
<br>
<br>
Because all the remaining steps are based on the degraded sample, we will use the data points found in the degraded sample as our target isoforms in the later steps.

In [5]:
"""Homology search on the strongest high-mass compounds of the degraded sample.

Rows with Mass above 23000 are sorted by 'Vol' (intensity) in descending order and
the first 10 are kept. Replace 10 with another number if more or fewer compounds
are desired.
"""
df_deg_highmass = df.loc[df['Mass'] > 23000]
df_deg_top = df_deg_highmass.sort_values('Vol', ascending=False).head(10)
homo_deg = homology_search(df_deg_top)
plotly_basecalling(*homo_deg, y='Vol')
df_homo_deg = homo_deg[0]
df_homo_deg

In [125]:
# Here are the isoforms we used in our MLC-Seq paper
# (each value is later passed to mass_sum as full_mass).
isoform1 = 24252.311092
isoform2 = 24581.380920
isoform3 = 24268.303777
isoform4 = 24597.353308
# NOTE(review): `C` is not defined in this notebook; presumably it is the mass of
# nucleotide C provided by one of the %run modules -- confirm.
isoform5 = isoform1 - C  # exist in sample 2

# Order matters: later cells address the isoforms by their position in this list.
isoforms = [isoform1, isoform2, isoform3, isoform4, isoform5] 

## Manual Ladder Separation
<br>
This step roughly separates the original data into two portions: one contains the 5´ data and the other contains the 3´ data. 

**ATTENTION**
<br>
This section contains three code cells, referred to as cells 1, 2 and 3 in the order in which they appear. Cells 1 and 2 together do exactly the same thing as cell 3. The difference is that by running cells 1 and 2 you choose the data manually from the 2D plot yourself, whereas by running cell 3 you load the same data that we chose from the prepared datasets. 
So, you have two options here; just choose one of them:
1. Run cells 1 and 2 to experience exactly what the code is doing, or
2. Run cell 3, in case you don't want to get your hands dirty.

In [6]:
"""Use the lasso tool to circle the approximate 5´/3´ data.

A rough selection is enough: the points are picked from a 2D plot, so the exact
5´/3´ data cannot be chosen anyway. It is also unnecessary, because the goal of
this step is only to decrease the complexity of the following processes, not to
pin down the accurate 5´/3´ ladder fragments.
"""
# Indices of every point picked so far; both callbacks append to this list.
idxs = []


def on_selection(trace, points, selector):
    """Selection callback: print the picked point indices and remember them."""
    picked = points.point_inds
    print(f'points {picked}')
    idxs.extend(picked)


def on_click(trace, points, selector):
    """Click callback: remember the clicked point indices without printing."""
    idxs.extend(points.point_inds)

# Data set shown in the selection widget; the next cell indexes it with `idxs`.
df_sample = df
f = zone_selection(df_sample, on_selection=on_selection, on_click=on_click)

# This line shows the widget: a 2D widget that contains a toolbar
# and enables users to freely choose data points inside.
f

In [11]:
# Rows picked in the selection widget (looked up by position, hence iloc).
# NOTE: later cells reassign `idxs` to lists of outlier indices, so run this cell
# right after making the selection.
df_chosen = df_sample.iloc[idxs]

"""Uncomment one of the following lines, depending on which data(3´ or 5´) 
you selected from previous cell. This step will save data into two separate pandas DataFrames. 
Both DataFrames will be used in the following cells. 
You have to re-run the previous cell in this section when switching between 5´and 3´data.
"""
# df_5p = df_chosen.copy()
# df_3p = df_chosen.copy()

In [7]:
"""Load the 5´/3´ data that we selected in advance, to keep the demonstration simple.

The two areas may overlap; that does not matter.
"""
df_5p = load_data('./data/phe5p.xlsx')
df_3p = load_data('./data/phe3p.xlsx')

# df_common is shown as the "Overlapped Area" next to the two zones.
df_common = match_dfs(df_5p, df_3p)
plotly_multi_zones(
    [df_5p, df_3p, df_common],
    names=["5´ Zone", "3´ Zone", "Overlapped Area"],
    title="plot both areas together",
)

## Sampling the Data and Using the Top 1000

In [8]:
"""Keep only the most intense data points of each zone.

Both zones are sorted by 'Vol' (intensity) in descending order and cut down to the
first sampling_num rows. Change sampling_num to keep more or fewer data points;
it may need to be adjusted to obtain optimized results.
"""
sampling_num = 1000

df_5p_top, df_3p_top = (
    zone.sort_values('Vol', ascending=False).head(sampling_num)
    for zone in (df_5p, df_3p)
)

plotly_zones(df_5p_top, df_3p_top)

## MassSum
<br>
This method finds out data points in pairs. Any mass pairs meeting the MassSum equation for a given intact mass will be selected.
<br>
<br>
If a pair of data points, one from the 5´ ladder and one from the 3´ ladder, exists for a specific isoform, the pair is guaranteed to be selected. However, if the masses of two noise data points happen to sum to the isoform's mass, these two will also be selected into our pool. So, occasionally, manual work might be needed to filter the result of MassSum.



In [131]:
# MassSum for each of the five isoforms: one (3´, 5´) result pair per intact mass,
# split afterwards into two lists that share the order of `isoforms`.
masssum_pairs = [
    mass_sum(df_3p_top, df_5p_top, full_mass=fullmass) for fullmass in isoforms
]
df_masssum_3ps = [df_ms_3p for df_ms_3p, _ in masssum_pairs]
df_masssum_5ps = [df_ms_5p for _, df_ms_5p in masssum_pairs]

### Processing the 1<sup>st</sup> Isoform

In [9]:
# isoform 1, 5´ ladder
isoform_idx = 0
# Copy of the MassSum result for this isoform; the manual filter works on this name.
df_masssum_1st_5p = df_masssum_5ps[isoform_idx].copy()
plotly_zone(df_masssum_1st_5p)

In [10]:
"""After manually examining each mass in the ladder, we found that the following compounds
do not fit in the ladder: [22032.08, 21726.04, 2238.242], because these masses cannot
generate basecallings with their neighbors. Their indices are [3597, 3593, 567]; we
remove these outliers from our ladder.
"""
idxs = [3597, 3593, 567]
# Filter from the untouched MassSum result (isoform 1 -> position 0) rather than from
# the already filtered frame, so that re-running this cell does not raise a KeyError
# for labels that an earlier run has dropped.
df_masssum_1st_5p_raw = df_masssum_5ps[0]
df_notfit = df_masssum_1st_5p_raw.loc[idxs].sort_values('Mass')
df_masssum_1st_5p = df_masssum_1st_5p_raw.drop(df_notfit.index)
plotly_zones(df_masssum_1st_5p, df_notfit, 
             title='5´ Ladder of the 1st Isoform', 
             names=['5´ data', 'To Be Removed'])
df_notfit

In [11]:
# isoform 1, 3´ ladder
# Set the list position in this cell as well: isoform_idx is reassigned by every
# later isoform section, so relying on its current value is fragile.
isoform_idx = 0
df_masssum_1st_3p = df_masssum_3ps[isoform_idx].copy()
plotly_zone(df_masssum_1st_3p)

In [12]:
"""After manually examining each mass in the ladder, we found that the following compounds
do not fit in the ladder: [2238.242, 2544.278, 22032.077], because these masses cannot
generate basecallings with their neighbors. Their indices are [431, 566, 2129]; we
remove these outliers from our ladder.
"""
idxs = [431, 566, 2129]
# Filter from the untouched MassSum result (isoform 1 -> position 0) so that
# re-running this cell does not fail on labels that were already dropped.
df_masssum_1st_3p_raw = df_masssum_3ps[0]
df_notfit = df_masssum_1st_3p_raw.loc[idxs].sort_values('Mass')
df_masssum_1st_3p = df_masssum_1st_3p_raw.drop(df_notfit.index)
plotly_zones(df_masssum_1st_3p, df_notfit, 
             title='3´ Ladder of the 1st Isoform', 
             names=['3´ data', 'To Be Removed'])
df_notfit

### Processing the 2<sup>nd</sup> Isoform

In [13]:
# isoform 2, 5´ ladder
isoform_idx = 1
# Copy of the MassSum result for this isoform; the manual filter works on this name.
df_masssum_2nd_5p = df_masssum_5ps[isoform_idx].copy()
plotly_zone(df_masssum_2nd_5p, title='5´ Ladder of the 2nd Isoform (MassSum Result)')

In [14]:
"""After manually examining each mass in the ladder, we found that the following compounds
do not fit in the ladder: [1931.226, 2567.311, 5457.693, 5600.784, 19141.657, 20449.842,
21585.084], because these masses cannot generate basecallings with their neighbors. Their
indices are [374, 810, 2009, 2057, 3525, 3566, 3592]; we remove these outliers from our ladder.
"""
idxs = [374, 810, 2009, 2057, 3525, 3566, 3592]
# Filter from the untouched MassSum result (isoform 2 -> position 1) so that
# re-running this cell does not fail on labels that were already dropped.
df_masssum_2nd_5p_raw = df_masssum_5ps[1]
df_notfit = df_masssum_2nd_5p_raw.loc[idxs].sort_values('Mass')
df_masssum_2nd_5p = df_masssum_2nd_5p_raw.drop(df_notfit.index)
plotly_zones(df_masssum_2nd_5p, df_notfit, 
             title='5´ Ladder of the 2nd Isoform (MassSum Result with Manual Filter)', 
             names=['5´ data', 'To Be Removed'])
df_notfit

In [15]:
# isoform 2, 3´ ladder
# Set the list position in this cell as well: isoform_idx is reassigned by every
# other isoform section, so relying on its current value is fragile.
isoform_idx = 1
df_masssum_2nd_3p = df_masssum_3ps[isoform_idx].copy()
plotly_zone(df_masssum_2nd_3p, title='2nd Isoform, 3´ Ladder (MassSum Result)')

In [16]:
"""After manually examining each mass in the ladder, we found that the following compounds
do not fit in the ladder: [3014.435, 4149.555, 5457.693, 18998.662, 19141.657, 22032.077,
22668.148], because these masses cannot generate basecallings with their neighbors. Their
indices are [719, 1047, 1301, 2054, 2057, 2129, 2140]; we remove these outliers from our ladder.
"""
idxs = [719, 1047, 1301, 2054, 2057, 2129, 2140]
# Filter from the untouched MassSum result (isoform 2 -> position 1) so that
# re-running this cell does not fail on labels that were already dropped.
df_masssum_2nd_3p_raw = df_masssum_3ps[1]
df_notfit = df_masssum_2nd_3p_raw.loc[idxs].sort_values('Mass')
df_masssum_2nd_3p = df_masssum_2nd_3p_raw.drop(df_notfit.index)
plotly_zones(df_masssum_2nd_3p, df_notfit, 
             title='2nd Isoform, 3´ Ladder (MassSum Result with Manual Filter)', 
             names=['3´ data', 'To Be Removed'])
df_notfit

### Processing the 3<sup>rd</sup> Isoform

In [17]:
# isoform 3, 5´ ladder
isoform_idx = 2
# Copy of the MassSum result for this isoform; the manual filter works on this name.
df_masssum_3rd_5p = df_masssum_5ps[isoform_idx].copy()
plotly_zone(df_masssum_3rd_5p, title='5´ Ladder of the 3rd Isoform (MassSum Result)')

In [18]:
"""After manually examining each mass in the ladder, we found that the following compounds
are far away from the major part: [11567.583, 19141.657, 23722.166]. Their indices
are [3160, 3525, 3613]; we remove these outliers from our ladder.
"""
idxs = [3160, 3525, 3613]
# Filter from the untouched MassSum result (isoform 3 -> position 2) so that
# re-running this cell does not fail on labels that were already dropped.
df_masssum_3rd_5p_raw = df_masssum_5ps[2]
df_notfit = df_masssum_3rd_5p_raw.loc[idxs].sort_values('Mass')
df_masssum_3rd_5p = df_masssum_3rd_5p_raw.drop(df_notfit.index)
plotly_zones(df_masssum_3rd_5p, df_notfit, 
             title='5´ Ladder of the 3rd Isoform (MassSum Result with Manual Filter)', 
             names=['5´ data', 'To Be Removed'])
df_notfit

In [19]:
# isoform 3, 3´ ladder
# Set the list position in this cell as well: isoform_idx is reassigned by every
# other isoform section, so relying on its current value is fragile.
isoform_idx = 2
df_masssum_3rd_3p = df_masssum_3ps[isoform_idx].copy()
plotly_zone(df_masssum_3rd_3p, title='3rd Isoform, 3´ Ladder (MassSum Result)')

In [20]:
"""After manually examining each mass in the ladder, we found that the following compounds
are far away from the major part: [563.993, 3118.378], and that [5144.672] cannot generate
basecallings with its neighbors. Their indices are [4, 742, 1245]; we remove these
outliers from our ladder.
"""
idxs = [4, 742, 1245]
# Filter from the untouched MassSum result (isoform 3 -> position 2) so that
# re-running this cell does not fail on labels that were already dropped.
df_masssum_3rd_3p_raw = df_masssum_3ps[2]
df_notfit = df_masssum_3rd_3p_raw.loc[idxs].sort_values('Mass')
df_masssum_3rd_3p = df_masssum_3rd_3p_raw.drop(df_notfit.index)
plotly_zones(df_masssum_3rd_3p, df_notfit, 
             title='3´ Ladder of the 3rd Isoform (MassSum Result with Manual Filter)', 
             names=['3´ data', 'To Be Removed'])
df_notfit

### Processing the 4<sup>th</sup> Isoform

In [21]:
# isoform 4, 5´ ladder
isoform_idx = 3
# Copy of the MassSum result for this isoform; the manual filter works on this name.
df_masssum_4th_5p = df_masssum_5ps[isoform_idx].copy()
plotly_zone(df_masssum_4th_5p, title='5´ Ladder of the 4th Isoform (MassSum Result)')

In [22]:
"""After manually examining each mass in the ladder, we are not sure about the masses less
than 6269.791, and [20309.82, 21726.04, 22032.08] cannot generate basecallings with their
neighbors. Their indices are [3562, 3593, 3597]; we remove these masses from our ladder.
"""
idxs = [3562, 3593, 3597]
# Filter from the untouched MassSum result (isoform 4 -> position 3) so that
# re-running this cell does not fail on labels that were already dropped.
df_masssum_4th_5p_raw = df_masssum_5ps[3]
# Two groups are removed: everything below the mass threshold, plus the listed outliers.
df_notfit1 = df_masssum_4th_5p_raw[df_masssum_4th_5p_raw.Mass<6269.79]
df_notfit2 = df_masssum_4th_5p_raw.loc[idxs]
df_notfit = pd.concat([df_notfit1, df_notfit2])
df_masssum_4th_5p = df_masssum_4th_5p_raw.drop(df_notfit.index)
plotly_zones(df_masssum_4th_5p, df_notfit, 
             title='5´ Ladder of the 4th Isoform (MassSum Result with Manual Filter)', 
             names=['5´ data', 'To Be Removed'])
df_notfit

In [23]:
# isoform 4, 3´ ladder
# Set the list position in this cell as well: isoform_idx is reassigned by every
# other isoform section, so relying on its current value is fragile.
isoform_idx = 3
df_masssum_4th_3p = df_masssum_3ps[isoform_idx].copy()
plotly_zone(df_masssum_4th_3p, title='4th Isoform, 3´ Ladder (MassSum Result)')

In [24]:
"""After manually examining each mass in the ladder, we are not sure about the masses greater
than 18345.59, and [2583.294, 2889.322, 4305.625] cannot generate basecallings with their
neighbors. Their indices are [587, 687, 1080]; we remove these masses from our ladder.
"""
idxs = [587, 687, 1080]
# Filter from the untouched MassSum result (isoform 4 -> position 3) so that
# re-running this cell does not fail on labels that were already dropped.
df_masssum_4th_3p_raw = df_masssum_3ps[3]
# Two groups are removed: everything above the mass threshold, plus the listed outliers.
df_notfit1 = df_masssum_4th_3p_raw[df_masssum_4th_3p_raw.Mass>18345.59]
df_notfit2 = df_masssum_4th_3p_raw.loc[idxs]
df_notfit = pd.concat([df_notfit1, df_notfit2])
df_masssum_4th_3p = df_masssum_4th_3p_raw.drop(df_notfit.index)
plotly_zones(df_masssum_4th_3p, df_notfit, 
             title='3´ Ladder of the 4th Isoform (MassSum Result with Manual Filter)', 
             names=['3´ data', 'To Be Removed'])
df_notfit

In [202]:
# Collect the manually filtered MassSum ladders in two lists (5´ and 3´),
# ordered like `isoforms`, i.e. position 0 holds the 1st isoform.
df_masssum_5ps_refined = [
    df_masssum_1st_5p,
    df_masssum_2nd_5p,
    df_masssum_3rd_5p,
    df_masssum_4th_5p,
]

df_masssum_3ps_refined = [
    df_masssum_1st_3p,
    df_masssum_2nd_3p,
    df_masssum_3rd_3p,
    df_masssum_4th_3p,
]

## GapFill
<br>
This method finds the data points that MassSum missed. If there is a gap in the MassSum result, it looks in the original dataset for data points that fall inside the gap. Just like MassSum, GapFill sometimes needs manual work to check the resulting data points.

### Processing the 1<sup>st</sup> Isoform

In [25]:
# GapFill for 1st isoform, 5´ ladder
idx = 0

# Row(s) of the 5´ zone whose Mass lies strictly within 0.1 of the isoform's mass.
target_mass = isoforms[idx]
is_near_target = (df_5p.Mass > target_mass - 0.1) & (df_5p.Mass < target_mass + 0.1)
fullmass_dot = df_5p[is_near_target]

df_ms_5p = df_masssum_5ps_refined[idx]
df_gap_1st_5p = gap_fill(df_5p, df_ms_5p, fullmass_dot, major=True, orientation=5)

In [26]:
# Plot the MassSum ladder together with the GapFill result, then merge both into
# one frame for the manual check in the next cell.
plotly_zones(df_masssum_1st_5p, df_gap_1st_5p)
df_ms_gf_1st_5p = pd.concat([df_masssum_1st_5p, df_gap_1st_5p])

In [27]:
"""After manually examining each mass in the ladder, we found the outliers [2685.364, 3678.533],
and that [1065.1*, 8296.076] cannot generate basecallings with their neighbors. Their indices
are [62, 63, 916, 1463, 2725]; we remove these compounds from our ladder.
"""
idxs = [62, 63, 916, 1463, 2725]
# Rebuild the merged MassSum + GapFill frame before filtering, so that re-running this
# cell does not raise a KeyError for labels that an earlier run has dropped.
df_ms_gf_1st_5p_raw = pd.concat([df_masssum_1st_5p, df_gap_1st_5p])
df_notfit = df_ms_gf_1st_5p_raw.loc[idxs].sort_values('Mass')
df_ms_gf_1st_5p = df_ms_gf_1st_5p_raw.drop(df_notfit.index)
plotly_zones(df_ms_gf_1st_5p, df_notfit, 
             title='5´ Ladder of the 1st Isoform (MassSum & GapFill Result with Manual Filter)', 
             names=['5´ data', 'To Be Removed'])
df_notfit

In [28]:
# GapFill for 1st isoform, 3´ ladder
# Set the isoform position in this cell as well: `idx` is reassigned by the 2nd-isoform
# section and by the ladder-complementation loop, so its current value may differ.
idx = 0

# Row(s) of the 3´ zone whose Mass lies strictly within 0.1 of the isoform's mass.
fullmass_dot = df_3p[(df_3p.Mass>isoforms[idx]-0.1) & (df_3p.Mass<isoforms[idx]+0.1)]
df_ms_3p = df_masssum_3ps_refined[idx]
df_gap_1st_3p = gap_fill(df_3p, df_ms_3p, fullmass_dot, major=True, orientation=3)

In [29]:
# Plot the MassSum ladder together with the GapFill result, then merge both into
# one frame for the manual check in the next cell.
plotly_zones(df_masssum_1st_3p, df_gap_1st_3p)
df_ms_gf_1st_3p = pd.concat([df_masssum_1st_3p, df_gap_1st_3p])

In [30]:
"""After manually examining each mass in the ladder, we found that the masses [862.233, 876.273,
8262.144, 11855.662, 13719.925, 22526.179] cannot generate basecallings with their neighbors.
Their indices are [76, 80, 1571, 1787, 1844, 2138]; we remove these compounds from our ladder.
"""
idxs = [76, 80, 1571, 1787, 1844, 2138]
# Rebuild the merged MassSum + GapFill frame before filtering, so that re-running this
# cell does not raise a KeyError for labels that an earlier run has dropped.
df_ms_gf_1st_3p_raw = pd.concat([df_masssum_1st_3p, df_gap_1st_3p])
df_notfit = df_ms_gf_1st_3p_raw.loc[idxs].sort_values('Mass')
df_ms_gf_1st_3p = df_ms_gf_1st_3p_raw.drop(df_notfit.index)
plotly_zones(df_ms_gf_1st_3p, df_notfit, 
             title='3´ Ladder of the 1st Isoform (MassSum & GapFill Result with Manual Filter)', 
             names=['3´ data', 'To Be Removed'])
df_notfit

### Processing the 2<sup>nd</sup> Isoform

In [31]:
# GapFill for 2nd isoform, 5´ ladder
idx = 1

# Row(s) of the 5´ zone whose Mass lies strictly within 0.1 of the isoform's mass.
target_mass = isoforms[idx]
is_near_target = (df_5p.Mass > target_mass - 0.1) & (df_5p.Mass < target_mass + 0.1)
fullmass_dot = df_5p[is_near_target]

df_ms_5p = df_masssum_5ps_refined[idx]
df_gap_2nd_5p = gap_fill(df_5p, df_ms_5p, fullmass_dot, major=True, orientation=5)

In [32]:
# Plot the MassSum ladder together with the GapFill result, then merge both into
# one frame for the manual check in the next cell.
plotly_zones(df_masssum_2nd_5p, df_gap_2nd_5p)
df_ms_gf_2nd_5p = pd.concat([df_masssum_2nd_5p, df_gap_2nd_5p])

In [33]:
"""After manually examining each mass in the ladder, we found the outliers [2377.226, 3373.339,
3732.437], and that [1065.122, 1065.156, 3359.326, 3387.422, 3944.507, 24353.33] cannot generate
basecallings with their neighbors. Their indices are [62, 63, 695, 696, 1303, 1312, 1320,
1485, 1572, 3641]; we remove these compounds from our ladder.
"""
idxs = [62, 63, 695, 696, 1303, 1312, 1320, 1485, 1572, 3641]
# Rebuild the merged MassSum + GapFill frame before filtering, so that re-running this
# cell does not raise a KeyError for labels that an earlier run has dropped.
df_ms_gf_2nd_5p_raw = pd.concat([df_masssum_2nd_5p, df_gap_2nd_5p])
df_notfit = df_ms_gf_2nd_5p_raw.loc[idxs].sort_values('Mass')
df_ms_gf_2nd_5p = df_ms_gf_2nd_5p_raw.drop(df_notfit.index)
plotly_zones(df_ms_gf_2nd_5p, df_notfit, 
             title='5´ Ladder of the 2nd Isoform (MassSum & GapFill Result with Manual Filter)', 
             names=['5´ data', 'To Be Removed'])
df_notfit

In [34]:
# GapFill for 2nd isoform, 3´ ladder
# Set the isoform position in this cell as well: `idx` is reassigned by the 1st-isoform
# section and by the ladder-complementation loop, so its current value may differ.
idx = 1

# Row(s) of the 3´ zone whose Mass lies strictly within 0.1 of the isoform's mass.
fullmass_dot = df_3p[(df_3p.Mass>isoforms[idx]-0.1) & (df_3p.Mass<isoforms[idx]+0.1)]
df_ms_3p = df_masssum_3ps_refined[idx]
df_gap_2nd_3p = gap_fill(df_3p, df_ms_3p, fullmass_dot, major=True, orientation=3)

In [35]:
# Plot the MassSum ladder together with the GapFill result, then merge both into
# one frame for the manual check in the next cell.
plotly_zones(df_masssum_2nd_3p, df_gap_2nd_3p)
df_ms_gf_2nd_3p = pd.concat([df_masssum_2nd_3p, df_gap_2nd_3p])

In [36]:
"""Manual check of each mass in the ladder: looks good, nothing has to be removed.
"""
idxs = []
# Same steps as for the other ladders, applied to an empty selection.
df_selected = df_ms_gf_2nd_3p.loc[idxs]
df_notfit = df_selected.sort_values('Mass')
df_ms_gf_2nd_3p = df_ms_gf_2nd_3p.drop(index=df_notfit.index)
plotly_zones(
    df_ms_gf_2nd_3p,
    df_notfit,
    title='3´ Ladder of the 2nd Isoform (MassSum & GapFill Result with Manual Filter)',
    names=['3´ data', 'To Be Removed'],
)
df_notfit

In [213]:
# Collect the final ladders in two lists (5´ and 3´), ordered like `isoforms`.
# Only the ladders of the first two isoforms went through GapFill and a second manual
# check so far; for the 3rd and 4th isoform the filtered MassSum ladders are kept as is.
df_ms_gf_5ps_refined = [
    df_ms_gf_1st_5p,
    df_ms_gf_2nd_5p,
    df_masssum_3rd_5p,
    df_masssum_4th_5p,
]

df_ms_gf_3ps_refined = [
    df_ms_gf_1st_3p,
    df_ms_gf_2nd_3p,
    df_masssum_3rd_3p,
    df_masssum_4th_3p,
]

# Ladder Complementation
<br>
After manual selection of the resulting ladders for various isoforms, they are put together to perform ladder complementation.


In [214]:
"""Currently we only provide ladders for two isoforms; ladder complementation can still be done.

If you have repeated the previous cells for more isoforms, you will end up with more
ladders. Just construct a Ladder instance for each ladder and put all of them into the
variable `ladders`; the more ladders are collected, the better the results.
"""
# Number of leading isoforms whose 5´/3´ ladders were refined with MassSum + GapFill.
n_refined_isoforms = 2

ladders = list()
# A dedicated loop variable keeps `idx`, which the GapFill cells read, untouched.
for ladder_idx in range(n_refined_isoforms):
    ladder_5p = Ladder(df_ms_gf_5ps_refined[ladder_idx], isoforms[ladder_idx], 5)
    ladder_3p = Ladder(df_ms_gf_3ps_refined[ladder_idx], isoforms[ladder_idx], 3)
    ladders.extend([ladder_5p, ladder_3p])

df_ladder_comp = ladder_complementation(ladders)

# Under the folder <outputs>, please find the result file, ladder_comp_res.xlsx.
# The folder is created first because to_excel does not create missing directories.
os.makedirs('../outputs', exist_ok=True)
df_ladder_comp.to_excel('../outputs/ladder_comp_res.xlsx')