In [None]:
import pandas as pd
import numpy as np
import matplotlib
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from collections import namedtuple
import networkx as nx
import random

In [None]:
%run ../modules/utils.ipynb
%run ../modules/cds.ipynb
%run ../modules/preprocessing.ipynb

In [None]:
# Load the raw tRNA-Phe export, report its size, pass it through thermo_df
# (provided by the %run modules) and plot the processed table.
df_phe_raw = pd.read_excel(
    '/Users/bryan/Documents/BioPharmaFinder/ZhangLab/Data/Excels/tRNA_Phe_0724.xlsx'
)
print(df_phe_raw.shape)
df_phe = thermo_df(df_phe_raw)
plot_zone(df_phe)

In [None]:
# Persist the thermo_df-processed table.
# NOTE(review): this writes under /home/bryan while the input above is read from
# /Users/bryan — confirm which machine/path is intended.
df_phe.to_excel('/home/bryan/Downloads/tRNA_Phe_0724_std.xlsx')

# Split 3' and 5' Dots

In [None]:
# Positions of the points picked in the widget; both callbacks append to it.
idxs = []


def on_selection(trace, points, selector):
    """Selection callback: log the selected point positions and record them in `idxs`."""
    picked = points.point_inds
    print('points {}'.format(picked))
    idxs.extend(picked)


def on_click(trace, points, selector):
    """Click callback: record the clicked point positions in `idxs` without logging."""
    idxs.extend(points.point_inds)

# Offer only the dots with Mass < 25000 for interactive picking; the callbacks
# defined above collect the picked positions into `idxs`.
below_mass_cutoff = df_phe.Mass < 25000
df_sample = df_phe[below_mass_cutoff]
f = zone_selection(df_sample, on_click=on_click, on_selection=on_selection)
f

In [None]:
# Rows picked in the widget above. The callbacks only ever append to `idxs`, so a
# point that was picked more than once (e.g. clicked and then box-selected) shows
# up repeatedly; de-duplicate while keeping first-seen order so the chosen ladder
# does not contain duplicated rows.
df_chosen = df_sample.iloc[list(dict.fromkeys(idxs))]
df_chosen.shape

In [None]:
# Bind the current selection as the 5' ladder (an alias of df_chosen, not a copy).
# NOTE(review): this cell and the 3' cell below both take whatever `df_chosen`
# currently holds; the two ladders only differ if the selection cells are re-run
# in between, so a plain top-to-bottom run yields identical frames.
df_5p = df_chosen
df_5p.shape

In [None]:
# Bind the current selection as the 3' ladder (an alias of df_chosen, not a copy).
# NOTE(review): depends on the selection cells having been re-run for the 3' dots
# first; otherwise this is the same frame as df_5p.
df_3p = df_chosen
df_3p.shape

# Put & Get Split Ladders

In [None]:
# Write both split ladders to disk so the interactive selection need not be redone.
for _ladder, _path in (
    (df_5p, '/Users/bryan/Downloads/phe5p.xlsx'),
    (df_3p, '/Users/bryan/Downloads/phe3p.xlsx'),
):
    _ladder.to_excel(_path)

In [None]:
# Reload the two split ladders saved by the previous cell and show their shapes.
df_5p, df_3p = (
    pd.read_excel(_path)
    for _path in (
        '/Users/bryan/Downloads/phe5p.xlsx',
        '/Users/bryan/Downloads/phe3p.xlsx',
    )
)
df_5p.shape, df_3p.shape

In [None]:
# Keep only the dots whose Vol exceeds 1E5 in both ladders, then show the new shapes.
df_5p = df_5p.loc[df_5p.Vol > 1E5]
df_3p = df_3p.loc[df_3p.Vol > 1E5]
(df_5p.shape, df_3p.shape)

# 25k Dots and their connections

In [None]:
# Dots with 23500 < Mass < 25000 and Vol > 1E6, shown as Mass against Vol.
in_mass_window = (df_phe.Mass > 23500) & (df_phe.Mass < 25000)
above_vol_floor = df_phe.Vol > 1E6
df_phe_25k = df_phe[in_mass_window & above_vol_floor]
px.scatter(x=df_phe_25k.Mass, y=df_phe_25k.Vol)

# Generate ladders by MassSum

In [None]:
# Mass of the full-length molecule; reused by the later seat and gap-filling cells.
full_mass = 24581.374101
# computational_data_seperation (from the %run modules) returns the 3' frame
# first and the 5' frame second.
separated = computational_data_seperation(df_3p, df_5p, full_mass, ignore_endpoints=True)
df_3p_cca, df_5p_cca = separated
plot_zones(df_5p_cca, df_3p_cca)

# Refine 5' ladder

In [None]:
# Start a fresh pick list for the 5' refinement; both callbacks append to it.
idxs = []


def on_selection(trace, points, selector):
    """Selection callback: log the selected point positions and add them to `idxs`."""
    selected_positions = points.point_inds
    print('points {}'.format(selected_positions))
    idxs.extend(selected_positions)


def on_click(trace, points, selector):
    """Click callback: add the clicked point positions to `idxs` without logging."""
    idxs.extend(points.point_inds)

# Show the computed 5' ladder in the selection widget; the positions picked here
# are collected in `idxs` and removed from the ladder in the next cell.
df_sample = df_5p_cca
f = zone_selection(df_sample, on_selection=on_selection, on_click=on_click)
f

In [None]:
# Remove the dots picked in the widget above from the 5' ladder and plot the rest.
df_chosen = df_sample.iloc[idxs]
df_5p_cca_ref = df_sample.drop(df_chosen.index)
# A bare `df.shape` in the middle of a cell is evaluated and thrown away (only the
# last expression is displayed), so print it explicitly.
print(df_5p_cca_ref.shape)
plot_zone(df_5p_cca_ref)

# Mutual Verification

In [None]:
# Mutual verification of the refined 5' ladder.
# For every dot, draw up to N_PARTNERS distinct random partner dots whose absolute
# mass difference lies in [MIN_DIFF, MAX_DIFF], and count one "hit" for each
# partner whose difference makes components() return a non-empty table.
N_PARTNERS = 5
MIN_DIFF, MAX_DIFF = 400, 3000

masses = df_5p_cca_ref['Mass'].to_numpy(dtype=float)
# Deliberately not called `len`: that name would shadow the builtin for every
# later cell (restart the kernel or `del len` if an older version of this cell ran).
n_dots = masses.size
positions = np.arange(n_dots)

hits = []
for i in range(n_dots):
    print('Processing {}'.format(masses[i]))
    diffs = np.abs(masses - masses[i])
    # List the eligible partners up front: sampling from this list is equivalent
    # to retrying random draws, but it terminates when a dot has fewer than
    # N_PARTNERS eligible partners instead of looping forever.
    eligible = positions[(positions != i) & (diffs >= MIN_DIFF) & (diffs <= MAX_DIFF)]
    partners = random.sample(eligible.tolist(), min(N_PARTNERS, eligible.size))

    hit_count = 0
    for j in partners:
        print('--handle Mass {}'.format(masses[j]))
        df_candidates = components(float(diffs[j]))
        print(df_candidates)
        if not df_candidates.empty:
            print('Hit index {}.'.format(i))
            hit_count += 1
    hits.append(hit_count)

# One whole-column assignment instead of chained `df['Hit'].iloc[i] += 1`, which
# writes to a temporary object under pandas copy-on-write and can leave Hit at 0.
df_5p_cca_ref = df_5p_cca_ref.assign(Hit=hits)
print(df_5p_cca_ref)
    

In [None]:
# Dots with more than two hits, ordered by Mass.
has_over_two_hits = df_5p_cca_ref.Hit > 2
df_5p_cca_ref.loc[has_over_two_hits].sort_values(by='Mass')

In [None]:
# Spot-check: what components() returns for one specific mass difference.
components(918.0750643431002)

In [None]:
# Plot the whole refined ladder together with its subset having more than two hits.
df_over_two_hits = df_5p_cca_ref[df_5p_cca_ref.Hit > 2]
plot_zones(df_5p_cca_ref, df_over_two_hits)

In [None]:
# Run base_calling_random on the dots with more than two hits and plot the result.
# The returned plot handle gets its own name: binding it to `plt` would overwrite
# the matplotlib.pyplot alias, and binding the second value to `_` would shadow
# IPython's last-output variable, for every later cell.
bcr = base_calling_random(df_5p_cca_ref[df_5p_cca_ref.Hit > 2])
basecall_plot, _basecall_extra = plot_basecalling(*bcr, annotate=False)
tmp = df_5p_cca_ref[df_5p_cca_ref.Hit>2]
# plt.scatter(tmp.Mass, tmp.RT)

In [None]:
# NOTE(review): df_sample_refine is first assigned in the "Fill in the Gap" section
# further down, so on a fresh kernel run top-to-bottom this cell raises NameError;
# the same plot is repeated after that section.
px.scatter(df_sample_refine, x='Mass', y='RT')

# Put Mass Ladder into Seats

In [None]:
# Inspect the dots with at most three hits.
df_5p_cca_ref[df_5p_cca_ref.Hit<=3]

In [None]:
# Save every dot that scored at least one hit; the next cell reloads this file.
df_5p_cca_ref[df_5p_cca_ref.Hit>0].to_excel('/home/bryan/Downloads/phe_5p_cca_ref.xlsx')

In [None]:
# Reload the hit-filtered ladder saved by the previous cell, replacing the
# in-memory frame (so dots with Hit == 0 are gone from here on).
# NOTE(review): the file was written with to_excel's default index column, which
# presumably comes back as an extra 'Unnamed: 0' column under a fresh RangeIndex —
# confirm nothing downstream relies on the original index labels.
df_5p_cca_ref = pd.read_excel('/home/bryan/Downloads/phe_5p_cca_ref.xlsx')

In [None]:
# Hand the dots with more than two hits, plus the full-length mass, to
# process_mass_seats (from the %run modules).
tmp = df_5p_cca_ref.loc[df_5p_cca_ref.Hit > 2]
process_mass_seats(tmp, full_mass)

# Fill in the Gap

In [None]:
# Dots in df_5p whose Mass lies strictly within 0.1 of full_mass.
mass_lo, mass_hi = full_mass - 0.1, full_mass + 0.1
fullmass_dot = df_5p[(df_5p.Mass > mass_lo) & (df_5p.Mass < mass_hi)]

In [None]:
# Display the full-mass match(es); the gap-filling cell below uses the first row.
fullmass_dot

In [None]:
%run ../modules/utils.ipynb

In [None]:
# Refine the 5' ladder in two passes and collect df_5p dots that fill its gaps.
#
# 1. Backward pass (heaviest dot first): keep a dot when components() returns a
#    non-empty table for its mass difference to the previously kept dot (or, before
#    anything is kept, to the full-mass dot plus TAG_5P).
# 2. Forward pass (lightest kept dot first): re-check the kept dots starting from
#    the synthetic anchor TAG_5P + TAG_3OH, and for every accepted step ask
#    standalone_dots() for df_5p dots between the two ends; finally do the same for
#    the span from the last accepted dot up to the full-mass dot plus TAG_5P.
#
# Results: df_sample (all reloaded dots, descending Mass), df_sample_refine
# (backward-kept dots, ascending Mass), gap_idxs (df_5p index labels).
TAG_5P = 79.9663
TAG_3OH = 18.0106
MIN_GAP = 600  # forward pass skips non-final dots closer than this to the last kept one
columns = ['Mass', 'RT', 'Vol']

if fullmass_dot.empty:
    # Fail with a clear message instead of an IndexError from fullmass_dot.iloc[0].
    raise ValueError('fullmass_dot is empty: no dot within 0.1 of full_mass in df_5p')
fullmass_row = fullmass_dot.iloc[0]


def _ends_frame(lower_end, upper_end):
    """Build the two-row float (Mass, RT, Vol) frame that is passed to standalone_dots()."""
    return pd.DataFrame([list(lower_end), list(upper_end)], columns=columns, dtype=float)


# ---- backward pass ---------------------------------------------------------
df_sample = df_5p_cca_ref.sort_values('Mass', ascending=False)
idxs = []
pre = fullmass_row['Mass'] + TAG_5P  # upper anchor until the first dot is kept
for idx, row in df_sample.iterrows():
    print('Processing backward {}'.format(row.Mass))
    print('components {}-{}'.format(row.Mass, pre))
    # NOTE(review): with descending order this difference is not positive, whereas
    # the forward pass passes a positive one — confirm components() accepts both.
    res = components(row.Mass - pre)
    if not res.empty:
        # keep this dot
        print('keep this dot {}'.format(row.Mass))
        idxs.append(idx)
        pre = row.Mass

df_sample_refine = df_sample.loc[idxs].sort_values('Mass')
print(df_sample.shape, df_sample_refine.shape)

# ---- forward pass ----------------------------------------------------------
idxs = []
gap_idxs = []
lower_end = [TAG_5P + TAG_3OH, 0.001, 0]  # synthetic start anchor (Mass, RT, Vol)
pre = lower_end[0]
last_label = df_sample_refine.index[-1] if df_sample_refine.shape[0] else None
for idx, row in df_sample_refine.iterrows():
    print('Processing forward {}'.format(row.Mass))
    is_last = idx == last_label
    if not is_last and row.Mass - pre < MIN_GAP:
        continue
    print('components {}-{}'.format(row.Mass, pre))
    res = components(row.Mass - pre)
    if not res.empty:
        # keep this dot, and fill in the gap below it
        print('keep this dot {}'.format(row.Mass))
        idxs.append(idx)
        upper_end = row[columns].tolist()
        df_ends = _ends_frame(lower_end, upper_end)
        df_alones = standalone_dots(df_5p, df_ends, mode='all')
        if not df_alones.empty:
            print('extend dots ', df_alones)
            gap_idxs.extend(df_alones.index)
        # This dot becomes the lower end of the next gap.
        lower_end, pre = upper_end, row.Mass

    if is_last:
        # Fill the span from the last kept dot up to the full-mass dot plus TAG_5P.
        # The upper end is built explicitly: the former chained
        # `df_ends.iloc[1]['Mass'] += TAG_5P` only took effect through legacy view
        # semantics and is a silent no-op under pandas copy-on-write. If no dot was
        # kept at all, lower_end is still the synthetic anchor (formerly IndexError).
        upper_end = [fullmass_row['Mass'] + TAG_5P, fullmass_row['RT'], fullmass_row['Vol']]
        df_ends = _ends_frame(lower_end, upper_end)
        print('Processing the last item. {}'.format(df_ends))
        df_alones = standalone_dots(df_5p, df_ends, mode='all')
        if not df_alones.empty:
            print('extend dots ', df_alones)
            gap_idxs.extend(df_alones.index)

        # append the fullmass dot
        gap_idxs.append(fullmass_dot.index[0])

print(gap_idxs)
print(df_5p.loc[gap_idxs])

In [None]:
# Mass against RT for the dots kept by the backward pass of the gap-filling cell.
px.scatter(df_sample_refine, x='Mass', y='RT')

In [None]:
# Inspect the df_5p dots strictly between two specific masses.
window_lo, window_hi = 3014.34251378776, 4006.45213
df_5p[(df_5p.Mass > window_lo) & (df_5p.Mass < window_hi)]

In [None]:
# Rows of df_5p collected as gap fillers (plus the full-mass dot) by the cell above.
df_gap_dots = df_5p.loc[gap_idxs]
df_gap_dots

In [None]:
# Keep a copy of the current gap dots under a separate name.
# NOTE(review): the `_a`, `_odd` and `_even` copies are identical unless the
# gap-filling cell is re-run with different settings in between — confirm intent.
df_gap_dots_a = df_gap_dots.copy()

In [None]:
# Keep a copy of the current gap dots as the "odd" variant.
# NOTE(review): nothing in this notebook changes df_gap_dots before this copy is
# taken — presumably the gap-filling cell was re-run manually; confirm.
df_gap_dots_odd = df_gap_dots.copy()

In [None]:
# Keep a copy of the current gap dots as the "even" variant.
# NOTE(review): nothing in this notebook changes df_gap_dots before this copy is
# taken — presumably the gap-filling cell was re-run manually; confirm.
df_gap_dots_even = df_gap_dots.copy()

In [None]:
# Plot the gap-filling dots together with the refined ladder.
plot_zones(df_gap_dots, df_sample_refine)

In [None]:
# Combine the ladder dots with the gap-filling dots, order them by Mass and export.
df_merge = pd.concat([df_sample, df_gap_dots]).sort_values('Mass')
df_merge.to_excel('~/Downloads/phe_5p_cca_res.xlsx')

# Presentation

In [None]:
# Plot df_5p_cca and save the figure as a transparent PNG.
# The handle returned by plot_zone gets its own name so that the
# matplotlib.pyplot alias `plt` imported at the top is not overwritten.
zone_plot = plot_zone(df_5p_cca)
zone_plot.savefig('/home/bryan/Downloads/Jan27/cca_5p.png', transparent=True)

In [None]:
# Plot df_sample_refine and save the figure as a transparent PNG.
# The handle returned by plot_zone gets its own name so that the
# matplotlib.pyplot alias `plt` imported at the top is not overwritten.
zone_plot = plot_zone(df_sample_refine)
zone_plot.savefig('/home/bryan/Downloads/Jan27/cca_5p_ref.png', transparent=True)

In [None]:
# Plot the gap dots together with the refined ladder (shift_color=True) and save
# the figure as a transparent PNG.
# The handle returned by plot_zones gets its own name so that the
# matplotlib.pyplot alias `plt` imported at the top is not overwritten.
zones_plot = plot_zones(df_gap_dots, df_sample_refine, shift_color=True)
zones_plot.savefig('/home/bryan/Downloads/Jan27/cca_5p_ref_fill.png', transparent=True)

In [None]:
%run ../modules/utils.ipynb