In [None]:
import pandas as pd
import numpy as np
import matplotlib
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from collections import namedtuple
import networkx as nx
import random

In [None]:
%run ../modules/utils.ipynb
%run ../modules/cds.ipynb
%run ../modules/preprocessing.ipynb

In [None]:
# Load the tRNA_Phe_0724 workbook, report the raw shape, run the frame through
# `thermo_df`, keep rows whose Vol exceeds 1E5 and draw them with `plot_zone`.
# NOTE(review): absolute, user-specific path — consider a configurable data dir.
phe_raw = pd.read_excel('/Users/bryan/Documents/BioPharmaFinder/ZhangLab/Data/Excels/tRNA_Phe_0724.xlsx')
print(phe_raw.shape)
phe_thermo = thermo_df(phe_raw)
df_phe = phe_thermo[phe_thermo.Vol > 1E5]
plot_zone(df_phe)

# Split 3' and 5' Dots

In [None]:
# Positions of the points picked in the interactive plot; both callbacks append here.
idxs = []


def _record_points(points):
    """Append the point positions carried by a plot event to the shared ``idxs`` list."""
    idxs.extend(points.point_inds)


def on_selection(trace, points, selector):
    """Selection callback: echo the selected point positions, then record them."""
    print('points {}'.format(points.point_inds))
    _record_points(points)


def on_click(trace, points, selector):
    """Click callback: record the clicked point positions without printing."""
    _record_points(points)

# Offer only the dots with Mass below 25000 for interactive picking; the
# callbacks defined above collect the picked positions into `idxs`.
mass_below_25k = df_phe.Mass < 25000
df_sample = df_phe.loc[mass_below_25k]
f = zone_selection(df_sample, on_selection=on_selection, on_click=on_click)
f

In [None]:
# Materialise the picked dots. The callbacks only ever append to `idxs`, so a dot
# that is box-selected and then clicked (or selected twice) appears more than
# once; deduplicate in first-seen order so the ladder has no repeated rows.
unique_idxs = list(dict.fromkeys(idxs))
df_chosen = df_sample.iloc[unique_idxs]
df_chosen.shape

In [None]:
# Keep the current selection as the 5' ladder.
# NOTE(review): the 3' cell below binds the very same `df_chosen`; presumably the
# selection cells are re-run with a different pick in between — confirm.
df_5p = df_chosen
df_5p.shape

In [None]:
# Keep the current selection as the 3' ladder.
# NOTE(review): the 5' cell above binds the very same `df_chosen`; presumably the
# selection cells are re-run with a different pick in between — confirm.
df_3p = df_chosen
df_3p.shape

# Put & Get Split Ladders

In [None]:
# Persist the two hand-picked ladders so the manual selection need not be repeated.
# index=False: by default pandas writes the row index as an extra first column,
# which a plain `pd.read_excel(path)` reload brings back as a spurious
# "Unnamed: 0" data column.
# NOTE(review): absolute, user-specific paths — consider a configurable output dir.
df_5p.to_excel('/Users/bryan/Downloads/phe5p.xlsx', index=False)
df_3p.to_excel('/Users/bryan/Downloads/phe3p.xlsx', index=False)

In [None]:
# Reload the saved 5' and 3' ladders from disk and show both shapes.
df_5p, df_3p = (
    pd.read_excel('/Users/bryan/Downloads/phe5p.xlsx'),
    pd.read_excel('/Users/bryan/Downloads/phe3p.xlsx'),
)
(df_5p.shape, df_3p.shape)

# 25k Dots and their connections

In [None]:
# Dots with Mass strictly between 23500 and 25000 and Vol above 1E6,
# shown as a Mass-vs-Vol scatter.
in_mass_window = (df_phe.Mass > 23500) & (df_phe.Mass < 25000)
high_vol = df_phe.Vol > 1E6
df_phe_25k = df_phe[in_mass_window & high_vol]
px.scatter(x=df_phe_25k.Mass, y=df_phe_25k.Vol)

# Generate ladders by MassSum

In [None]:
# Keep only the rows with Vol above 1E5 in each ladder, then show both shapes.
df_3p = df_3p.loc[df_3p.Vol > 1E5]
df_5p = df_5p.loc[df_5p.Vol > 1E5]
(df_3p.shape, df_5p.shape)

In [None]:
# Reference total mass handed to `computational_data_seperation` here and to
# `process_mass_seats` further down.
# NOTE(review): presumably the intact tRNA-Phe mass — confirm where this value comes from.
full_mass = 24581.374101 
# Separate the 3' and 5' ladders against `full_mass` and plot the two results.
# NOTE(review): given the section title, this presumably pairs 3'/5' dots by mass
# sum; the helper is loaded via %run, so verify its behaviour there.
df_3p_cca, df_5p_cca = computational_data_seperation(df_3p, df_5p, full_mass, ignore_endpoints=True)
plot_zones(df_5p_cca, df_3p_cca)

# Refine 5' ladder

In [None]:
# Start a fresh list of picked point positions for this refinement pass;
# both callbacks append here.
idxs = []


def _record_points(points):
    """Append the point positions carried by a plot event to the shared ``idxs`` list."""
    idxs.extend(points.point_inds)


def on_selection(trace, points, selector):
    """Selection callback: echo the selected point positions, then record them."""
    print('points {}'.format(points.point_inds))
    _record_points(points)


def on_click(trace, points, selector):
    """Click callback: record the clicked point positions without printing."""
    _record_points(points)

# Open the interactive selector on the separated 5' ladder. The points picked
# here are collected into `idxs` and removed from the ladder in the next cell.
df_sample = df_5p_cca
f = zone_selection(df_sample, on_selection=on_selection, on_click=on_click)
f

In [None]:
# Remove the dots picked in the selector from the 5' ladder and plot what is left
# as a Mass-vs-RT scatter.
df_chosen = df_sample.iloc[idxs]
rejected_labels = df_chosen.index
df_5p_cca_ref = df_sample.drop(index=rejected_labels)
px.scatter(df_5p_cca_ref, x='Mass', y='RT')

# Mutual Verification

In [None]:
# Mutual verification: for every dot, draw a few random partner dots whose mass
# difference lies within [400, 3000] and count for how many of them
# `components(diff)` returns a non-empty result. The count is stored in 'Hit'.
#
# Changes compared with the previous version of this cell:
#   * the row count is no longer bound to `len`, which shadowed the builtin for
#     every later cell;
#   * hits are accumulated in a plain list and assigned as a whole column instead
#     of `df['Hit'].iloc[i] += 1` (chained assignment, which is not guaranteed to
#     write through and is a no-op under pandas copy-on-write);
#   * partners are drawn with `random.sample` from the eligible rows, so the cell
#     terminates even when a dot has fewer eligible partners than requested
#     (the rejection loop used before would spin forever in that case).
# NOTE(review): the draw is unseeded, so 'Hit' varies between runs; seed `random`
# in a config cell if reproducible counts are needed.
n_rows = df_5p_cca_ref.shape[0]
masses = df_5p_cca_ref['Mass'].tolist()
hit_counts = []
for i in range(n_rows):
    print('Processing {}'.format(masses[i]))
    # The first four rows get 2 partner checks, all later rows get 5.
    n_partners = 5 if i > 3 else 2

    # Eligible partners: any other row whose mass difference is not outside [400, 3000].
    eligible = []
    for j in range(n_rows):
        if j == i:
            continue
        diff = abs(masses[i] - masses[j])
        if diff < 400 or diff > 3000:
            continue
        eligible.append(j)

    hits = 0
    # Sampling without replacement mirrors the old "skip already used j" rule.
    for j in random.sample(eligible, min(n_partners, len(eligible))):
        print('--handle Mass {}'.format(masses[j]))
        df_candidates = components(abs(masses[i] - masses[j]))
        print(df_candidates)
        if not df_candidates.empty:
            print('Hit index {}.'.format(i))
            hits += 1
    hit_counts.append(hits)

df_5p_cca_ref['Hit'] = hit_counts
print(df_5p_cca_ref)
    

In [None]:
# Dots with at most two hits, ordered by Mass.
at_most_two_hits = df_5p_cca_ref.Hit <= 2
df_5p_cca_ref.loc[at_most_two_hits].sort_values('Mass')

In [None]:
# Ad-hoc call to `components` (loaded via %run) for one mass difference; the
# verification loop above counts a non-empty result from this helper as a hit.
components(918.0750643431002)

In [None]:
# Plot the whole refined 5' ladder together with its subset having more than two hits.
more_than_two_hits = df_5p_cca_ref.Hit > 2
plot_zones(df_5p_cca_ref, df_5p_cca_ref.loc[more_than_two_hits])

In [None]:
# Base-call on the dots with more than one hit and overlay those dots on the plot.
# The plotting handle returned by `plot_basecalling` gets its own name: binding it
# to `plt` (as before) overwrote the `matplotlib.pyplot` import for all later cells.
bcr = base_calling_random(df_5p_cca_ref[df_5p_cca_ref.Hit > 1])
basecall_plt, _ = plot_basecalling(*bcr, annotate=False)
df_multi_hit = df_5p_cca_ref[df_5p_cca_ref.Hit > 1]
basecall_plt.scatter(df_multi_hit.Mass, df_multi_hit.RT)

In [None]:
# Mass-vs-RT scatter of the refined 5' ladder (same figure as in the refinement cell).
px.scatter(df_5p_cca_ref, x='Mass', y='RT')

# Put Mass Ladder into Seats

In [None]:
# Dots that received no hit at all during mutual verification.
df_5p_cca_ref.loc[df_5p_cca_ref.Hit == 0]

In [None]:
# Export the refined 5' ladder (including the 'Hit' column, if the verification cell has run).
# Note: `to_excel` also writes the row index as the first column unless `index=False` is passed.
# NOTE(review): absolute, user-specific path — consider a configurable output dir.
df_5p_cca_ref.to_excel('/Users/bryan/Documents/Presentations/Feb 3 2021/phe_5p_cca_ref.xlsx')

In [None]:
# Hand the refined 5' ladder and the reference mass to `process_mass_seats`
# (loaded via %run). `tmp` is only an alias of `df_5p_cca_ref`, not a copy.
tmp = df_5p_cca_ref
process_mass_seats(tmp, full_mass)

# Fill in the Gap

In [None]:
# Peek at the hand-picked 5' ladder that the gap-filling cell below passes to `standalone_dots`.
df_5p.head()

In [None]:
# Walk the refined 5' ladder in ascending Mass order. A dot is kept when
# `components(mass - previous_kept_mass)` returns a non-empty result; for every
# kept dot the pair (previous end, this dot) is passed to `standalone_dots`
# together with the hand-picked 5' ladder, and the result is printed.
#
# Changes compared with the previous version of this cell:
#   * `print(df_ends, df_ends.info())` dumped the info block and then printed a
#     literal "None" (`info()` returns None); only the frame is printed now;
#   * the loop-invariant column list is built once, and the two-row end-point
#     frame is built only for kept dots, directly as float;
#   * the previous end point is carried along instead of being looked up again
#     with `df_sample.loc[idxs[-1]]`, which yields a frame rather than a row
#     when index labels repeat.
TAG_5P = 79.9663
TAG_3OH = 18.0106

columns = ['Mass', 'RT', 'Vol']
df_sample = df_5p_cca_ref.sort_values('Mass')
idxs = list()       # index labels of the dots that were kept
gap_idxs = list()   # never filled in this cell; kept in case a later cell reads it

# Synthetic start of the ladder: Mass = TAG_5P + TAG_3OH, RT = 0.001, Vol = 0.
prev_end = [TAG_5P + TAG_3OH, 0.001, 0]
for idx, row in df_sample.iterrows():
    print('Processing {}'.format(row.Mass))
    pre = prev_end[0]
    res = components(row.Mass - pre)
    if res.empty:
        # Mass step from the previous kept dot yields nothing: skip this dot.
        continue

    # keep this dot, and fill in the gap
    idxs.append(idx)
    cur_end = row[columns].tolist()
    df_ends = pd.DataFrame([prev_end, cur_end], index=[0, 1], columns=columns).astype(float)
    print(df_ends)
    df_alones = standalone_dots(df_5p, df_ends)
    print(df_alones)
    prev_end = cur_end

In [None]:
%run ../modules/utils.ipynb

In [None]:
# Load the merged_cca_cc_5p workbook used for the base-calling figure below.
# NOTE(review): absolute, user-specific path — consider a configurable data dir.
df_m = pd.read_excel('/Users/bryan/Documents/Presentations/Jan 27 2021/merged_cca_cc_5p.xlsx')

In [None]:
# Base-call on the de-duplicated merged ladder and save the figure with a
# transparent background. The handle returned by `plot_basecalling` gets its own
# name: binding it to `plt` (as before) overwrote the `matplotlib.pyplot` import.
bcr = base_calling_random(df_m.drop_duplicates())
basecall_plt, _ = plot_basecalling(*bcr, annotate=False)
basecall_plt.savefig('/Users/bryan/Documents/Presentations/Jan 27 2021/merged_cca_cc_5p_basecalls.png', transparent=True)