In [2]:
import os
import sqlite3

import numpy as np
import pandas as pd

from tqdm import tqdm
from bisect import bisect
from loguru  import logger

from multiprocessing import Pool, cpu_count, Manager, Process, Value

# pd.set_option('display.max_columns', None)
from check_input import check_db
from preprocessing import build_feature2ndscore, get_db_rid2rn, get_db_rn2fpath, return_pr2tr_id_map, return_nrt_intervel
from database import get_rid2chrom_conn, close_rid2chrom_conn, get_run_native2chrom_fpath
from mrgroup import get_cmrg_messages
from format_data import return_mr_features, initial_format, output_format
from openswath_feature import get_os_features
from discriminate import calc_score_cut, calc_results


# Forwarded unchanged to get_cmrg_messages in the MRGroup cell.
debug_mode = False
# map_size is a power-of-two exponent here: a later cell replaces it with
# 2 ** map_size, i.e. 2 ** 36 bytes = 64 GiB (2 ** 32 would be 4 GiB).
map_size = 36
# NOTE(review): fdr_precursor is not referenced by any cell visible in this notebook.
fdr_precursor = 0.01

# Both percentages are passed to return_nrt_intervel together with the database path.
nrt_interval_percent = 5e-4
nrt_width_percent = 0.02

# n_mrg and min_nuf are forwarded to get_cmrg_messages.
n_mrg = 3
min_nuf = 2

# Number of worker processes started for the MRGroup step; also passed to calc_results.
n_threads = cpu_count()

# Passed to calc_results as its seed argument.
seed = 123

# NOTE(review): absolute, machine-specific paths; adjust before running elsewhere.
# db_fpath: SQLite database that the notebook queries (PRECURSOR table).
# chrom_dpath: directory searched by get_db_rn2fpath for "sqMass" files.
# work_dpath: output directory (created if missing) for the log and intermediate databases.
db_fpath = "/mnt/data_nas/lyc/project/JointDIA/work/002-MCB/DDALib/10Rawdatas-10Percent-top6-Lib20240131/openswath/test_pp/merged.osw"
chrom_dpath = "/mnt/data_nas/lyc/project/JointDIA/work/002-MCB/DDALib/10Rawdatas-10Percent-top6-Lib20240131/openswath/test_osw"
work_dpath = "/mnt/data_nas/lyc/project/JointAnalysis/work/MCB/MCB_MSF_DDALib_top6_10Percent_Lib20240311/test_jointAnalysis"

ModuleNotFoundError: No module named 'PyMSNumpress'

In [2]:
# The config cell stores map_size as a power-of-two exponent. Convert it to a
# byte count only while it still looks like an exponent (any real byte size is
# far above 64), so re-running this cell cannot evaluate 2 ** (2 ** 36).
if map_size < 64:
    map_size = 2 ** map_size

# exist_ok=True replaces the racy exists()-then-makedirs() pair.
os.makedirs(work_dpath, exist_ok=True)

# mode="w" truncates the log file each time the sink is added.
log_fpath = os.path.join(work_dpath, "JointAnalysis.log")
logger.add(log_fpath, format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}", mode="w")
logger.info('JointAnalysis Workflow')

[32m2024-03-30 09:37:19.188[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mJointAnalysis Workflow[0m


In [4]:
# Step 1: run check_db on the input database.
logger.info(f'Check db_infile: {db_fpath}')
check_db(db_fpath, logger)

# Step 2: write the feature2ndscore database into the working directory.
feature2ndscore_fpath = os.path.join(work_dpath, "feature2ndscore.db")
logger.info(f'Save ndscores to db: {feature2ndscore_fpath}')
build_feature2ndscore(db_fpath, feature2ndscore_fpath, map_size)

# Step 3: collect the lookup tables used by the later cells.
logger.info('Organize the necessary inputs')
rid2rn = get_db_rid2rn(db_fpath)
rid_list = list(rid2rn.keys())
rn2chrom_fpath = get_db_rn2fpath(chrom_dpath, "sqMass")
pr2tr_id_map = return_pr2tr_id_map(db_fpath)

# Unpack the pair returned by return_nrt_intervel.
nrt_intervel, nrt_width = return_nrt_intervel(
    db_fpath,
    nrt_interval_percent,
    nrt_width_percent,
)

[32m2024-03-30 09:40:14.467[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mCheck db_infile: /mnt/data_nas/lyc/project/JointDIA/work/002-MCB/DDALib/10Rawdatas-10Percent-top6-Lib20240131/openswath/test_pp/merged.osw[0m
[32m2024-03-30 09:40:14.578[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mSave ndscores to db: /mnt/data_nas/lyc/project/JointAnalysis/work/MCB/MCB_MSF_DDALib_top6_10Percent_Lib20240311/test_jointAnalysis/feature2ndscore.db[0m
[32m2024-03-30 09:41:12.547[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mOrganize the necessary inputs[0m


In [5]:
logger.info('Save nativeID2chromID')
rid2chrom_conn = get_rid2chrom_conn(rid2rn, rn2chrom_fpath)
try:
    # Floor division keeps the size an int; "/" would hand a float to the helper.
    rid_native2chromid_fpath = get_run_native2chrom_fpath(rid2chrom_conn, work_dpath, map_size // 8)
finally:
    # Release the chromatogram connections even if the step above raises.
    close_rid2chrom_conn(rid2chrom_conn)

[32m2024-03-30 09:41:56.097[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mSave nativeID2chromID[0m


In [6]:
logger.info('Get MRGroup')

# Read every precursor ID; try/finally closes the connection even if the query fails.
m_conn = sqlite3.connect(db_fpath)
try:
    m_cur = m_conn.cursor()
    m_cur.execute('SELECT ID FROM PRECURSOR')
    # Flatten the single-column rows explicitly: squeeze() would collapse a
    # one-row result to a 0-d array and break the .shape[0] lookup below.
    precursor_ids = np.array([row[0] for row in m_cur.fetchall()])
    m_cur.close()
finally:
    m_conn.close()

num_precursor = precursor_ids.shape[0]
# Two orders of magnitude below the precursor count (10000 for 243530), floored
# at 1 so that fewer than 10 precursors do not produce 0.1. The value is handed
# to the workers; the captured log shows progress lines in steps of this size.
logger_n = max(1, 10 ** (len(str(num_precursor)) - 2))

# One chunk per worker. array_split spreads the remainder over the leading
# chunks, and tolist() makes every ID a plain Python int (the previous manual
# top-up appended numpy scalars next to Python ints).
precurs_list = [chunk.tolist() for chunk in np.array_split(precursor_ids, n_threads)]

# A single manager process serves both shared objects and is shut down when
# the with-block exits, including on error.
with Manager() as manager:
    results_collector = manager.list()
    counter = manager.Value('d', 0)
    logger.info(f"( {counter.value} / {num_precursor}) precursor has Calculated...")

    extractors = []
    for precur_ids in precurs_list:
        p = Process(target=get_cmrg_messages,
                    args=(precur_ids, db_fpath, feature2ndscore_fpath, rid_native2chromid_fpath,
                          pr2tr_id_map, rid2rn, rn2chrom_fpath, nrt_intervel, nrt_width,
                          n_mrg, min_nuf, logger_n, debug_mode, results_collector, counter,
                          num_precursor, logger))
        p.daemon = True
        extractors.append(p)
        p.start()
    for p in extractors:
        p.join()

    # A worker that died would otherwise leave results_collector silently incomplete.
    failed_pids = [p.pid for p in extractors if p.exitcode != 0]
    if failed_pids:
        raise RuntimeError(f"get_cmrg_messages worker(s) exited abnormally: pids {failed_pids}")

    logger.info('Get MR features')
    # Must run while the manager is alive: results_collector is a proxy object.
    mr_iter_features = return_mr_features(results_collector)
del results_collector, counter

logger.info('Get OS features')
target_fids = mr_iter_features["FEATURE_ID"].values
os_feature_pd = get_os_features(target_fids, db_fpath)



[32m2024-03-30 09:43:35.122[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mGet MRGroup[0m
[32m2024-03-30 09:43:36.538[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m18[0m - [1m( 0 / 243530) precursor has Calculated...[0m
[32m2024-03-30 09:48:16.288[0m | [1mINFO    [0m | [36mmrgroup[0m:[36mget_cmrg_messages[0m:[36m253[0m - [1m( 10000 / 243530) precursor has Calculated...[0m
[32m2024-03-30 09:51:01.770[0m | [1mINFO    [0m | [36mmrgroup[0m:[36mget_cmrg_messages[0m:[36m253[0m - [1m( 20000 / 243530) precursor has Calculated...[0m
[32m2024-03-30 09:53:51.180[0m | [1mINFO    [0m | [36mmrgroup[0m:[36mget_cmrg_messages[0m:[36m253[0m - [1m( 30000 / 243530) precursor has Calculated...[0m
[32m2024-03-30 09:56:47.655[0m | [1mINFO    [0m | [36mmrgroup[0m:[36mget_cmrg_messages[0m:[36m253[0m - [1m( 40000 / 243530) precursor has Calculated...[0m
[32m2024-03-30 09:59:34.483[0m | [1mINFO    [0m | 

In [7]:
# Combine the MR features with the OpenSWATH features via initial_format.
# NOTE(review): mr_iter_features is overwritten in place, so re-running this
# cell feeds the already formatted frame back into initial_format.
logger.info('Initial format')
mr_iter_features = initial_format(
    db_fpath,
    mr_iter_features,
    os_feature_pd,
)



[32m2024-03-30 10:39:28.885[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mInitial format[0m


In [8]:
# Display the column index of the frame returned by initial_format.
mr_iter_features.columns

Index(['PRECURSOR_ID', 'MRG_RANK', 'NUF', 'MR_JOINT_SCORE_SUM',
       'MR_JOINT_SCORE', 'MR_SCORE_1', 'MR_SCORE_2', 'MR_SCORE_3',
       'MR_SCORE_4', 'MR_SCORE_5', 'MR_SCORE_6', 'MR_SCORE_7', 'MR_SCORE_8',
       'MR_SCORE_9', 'decoy', 'run_id', 'RT', 'delta_rt', 'leftWidth',
       'rightWidth', 'Intensity', 'aggr_prec_Peak_Area', 'aggr_prec_Peak_Apex',
       'VAR_MASSDEV_SCORE_MS1', 'VAR_MI_SCORE_MS1',
       'VAR_MI_CONTRAST_SCORE_MS1', 'VAR_MI_COMBINED_SCORE_MS1',
       'VAR_ISOTOPE_CORRELATION_SCORE_MS1', 'VAR_ISOTOPE_OVERLAP_SCORE_MS1',
       'VAR_XCORR_COELUTION_MS1', 'VAR_XCORR_COELUTION_CONTRAST_MS1',
       'VAR_XCORR_COELUTION_COMBINED_MS1', 'VAR_XCORR_SHAPE_MS1',
       'VAR_XCORR_SHAPE_CONTRAST_MS1', 'VAR_XCORR_SHAPE_COMBINED_MS1',
       'VAR_BSERIES_SCORE_MS2', 'VAR_DOTPROD_SCORE_MS2',
       'VAR_INTENSITY_SCORE_MS2', 'VAR_ISOTOPE_CORRELATION_SCORE_MS2',
       'VAR_ISOTOPE_OVERLAP_SCORE_MS2', 'VAR_LIBRARY_CORR_MS2',
       'VAR_LIBRARY_DOTPROD_MS2', 'VAR_LIBRARY_M

In [10]:
    # Every column not listed in ignored_columns is handed to calc_results as a
    # scored column.
    logger.info('Discriminate')
    ignored_columns = [
        "PRECURSOR_ID",
        "decoy",
        "run_id",
        "RT",
        "delta_rt",
        "rightWidth",
        "leftWidth",
        "Intensity",
        "aggr_prec_Peak_Area",
        "aggr_prec_Peak_Apex",
    ]
    iter_mr_columns = [
        name
        for name in mr_iter_features.columns
        if name not in ignored_columns
    ]

    mr_iter_res = calc_results(
        data_pd=mr_iter_features,
        scored_columns=iter_mr_columns,
        initial_column="MRG_RANK",
        initial_ascending=True,
        n_threads=n_threads,
        seed=seed,
    )

    # NOTE(review): mr_iter_res is replaced by its formatted version here.
    logger.info('Output results')
    mr_iter_res = output_format(db_fpath, mr_iter_res)

[32m2024-03-30 19:10:54.646[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mDiscriminate[0m








[32m2024-03-30 19:30:37.350[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1mOutput results[0m


In [12]:
 # Build the report table from the scored results.
# NOTE(review): the "Discriminate" cell already reassigns mr_iter_res to
# output_format(db_fpath, mr_iter_res); confirm this second pass is intended.
results = output_format(db_fpath, mr_iter_res)

# transition_group_id is FullPeptideName + "_" + Charge, prefixed with "DECOY_"
# for rows whose decoy value is not 0.
trans = [
    ("" if decoy == 0 else "DECOY_") + mseq + "_" + str(charge)
    for mseq, charge, decoy in zip(results["FullPeptideName"].values,
                                   results["Charge"].values,
                                   results["decoy"].values)
]
results["transition_group_id"] = trans

# Columns kept in the report, in output order.
report_columns = [
    "transition_group_id",
    "decoy",
    "run_id",
    "filename",
    "RT",
    "assay_rt",
    "delta_rt",
    "iRT",
    "Sequence",
    "FullPeptideName",
    "Charge",
    "mz",
    "Intensity",
    "aggr_prec_Peak_Area",
    "aggr_prec_Peak_Apex",
    "leftWidth",
    "rightWidth",
    "ProteinName",
    "jd_score",
]
results_format = results.loc[:, report_columns]

In [37]:
# Preview the first four rows of the report table.
results_format.head(n=4)

Unnamed: 0,transition_group_id,decoy,run_id,filename,RT,assay_rt,delta_rt,iRT,Sequence,FullPeptideName,Charge,mz,Intensity,aggr_prec_Peak_Area,aggr_prec_Peak_Apex,leftWidth,rightWidth,ProteinName,jd_score
0,.(UniMod:1)AAAAAAAAAAAAGDSDSWDADTFSMEDPVRK_3,0,7979900529762267651,/mnt/data_nas/lyc/project/JointDIA/data/MCB/BG...,6626.1,6681.5413,-55.4413,82.9356,AAAAAAAAAAAAGDSDSWDADTFSMEDPVRK,.(UniMod:1)AAAAAAAAAAAAGDSDSWDADTFSMEDPVRK,3,1018.124485,5947190.0,4173890.0,1751130.0,6601.324707,6638.442383,sp|Q66JS6|EI3JB_MOUSE,0.961575
1,.(UniMod:1)AAAAAAAAAAGAAGGR_2,0,7979900529762267651,/mnt/data_nas/lyc/project/JointDIA/data/MCB/BG...,4557.35,4606.799,-49.449,58.1088,AAAAAAAAAAGAAGGR,.(UniMod:1)AAAAAAAAAAGAAGGR,2,620.823278,24966500.0,311562100.0,78170650.0,4540.854492,4573.848633,sp|Q8CCS6|PABP2_MOUSE,0.999886
2,.(UniMod:1)AAAAAAAGGAALAVSTGLETATLQK_2,0,7979900529762267651,/mnt/data_nas/lyc/project/JointDIA/data/MCB/BG...,7360.27,7421.6273,-61.3573,91.7463,AAAAAAAGGAALAVSTGLETATLQK,.(UniMod:1)AAAAAAAGGAALAVSTGLETATLQK,2,1114.100071,4590070.0,5403477.0,1948072.0,7344.550781,7378.718262,sp|Q9CQ25|MZT2_MOUSE,0.996836
3,.(UniMod:1)AAAAAAAGGAALAVSTGLETATLQK_3,0,7979900529762267651,/mnt/data_nas/lyc/project/JointDIA/data/MCB/BG...,7360.34,7420.5903,-60.2503,91.7471,AAAAAAAGGAALAVSTGLETATLQK,.(UniMod:1)AAAAAAAGGAALAVSTGLETATLQK,3,743.069139,1466010.0,1120476.0,812546.8,7347.208008,7374.554199,sp|Q9CQ25|MZT2_MOUSE,0.99959


In [39]:
from reports import stats

In [40]:
# Pass the report table to stats with "jd_score" as the column argument and the
# shared logger; the return value replaces results_format.
# NOTE(review): re-running this cell applies stats to its own previous output.
results_format = stats(results_format, "jd_score", logger)