In [1]:
import os
import sqlite3

import numpy as np
import pandas as pd

from tqdm import tqdm
from bisect import bisect
from loguru  import logger

from multiprocessing import Pool, cpu_count, Manager, Process, Value

pd.set_option('display.max_columns', None)
from check_input import check_db
from preprocessing import build_feature2ndscore, get_db_rid2rn, get_db_rn2fpath, return_pr2tr_id_map, return_nrt_width
from database import get_rid2chrom_conn, close_rid2chrom_conn, get_run_native2chrom_fpath
# from mrgroup import get_cmrg_messages
# from format_data import return_mr_features, initial_format, output_format
# from openswath_feature import get_os_features
# from discriminate import calc_score_cut, calc_results
# from reports import stats

In [2]:
# --- Run configuration ------------------------------------------------------
db_fpath = "/mnt/data_nas/lyc/project/JointAnalysis/work/CCRCC/pyprophet/merged.osw"
chrom_dpath = "/mnt/data_nas/lyc/project/JointAnalysis/work/CCRCC/openswath"
work_dpath = "/mnt/data_nas/lyc/project/JointAnalysis/work/MCB/MCB_MSF_DDALib_top6_10Percent_Lib20240131/mrgd"
n_threads = 64
seed = 123
map_size_exponent = 32
fdr_precursor = 0.01
nrt_width_percent = 0.02

# Size value handed to the key-value store helpers (2 ** exponent).
# Kept under its own name so the exponent is never overwritten in place.
map_size = 2 ** map_size_exponent

# Everything below writes into work_dpath, so make sure it exists first.
os.makedirs(work_dpath, exist_ok=True)
# log_fpath = os.path.join(work_dpath, "MRGDiscrim.log")
# logger.add(log_fpath, format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}", mode="w")
logger.info('MRGDiscrim Workflow')


# --- Validate the merged OSW input -------------------------------------------
logger.info(f'Check db_infile: {db_fpath}')
check_db(db_fpath, logger)

# --- Persist per-feature scores for later lookup -----------------------------
feature2ndscore_fpath = os.path.join(work_dpath, "feature2ndscore.db")
logger.info(f'Save ndscores to db: {feature2ndscore_fpath}')
build_feature2ndscore(db_fpath, feature2ndscore_fpath, map_size)

# --- Lookup tables used by the following cells -------------------------------
logger.info('Organize the necessary inputs')
rid2rn = get_db_rid2rn(db_fpath)
rid_list = list(rid2rn.keys())
rn2chrom_fpath = get_db_rn2fpath(chrom_dpath, "sqMass")
pr2tr_id_map = return_pr2tr_id_map(db_fpath)

nrt_width = return_nrt_width(db_fpath, nrt_width_percent)

[32m2024-04-30 21:14:33.191[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m15[0m - [1mMRGDiscrim Workflow[0m
[32m2024-04-30 21:14:33.192[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m18[0m - [1mCheck db_infile: /mnt/data_nas/lyc/project/JointAnalysis/work/CCRCC/pyprophet/merged.osw[0m
[32m2024-04-30 21:14:33.522[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m22[0m - [1mSave ndscores to db: /mnt/data_nas/lyc/project/JointAnalysis/work/MCB/MCB_MSF_DDALib_top6_10Percent_Lib20240131/mrgd/feature2ndscore.db[0m
[32m2024-04-30 21:18:03.381[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m25[0m - [1mOrganize the necessary inputs[0m


In [3]:
# Two-run subset of the run-name -> chromatogram-file mapping, handy for quick
# experiments on a small part of the data.
rn2chrom_fpath_part = {
    run_name: f'/mnt/data_nas/lyc/project/JointAnalysis/work/CCRCC/openswath/{run_name}.mzML.chrom.sqMass'
    for run_name in (
        'CPTAC_CCRCC_W_JHU_LUMOS_C3L-00966_NAT',
        'CPTAC_CCRCC_W_JHU_LUMOS_C3L-00968_NAT',
    )
}

In [4]:
# Two-run subset of the run-ID -> run-name mapping (same runs as the
# chromatogram-file subset).
rid2rn_part = dict([
    (8686448202532635331, 'CPTAC_CCRCC_W_JHU_LUMOS_C3L-00966_NAT'),
    (9088139040828334229, 'CPTAC_CCRCC_W_JHU_LUMOS_C3L-00968_NAT'),
])

In [11]:
# Build the per-run nativeID -> chromatogram-ID stores under work_dpath.
logger.info('Save nativeID2chromID')
rid2chrom_conn = get_rid2chrom_conn(rid2rn, rn2chrom_fpath)
try:
    # Integer division keeps the size an int; `/` would hand the helper a float.
    rid_native2chromid_fpath = get_run_native2chrom_fpath(rid2chrom_conn, work_dpath, map_size // 8)
finally:
    # Release the chromatogram connections even if building the stores fails.
    close_rid2chrom_conn(rid2chrom_conn)

[32m2024-04-30 21:22:14.595[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mSave nativeID2chromID[0m


In [12]:
logger.info('Get MRGroup')

# Read every precursor ID, closing cursor and connection even on failure.
m_conn = sqlite3.connect(db_fpath)
try:
    m_cur = m_conn.cursor()
    try:
        precursor_rows = m_cur.execute('SELECT ID FROM PRECURSOR').fetchall()
    finally:
        m_cur.close()
finally:
    m_conn.close()

# reshape(-1) always yields a 1-D array; squeeze() would collapse a single-row
# result to a 0-d array and break the shape[0] lookup below.
precursor_ids = np.array(precursor_rows).reshape(-1)

num_precursor = precursor_ids.shape[0]
logger_n = 10 ** (len(str(num_precursor)) - 2)

# Split the IDs into n_threads equally sized contiguous chunks ...
n_precur = num_precursor // n_threads
precurs_list = [precursor_ids[i * n_precur : (i + 1) * n_precur].tolist() for i in range(n_threads)]
# ... then hand the remainder (fewer than n_threads IDs) out one per chunk.
# tolist() keeps every ID a plain Python int, like the chunk contents above.
for i, leftover_id in enumerate(precursor_ids[n_precur * n_threads:].tolist()):
    precurs_list[i].append(leftover_id)

# One manager process serves both shared objects.
manager = Manager()
results_collector = manager.list()
counter = manager.Value('d', 0)
logger.info(f"( {counter.value} / {num_precursor}) precursor has Calculated...")
extractors = []

[32m2024-04-30 21:32:43.173[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mGet MRGroup[0m
[32m2024-04-30 21:32:43.815[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m18[0m - [1m( 0 / 187312) precursor has Calculated...[0m


In [13]:
import lmdb
def get_rid_native2chromid_db(rid_native2chromid_fpath: dict) -> tuple:
    """Open every LMDB store in the mapping and start a read transaction on each.

    Parameters
    ----------
    rid_native2chromid_fpath : dict
        Maps a key (the caller indexes the result by run ID) to the path of an
        already existing LMDB environment; ``create=False`` means a missing
        store is an error rather than being created empty.

    Returns
    -------
    tuple
        ``(envs, txns)``: two dicts with the same keys as the input, holding
        the opened environment and a read-only transaction on it. Neither is
        closed here; that is left to the caller.
    """
    rid_native2chromid_db = {}
    rid_native2chromid_cur = {}
    for key, lmdb_path in rid_native2chromid_fpath.items():
        env = lmdb.open(lmdb_path, create=False)
        rid_native2chromid_db[key] = env
        rid_native2chromid_cur[key] = env.begin(write=False)
    return rid_native2chromid_db, rid_native2chromid_cur

def get_ndscore(f2nds_cur, feature_id) -> float:
    """Look up the score stored for ``feature_id`` and return it as a float.

    Parameters
    ----------
    f2nds_cur
        Any object with a ``get(key: bytes)`` method returning the UTF-8
        encoded score or ``None`` (the notebook passes an LMDB read transaction).
    feature_id
        Feature identifier; its ``str()`` form, UTF-8 encoded, is the key.

    Raises
    ------
    KeyError
        If no value is stored for ``feature_id``.
    """
    raw = f2nds_cur.get(str(feature_id).encode('utf-8'))
    if raw is None:
        # Fail with the offending ID instead of an AttributeError on None.
        raise KeyError(f'no ndscore stored for feature id {feature_id!r}')
    return float(raw.decode('utf-8'))

# Read-only SQLite handle on the merged OSW file (URI form with mode=ro).
m_conn = sqlite3.connect('file:{}?mode=ro'.format(db_fpath), uri=True)

# Feature -> ndscore store plus a read transaction used for the lookups.
f2nds_conn = lmdb.open(feature2ndscore_fpath)
f2nds_cur = f2nds_conn.begin(write=False)

# Per-run nativeID -> chromatogram-ID stores and per-run chromatogram connections.
(rid_native2chromid_db,
 rid_native2chromid_cur) = get_rid_native2chromid_db(rid_native2chromid_fpath)
rid2chrom_conn = get_rid2chrom_conn(rid2rn, rn2chrom_fpath)

# Accumulator for downstream results; starts empty.
results = []

In [14]:
import numba
import PyMSNumpress
import zlib
import numpy as np

from bisect import bisect

@numba.jit(nopython = True)
def smooth_array_nb(arr):
    """Return a smoothed copy of a 1-D array using a 3-point weighted average.

    Interior points use weights 0.25 / 0.5 / 0.25 over (previous, current,
    next); the two end points use weights 2/3 (self) and 1/3 (only neighbour).
    The result has the dtype of ``arr`` (``np.zeros_like``), so pass a float
    array if fractional values must be kept.

    Arrays with fewer than two elements are returned as an unchanged copy:
    they have no neighbour to average with, and indexing ``arr[1]`` /
    ``arr[-2]`` on them would read out of bounds, which compiled nopython
    code does not check by default.
    """
    n = len(arr)
    new_arr = np.zeros_like(arr)
    if n < 2:
        for i in range(n):
            new_arr[i] = arr[i]
        return new_arr
    new_arr[0] = (2 * arr[0] + arr[1]) / 3
    new_arr[-1] = (2 * arr[-1] + arr[-2]) / 3
    for x in range(1, n - 1):
        new_arr[x] = (0.5*arr[x] + 0.25*arr[x-1] + 0.25*arr[x+1])
    return new_arr
def get_chromid_from_rid_native_id(db, search_key):
    """Fetch the value stored under ``search_key`` and return it as text.

    ``db`` is any object exposing ``get(key: bytes)``. The key is UTF-8
    encoded for the lookup; a missing or empty value gives ``None``, otherwise
    the UTF-8 decoded string is returned.
    """
    raw = db.get(search_key.encode('utf-8'))
    if not raw:
        return None
    return raw.decode('utf-8')

In [26]:
import numba
import PyMSNumpress
import zlib
import numpy as np

from bisect import bisect

# def organize_chroms_dimension(run_p_chroms: list, run_p_rts: list) -> np.array:
#     pr_chrom = np.array(run_p_chroms[0])
#     fr_chroms = np.array(run_p_chroms[1:])
#     pr_rt = run_p_rts[0]
#     fr_rt = run_p_rts[1]

#     for left_i in range(len(fr_rt)):
#         left_rtid = bisect(pr_rt, fr_rt[left_i]) - 1
#         if left_rtid >= 0:
#             break
#     if (left_i - left_rtid) == 0:
#         pass
#     elif left_rtid - left_i > 0:
#         pr_rt = pr_rt[left_rtid - left_i :]
#         pr_chrom = pr_chrom[left_rtid - left_i :]
#     elif left_i - left_rtid > 0:
#         fr_chroms = fr_chroms[:, left_i - left_rtid:]
#         fr_rt = fr_rt[left_i - left_rtid:]

#     rev_fr_rt = list(-np.array(fr_rt)[::-1])
#     rev_pr_rt = list(-np.array(pr_rt)[::-1])

#     for right_i in range(len(rev_pr_rt)):
#         right_rtid = bisect(rev_fr_rt, rev_pr_rt[right_i]) - 1
#         if right_rtid >= 0:
#             break
#     if (right_i - right_rtid) == 0:
#         pass
#     elif right_rtid - right_i > 0:
#         fr_rt = fr_rt[: -(right_rtid - right_i)]
#         fr_chroms = fr_chroms[:, : -(right_rtid - right_i)]
#     elif right_i - right_rtid > 0:
#         pr_rt = pr_rt[: -(right_i - right_rtid)]
#         pr_chrom = pr_chrom[: -(right_i - right_rtid)]

#     run_p_xics = np.concatenate((pr_chrom[np.newaxis, :], fr_chroms), axis = 0)

#     return run_p_xics, pr_rt


def organize_chroms_dimension11(run_p_chroms: list, run_p_rts: list) -> tuple:
    """Stack precursor and fragment traces into one 2-D array of equal width.

    ``run_p_chroms[0]`` / ``run_p_rts[0]`` are used as the precursor trace and
    its retention-time grid; ``run_p_chroms[1:]`` are the fragment traces, all
    read against the single grid ``run_p_rts[1]``.

    * If both grids have the same number of points, the traces are stacked
      as they are.
    * Otherwise both sides are trimmed. The start is the first precursor
      interval ``[rt[i], rt[i+1])`` that contains a fragment time point. The
      end is the last fragment interval ``(rt[j-1], rt[j]]`` that contains a
      precursor time point.

    Returns
    -------
    tuple
        ``(xics, rt)``: ``xics`` has one row per trace (precursor first) and
        ``rt`` is the (possibly trimmed) precursor retention-time array.

    Raises
    ------
    ValueError
        If the two grids never overlap, or if the trimmed precursor and
        fragment segments end up with different lengths.
    """
    pr_chrom = np.array(run_p_chroms[0])
    fr_chroms = np.array(run_p_chroms[1:])
    pr_rt = np.array(run_p_rts[0])
    fr_rt = np.array(run_p_rts[1])

    if len(pr_rt) == len(fr_rt):
        return np.array(run_p_chroms), pr_rt

    # Start: first precursor interval holding a fragment point. The range stops
    # at len - 1 so pr_rt[i + 1] never runs past the end of the array.
    psi = fsi = None
    for i in range(len(pr_rt) - 1):
        hits = np.where((fr_rt >= pr_rt[i]) & (fr_rt < pr_rt[i + 1]))[0]
        if hits.size > 0:
            psi, fsi = i, hits[0]
            break
    if psi is None:
        raise ValueError("precursor and fragment retention times do not overlap (no start point)")

    # End: scanning backwards, last fragment interval holding a precursor point.
    pei = fei = None
    for j in range(len(fr_rt) - 1, 0, -1):
        hits = np.where((pr_rt > fr_rt[j - 1]) & (pr_rt <= fr_rt[j]))[0]
        if hits.size > 0:
            pei, fei = hits[0], j
            break
    if pei is None:
        raise ValueError("precursor and fragment retention times do not overlap (no end point)")

    pr_part = pr_chrom[psi: pei + 1]
    fr_part = fr_chroms[:, fsi: fei + 1]
    # Check before stacking so a mismatch is reported with the actual lengths.
    if pr_part.shape[0] != fr_part.shape[1]:
        raise ValueError(
            f"trimmed precursor has {pr_part.shape[0]} points but fragments have {fr_part.shape[1]}"
        )

    run_p_xics = np.concatenate((pr_part[np.newaxis, :], fr_part), axis = 0)
    return run_p_xics, pr_rt[psi: pei + 1]

In [27]:
from chromatographic import organize_chroms_dimension
from tqdm import tqdm

# For every precursor in the first chunk: load its features, attach the stored
# ndscore, then decode and align its chromatograms run by run.
for prec_id in tqdm(precurs_list[0]):
    # Bound parameter instead of string formatting; int() because sqlite3
    # cannot bind NumPy integer scalars.
    p_table = pd.read_sql_query(
        'SELECT * FROM FEATURE WHERE PRECURSOR_ID = ?', m_conn, params=(int(prec_id),)
    )
    if p_table.shape[0] == 0:
        # Nothing to extract for this precursor; the checks below need rows.
        logger.warning(f'No FEATURE rows for precursor {prec_id}, skipping')
        continue
    ndscores = [get_ndscore(f2nds_cur, f_id) for f_id in p_table["ID"]]
    p_table["NORM_DSCORE"] = ndscores

    p_u_rids = p_table["RUN_ID"].unique()
    p_prec_ids = p_table["PRECURSOR_ID"].unique()
    if len(p_prec_ids) != 1:
        logger.error(f'Error')
        # A bare `raise` has nothing to re-raise here; raise explicitly.
        raise RuntimeError(f'expected exactly one PRECURSOR_ID for {prec_id}, got {len(p_prec_ids)}')
    p_prec_id = p_prec_ids[0]
    # Precursor trace first, then the IDs mapped to it in pr2tr_id_map.
    p_native_ids = ["%s_Precursor_i0"%p_prec_id] + pr2tr_id_map[prec_id]

    # NOTE(review): these two dicts are initialised but never filled in this cell.
    p_rid2xics, p_rid2rts = {}, {}
    for p_u_rid in p_u_rids:
        run_conn = rid2chrom_conn[p_u_rid]
        run_cur = run_conn.cursor()
        run_p_rts = []
        run_p_chroms = []
        try:
            for p_native_id in p_native_ids:
                run_p_chrom_id = int(get_chromid_from_rid_native_id(rid_native2chromid_cur[p_u_rid], str(p_native_id)))
                comp_data = run_cur.execute(
                    'SELECT COMPRESSION, DATA FROM DATA WHERE CHROMATOGRAM_ID == ?', (run_p_chrom_id,)
                ).fetchall()

                for comp, xic in comp_data:
                    result = []
                    if comp == 5:
                        # Compression 5: linear-decoded values go to the RT list.
                        PyMSNumpress.decodeLinear(zlib.decompress(xic), result)
                        run_p_rts.append(result)
                    elif comp == 6:
                        # Compression 6: slof-decoded values are smoothed and kept as a trace.
                        PyMSNumpress.decodeSlof(zlib.decompress(xic), result)
                        run_p_chroms.append(smooth_array_nb(np.array(result)).tolist())
                    else:
                        logger.error("Error: PyMSNumpress")
                        raise RuntimeError(f'unsupported COMPRESSION value {comp} for chromatogram {run_p_chrom_id}')
        finally:
            # Close the cursor even when decoding fails part-way.
            run_cur.close()
        run_p_xics, run_pr_rt = organize_chroms_dimension11(run_p_chroms, run_p_rts)

  2%|▏         | 61/2927 [03:05<2:25:27,  3.05s/it]


KeyboardInterrupt: 

In [18]:
# Per trace, print the number of time points next to the number of intensity
# points; the two must match for the trace to be usable.
for rts, chrom in zip(run_p_rts, run_p_chroms):
    print(len(rts), len(chrom))

174 174
174 174
174 174
174 174
174 174
174 174
174 174


In [23]:
# First five entries of the first three lists in run_p_rts, shown side by side.
run_p_rts[0][:5], run_p_rts[1][:5], run_p_rts[2][:5]

([6428.3, 6431.9, 6435.3, 6438.9, 6442.6],
 [6428.3, 6431.8, 6435.3, 6438.8, 6442.5],
 [6428.3, 6431.8, 6435.3, 6438.8, 6442.5])

In [22]:
# Last five entries of the first three lists in run_p_rts, shown side by side.
run_p_rts[0][-5:], run_p_rts[1][-5:], run_p_rts[2][-5:]

([7011.5, 7014.9, 7018.3, 7021.8, 7025.3],
 [7011.4, 7014.8, 7018.3, 7021.7, 7025.2],
 [7011.4, 7014.8, 7018.3, 7021.7, 7025.2])