In [66]:
import os
import json
import pandas as pd
import json
from components import db_functions
from components.parsing import parse_parameters
parameters = parse_parameters('parameters.json')
rlist = pd.read_excel('combined runlist.xlsx')
db_file = os.path.join('data','db','proteogyver.db')
db_conn = db_functions.create_connection(db_file)
current = db_functions.get_full_table_as_pd(db_conn, 'ms_runs')
db_conn.close()

In [47]:
ms_cols = [
    'run_id TEXT PRIMARY KEY',
    'run_name TEXT NOT NULL',
    'sample_name TEXT NOT NULL',
    'file_name TEXT NOT NULL',
    'run_time TEXT NOT NULL',
    'run_date TEXT NOT NULL',
    'instrument TEXT NOT NULL',
    'author TEXT NOT NULL',
    'sample_type TEXT NOT NULL',
    'run_type TEXT NOT NULL',
    'lc_method TEXT NOT NULL',
    'ms_method TEXT NOT NULL',
    'num_precursors INTEGER NOT NULL',
    'bait TEXT',
    'bait_uniprot TEXT',
    'bait_mutation TEXT',
    'chromatogram_max_time INTEGER NOT NULL',
    'cell_line_or_material TEXT',
    'project TEXT',
    'author_notes TEXT',
    'bait_tag TEXT'
]
keytypes = {
    'auc': 'REAL NOT NULL',
    'intercepts': 'INTEGER NOT NULL',
    'avg_peaks_per_timepoint': 'REAL NOT NULL',
    'mean_intensity': 'INTEGER NOT NULL',
    'max_intensity': 'INTEGER NOT NULL',
    'json': 'TEXT NOT NULL',
    'trace': 'TEXT NOT NULL', 
    'intercept_json': 'TEXT NOT NULL'
}
for typ in ['MSn_filtered','TIC','MSn_unfiltered']:
    for key in ['auc','intercepts','avg_peaks_per_timepoint','mean_intensity','max_intensity', 'json','trace', 'intercept_json']:
        ms_cols.append(f'{typ.lower()}_{key.lower()} {keytypes[key]}')
        

In [295]:
data_path = '/media/kmsaloka/Expansion/20241025 tics4'
old = os.path.join(data_path,'ms_runs')
new = os.path.join(data_path,'newone','ms_runs')
old = [old, os.listdir(old)]
new = [new, os.listdir(new)]

In [114]:
if False: # only need to run once to fix dicts
    for fname in new[1]:
        with open(os.path.join(new[0],fname)) as fil:
            tjson = json.load(fil)
        for k in [k for k in tjson.keys() if '_sers' in k]:
            del tjson[k]
        with open(os.path.join(new[0], fname),'w',encoding='utf8') as fil:
            json.dump(tjson,fil,indent=2)

In [15]:
nt = [[],[]]
from datetime import datetime
print('start', datetime.now())
errs = []
for n in new[1]:
    with open(os.path.join(new[0],n)) as fil:
        j = json.load(fil)
    sid = j['SampleID']
    if 'Tomppa' in sid:
        sid = sid.split('_')[0]
    try:
        sid = int(sid)
    except ValueError:
        errs.append([n, sid])
        continue
    if j['MSname']=='Tomppa':
        nt[1].append(sid)
    else:
        nt[0].append(sid)
print('new done', datetime.now())
ot = [[],[]]
for n in old[1]:
    with open(os.path.join(old[0],n)) as fil:
        j = json.load(fil)
    sid = j['SampleID']
    if 'Tomppa' in sid:
        sid = sid.split('_')[0]
    try:
        sid = int(sid)
    except ValueError:
        errs.append([n, sid])
        continue
    if j['MSname']=='Tomppa':
        ot[1].append(sid)
    else:
        ot[0].append(sid)
     
print('old done', datetime.now())

start 2024-11-15 10:14:08.232504
new done 2024-11-15 10:25:32.887990
old done 2024-11-15 10:36:52.728568


In [83]:
def get_mstable_insert(ms_cols, ms_run_datadir, runlist, time_format,runs_done=None,banned_run_dirs = None):
    ms_runs_insert_sql = []
    data_to_enter = []
    failed_json_files = []
    if banned_run_dirs is None:
        banned_run_dirs = [
            'BRE_20_xxxxx_Helsinki',
            'TrapTrouble_3'
        ]
    if runs_done is None:
        runs_done = set()
    for i, datafilename in enumerate(os.listdir(ms_run_datadir)):
        if i % 100 == 0:
            print(i)
        with open(os.path.join(ms_run_datadir, datafilename)) as fil:
            try:
                dat = json.load(fil)
            except json.JSONDecodeError:
                failed_json_files.append(['json decode error', datafilename, ''])
                continue
        if dat['SampleID'] in runs_done: continue
        if dat['SampleInfo'] == ['']: continue
        banned = False
        for b in banned_run_dirs:
            if b in dat['SampleInfo']['SampleTable']['AnalysisHeader']['@FileName']:
                banned = True
        if banned:
            failed_json_files.append(['banned',datafilename, dat])
            continue
        runs_done.add(dat['SampleID'])
        lc_method = None
        ms_method = None
        if isinstance(dat['SampleInfo'], list):
            failed_json_files.append(['no sample info',datafilename, dat])
            continue
        if not 'polarity_1' in dat:
            failed_json_files.append(['no polarity',datafilename, dat])
            continue
        for propdic in dat['SampleInfo']['SampleTable']['SampleTableProperties']['Property']:
            if propdic['@Name'] == 'HyStar_LC_Method_Name':
                lc_method = propdic['@Value']
            if propdic['@Name'] == 'HyStar_MS_Method_Name':
                ms_method = propdic['@Value']
        sample_names = {
            dat['SampleInfo']['SampleTable']['Sample']['@SampleID'],
            dat['SampleInfo']['SampleTable']['Sample']['@SampleID']+'.d',
            dat['SampleInfo']['SampleTable']['Sample']['@DataPath'],
        }
        samplerow = runlist[runlist['Raw file'].isin(sample_names)]
        if (lc_method is None) or (ms_method is None):
            failed_json_files.append(['lc or ms method is none', datafilename, ''])
            continue
        if len([k for k in dat.keys() if 'polarity' in k]) > 1:
            failed_json_files.append(['too much polarity in keys', datafilename, ''])
            continue
        if samplerow.shape[0] == 0:
            samplerow = pd.Series(index = samplerow.columns, data = ['No data' for c in samplerow.columns])
        else:
            samplerow = samplerow.iloc[0]
        instrument = 'TimsTOF 1'
        frame_df_name = f'{instrument} {dat["SampleID"]}'
        frame_df = pd.read_json(json.dumps(dat['Frames']),orient='split')
        runtime = datetime.strftime(
            datetime.strptime(
                dat['SampleInfo']['SampleTable']['AnalysisHeader']['@CreationDateTime'].split('+')[0],
                '%Y-%m-%dT%H:%M:%S'
            ),
            time_format
        )    
        samplename = samplerow['Sample name']
        author = samplerow['Who']
        sample_type = samplerow['Sample type']
        bait = samplerow['Bait name']
        bait_uniprot = samplerow['Bait / other uniprot or ID']
        bait_mut = samplerow['Bait mutation']
        cell_line = samplerow['Cell line / material']
        project = samplerow['Project']
        author_notes = samplerow['Notes']
        bait_tag = samplerow['tag']
        try:
            precur = dat['NumPrecursors']
        except KeyError:
            precur = 'No precursor data'
        ms_run_row = [
            dat['SampleID'],
            dat['SampleInfo']['SampleTable']['AnalysisHeader']['@SampleID'],
            samplename,
            dat['SampleInfo']['SampleTable']['AnalysisHeader']['@FileName'],
            runtime,
            runtime.split()[0],
            instrument,
            author,
            sample_type,
            dat['DataType'],
            lc_method,
            ms_method,
            precur,
            bait,
            bait_uniprot,
            bait_mut,
            len(pd.Series(dat['polarity_1']['tic df']['Series'])),
            cell_line,
            project,
            author_notes,
            bait_tag
        ]
        for dataname in ['bpc filtered df', 'tic df', 'bpc unfiltered df']:
            ms_run_row.extend([
                dat['polarity_1'][dataname]['auc'],
                dat['polarity_1'][dataname]['intercepts'],
                dat['polarity_1'][dataname]['peaks_per_timepoint'],
                dat['polarity_1'][dataname]['mean_intensity'],
                dat['polarity_1'][dataname]['max_intensity'],
                json.dumps(dat['polarity_1'][dataname]['Series']),
                dat['polarity_1'][dataname]['trace'],
                json.dumps(dat['polarity_1'][dataname]['intercept_dict']),
            ])   
        
        data_to_enter.append(ms_run_row)
    for data in data_to_enter:
        add_str = f'INSERT INTO ms_runs ({", ".join([c.split()[0] for c in ms_cols])}) VALUES ({", ".join(["?" for _ in ms_cols])})'
        ms_runs_insert_sql.append([add_str, data])
    return (ms_runs_insert_sql, failed_json_files)

In [115]:
new_dat, new_failed_fils = get_mstable_insert(ms_cols, os.path.join(data_path,'newone','ms_runs'), rlist, parameters['Config']['Time format'])
#old_dat, old_failed_fils = get_mstable_insert(ms_cols, os.path.join(data_path,'ms_runs'), rlist, parameters['Config']['Time format'])

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
11300
11400
11500
11600
11700
11800
11900
12000
12100
12200
12300
12400
12500
12600
12700
12800
12900
13000
13100
13200
13300
13400
13500
13600
13700
13800
13900
14000
14100
14200
14300
14400
14500
14600
14700
14800
14900


In [116]:
import pickle
def save_object(obj, filename):
    with open(filename, 'wb') as outp:  # Overwrites any existing file.
        pickle.dump(obj, outp, pickle.HIGHEST_PROTOCOL)
data_so_far = {
    'new_dat': new_dat,
    'new_failed_fils': new_failed_fils,
    'old_dat': old_dat,
    'old_failed_fils': old_failed_fils
}
save_object(data_so_far, 'data_so_far.pkl')

In [146]:
ucols = new_dat[0][0].split(')')[0].split('(')[1].split(', ')
odf = pd.DataFrame(columns=ucols[1:],data=[n[1][1:] for n in old_dat],index=[n[1][0] for n in old_dat])
odf.index.name='run_id'
ndf = pd.DataFrame(columns=ucols[1:],data=[n[1][1:] for n in new_dat],index=[n[1][0] for n in new_dat])
ndf.index.name='run_id'

In [159]:
new_inst = []
for index,row in ndf.iterrows():
    if 'Tomppa' in index:
        new_inst.append('Tomppa')
    else:
        new_inst.append('Timppa')
ndf['instrument'] = new_inst
new_inst = []
new_ind = []
for index,row in odf.iterrows():
    if 'Tomppa' in row['file_name']:
        new_inst.append('Tomppa')
        new_ind.append(f'{index}_Tomppa')
    else:
        new_inst.append('Timppa')
        new_ind.append(index)
odf.index = new_ind
odf['instrument'] = new_inst
odf.index.name='run_id'

In [268]:
ne = ndf[~ndf['file_name'].str.contains('Blank')]
ol = odf[~odf['file_name'].str.contains('Blank')]
ol = ol[~ol['file_name'].str.contains('K562')]
ne = ne[~ne['file_name'].str.contains('K562')]
ne = ne[ne.index.isin(ol.index)].sort_index()
ol = ol[ol.index.isin(ne.index)].sort_index()
ol['num_precursors'] = ol['num_precursors'].replace('No precursor data',-1)
ne['num_precursors'] = ne['num_precursors'].replace('No precursor data',-1)
diffs = {}
ndiffs = 0
for i, row in ne.iterrows():
    did = False
    for c in ne.columns:
        if pd.isna(row[c]) and pd.isna(ol.loc[i][c]):
            continue
        if row[c] != ol.loc[i][c]:
            if c not in diffs: diffs[c] = set()
            diffs[c].add(i)
            did = True
    if did:
        ndiffs += 1

In [283]:
odf.index = odf.index.astype(str)
ndf.index = ndf.index.astype(str)

In [287]:
odf[~odf.index.isin(ndf.index)].iloc[-1]['file_name']

'D:\\Data\\Evosep runs\\Tanja\\TRIM47_Mikko_Mayranpaa\\WT_N3_BIO_S1-D3_1_2318.d'

In [299]:
import numpy as np
np.log2([1,2,3])

array([0.       , 1.       , 1.5849625])

In [293]:
import shutil

In [292]:
[i for i, n in enumerate(new_failed_fils) if '2318' in n[1]]

[]

In [285]:
ndf[~ndf.index.isin(odf.index)]

Unnamed: 0_level_0,run_name,sample_name,file_name,run_time,run_date,instrument,author,sample_type,run_type,lc_method,...,tic_trace,tic_intercept_json,msn_unfiltered_auc,msn_unfiltered_intercepts,msn_unfiltered_avg_peaks_per_timepoint,msn_unfiltered_mean_intensity,msn_unfiltered_max_intensity,msn_unfiltered_json,msn_unfiltered_trace,msn_unfiltered_intercept_json
run_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1143_Tomppa,Blank,No data,D:\Data\Blanks\1143_Tomppa_Blank_S2-H4.d,2024-04-26 15:16:04,2024-04-26,Tomppa,No data,No data,DIAPASEF,60 samples per day.m,...,"{""name"":""1143_Tomppa"",""x"":[0,1,2,3,4,6,7,8,9,1...","{""52"": 1547970.4784810126, ""53"": 1547970.47848...",6.222790e+10,37,4.980237,51520.153968,882644,"{""0"": 8662.0, ""1"": 10751.0, ""2"": 9170.0, ""3"": ...","{""name"":""1143_Tomppa"",""x"":[0,1,2,3,4,5,6,7,8,9...","{""674"": 51520.153968253966, ""676"": 51520.15396..."
1146_Tomppa,Blank,No data,D:\Data\Blanks\1146_Tomppa_Blank_S2-A5.d,2024-04-26 16:26:54,2024-04-26,Tomppa,No data,No data,DIAPASEF,60 samples per day.m,...,"{""name"":""1146_Tomppa"",""x"":[0,1,2,3,4,5,7,8,9,1...","{""0"": 525703.8160337553, ""25"": 525703.81603375...",2.478740e+10,25,4.980237,21104.191270,1665698,"{""0"": 14884.0, ""1"": 11555.0, ""2"": 11035.0, ""3""...","{""name"":""1146_Tomppa"",""x"":[0,1,2,3,4,5,6,7,8,9...","{""8"": 21104.19126984127, ""9"": 21104.1912698412..."
1149_Tomppa,Blank,No data,D:\Data\Blanks\1149_Tomppa_Blank_S2-B5.d,2024-04-26 17:37:43,2024-04-26,Tomppa,No data,No data,DIAPASEF,60 samples per day.m,...,"{""name"":""1149_Tomppa"",""x"":[0,1,2,3,4,6,7,8,9,1...","{""0"": 461727.54261603375, ""3"": 461727.54261603...",2.172553e+10,41,4.980237,18842.750000,830405,"{""0"": 16238.0, ""1"": 15517.0, ""2"": 16275.0, ""3""...","{""name"":""1149_Tomppa"",""x"":[0,1,2,3,4,5,6,7,8,9...","{""178"": 18842.75, ""179"": 18842.75, ""523"": 1884..."
1152_Tomppa,Blank,No data,D:\Data\Blanks\1152_Tomppa_Blank_S2-C5.d,2024-04-26 18:48:31,2024-04-26,Tomppa,No data,No data,DIAPASEF,60 samples per day.m,...,"{""name"":""1152_Tomppa"",""x"":[0,1,2,3,4,6,7,8,9,1...","{""0"": 357244.29620253167, ""33"": 357244.2962025...",1.340310e+10,67,4.980237,12953.590476,289766,"{""0"": 15101.0, ""1"": 14291.0, ""2"": 13589.0, ""3""...","{""name"":""1152_Tomppa"",""x"":[0,1,2,3,4,5,6,7,8,9...","{""0"": 12953.590476190477, ""7"": 12953.590476190..."
1155_Tomppa,Blank,No data,D:\Data\Blanks\1155_Tomppa_Blank_S2-D5.d,2024-04-26 19:59:20,2024-04-26,Tomppa,No data,No data,DIAPASEF,60 samples per day.m,...,"{""name"":""1155_Tomppa"",""x"":[0,1,2,3,5,6,7,8,9,1...","{""523"": 514387.96202531643, ""525"": 514387.9620...",2.511913e+10,59,4.980237,22276.418254,1110811,"{""0"": 11508.0, ""1"": 12768.0, ""2"": 9935.0, ""3"":...","{""name"":""1155_Tomppa"",""x"":[0,1,2,3,4,5,6,7,8,9...","{""521"": 22276.418253968255, ""523"": 22276.41825..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900,Zlatka_SUCDIA_B2,Zlatka_SUCDIA_B2,D:\Data\Evosep runs\Liu\Zlatka_SUCDIA_B2_S1-D3...,2021-09-07 00:59:34,2021-09-07,Timppa,Liu,total proteome,DDAPASEF,Evosep 60 samples/day,...,"{""name"":""900"",""x"":[0,1,2,3,4,5,6,7,8,9,10,11,1...","{""355"": 11043891.372424724, ""356"": 11043891.37...",1.126971e+11,222,4.984190,102935.556701,2146166,"{""1"": 1769.0, ""2"": 1783.0, ""3"": 2096.0, ""4"": 1...","{""name"":""900"",""x"":[1,2,3,4,5,6,7,8,9,10,11,12,...","{""118"": 102935.55670103093, ""119"": 102935.5567..."
901,Zlatka_SUCDIA_B3,Zlatka_SUCDIA_B3,D:\Data\Evosep runs\Liu\Zlatka_SUCDIA_B3_S1-E3...,2021-09-07 01:23:17,2021-09-07,Timppa,Liu,total proteome,DDAPASEF,Evosep 60 samples/day,...,"{""name"":""901"",""x"":[0,1,2,3,4,5,6,7,8,9,10,11,1...","{""358"": 9998105.057097541, ""359"": 9998105.0570...",7.114009e+10,264,4.980237,70695.276190,2084337,"{""0"": 1009.0, ""1"": 6359.0, ""2"": 8074.0, ""3"": 1...","{""name"":""901"",""x"":[0,1,2,3,4,5,6,7,8,9,10,11,1...","{""56"": 70695.27619047619, ""57"": 70695.27619047..."
902,Zlatka_SUCPET_B1,Zlatka_SUCPET_B1,D:\Data\Evosep runs\Liu\Zlatka_SUCPET_B1_S1-F3...,2021-09-07 01:46:59,2021-09-07,Timppa,Liu,total proteome,DDAPASEF,Evosep 60 samples/day,...,"{""name"":""902"",""x"":[0,1,2,3,4,5,6,7,8,9,10,11,1...","{""236"": 10253074.071315372, ""238"": 10253074.07...",6.478522e+10,249,4.988142,64615.961965,2085059,"{""0"": 1239.0, ""1"": 10442.0, ""2"": 8730.0, ""3"": ...","{""name"":""902"",""x"":[0,1,2,3,4,5,6,7,8,9,10,11,1...","{""117"": 64615.96196513471, ""118"": 64615.961965..."
903,Zlatka_SUCPET_B2,Zlatka_SUCPET_B2,D:\Data\Evosep runs\Liu\Zlatka_SUCPET_B2_S1-G3...,2021-09-07 02:10:42,2021-09-07,Timppa,Liu,total proteome,DDAPASEF,Evosep 60 samples/day,...,"{""name"":""903"",""x"":[0,1,2,3,4,5,6,7,8,9,10,11,1...","{""234"": 12175248.716098335, ""235"": 12175248.71...",1.398630e+11,309,4.948617,134855.362620,2687208,"{""0"": 1520.0, ""1"": 1901.0, ""2"": 2746.0, ""3"": 1...","{""name"":""903"",""x"":[0,1,2,3,4,5,6,7,9,10,11,12,...","{""69"": 134855.3626198083, ""70"": 134855.3626198..."


In [270]:
ndiffs

9150

In [None]:
modifications = []
additions = []
for common_protein, data in common_proteins.items():
    if common_protein in have:
        cur_ver = current[current['uniprot_id']==common_protein]
        mods = cur_ver['protein_type'].split(', ')
        mods.extend(data[5])
        modifications.append([common_protein, 'protein_type', ', '.join(sorted(list(set(mods))))])
    else:
        data[5] = ', '.join(data[5])
        additions.append([com_cols, data])
db_conn = db_functions.create_connection(db_file)
for addcols, add_data in additions:
    db_functions.add_record(db_conn, 'common_proteins', addcols, add_data)