In [1]:
import os
import json
import pandas as pd
import json
from components import db_functions
from components.parsing import parse_parameters
# Path of the SQLite database file used by the cells below.
db_file = os.path.join('data', 'db', 'proteogyver.db')
# Parsed contents of parameters.json (the time format is read from it later).
parameters = parse_parameters('parameters.json')
# Run list spreadsheet; matched against each run through its 'Raw file' column.
rlist = pd.read_excel('combined runlist.xlsx')

In [6]:
from datetime import datetime
def get_mstable_insert(ms_cols, ms_run_datadir, runlist, time_format,runs_done=None,banned_run_dirs = None):
    """Turn per-run JSON files into rows and INSERT statements for ``ms_runs``.

    Every entry of ``ms_run_datadir`` is read as one JSON document describing
    a single MS run. Runs that pass all checks become one row each; every
    other entry is recorded in the failure list together with a reason.

    Args:
        ms_cols: Column definitions of the ``ms_runs`` table. Only the first
            whitespace-separated token of each entry (the column name) is
            used, and the number of entries sets the number of ``?``
            placeholders in the INSERT statement.
        ms_run_datadir: Directory whose entries are the per-run JSON files.
        runlist: DataFrame with a 'Raw file' column and the annotation
            columns read below ('Sample name', 'Who', 'Sample type', ...).
        time_format: ``strftime`` format for the stored run time. The run
            date is the text before the first whitespace of that string.
        runs_done: Optional set of SampleIDs to skip. It is updated in place
            with each SampleID that gets past the duplicate, sample-info and
            banned checks.
        banned_run_dirs: Optional substrings; a run whose analysis file name
            contains any of them is rejected. Defaults to two built-in names.

    Returns:
        Tuple ``(ms_runs_insert_sql, failed_json_files, data_to_enter)``:
        a list of ``[sql, row]`` pairs, a list of
        ``[reason, filename, payload]`` entries, and the bare rows.
    """
    data_to_enter = []
    failed_json_files = []
    if banned_run_dirs is None:
        banned_run_dirs = [
            'BRE_20_xxxxx_Helsinki',
            'TrapTrouble_3'
        ]
    if runs_done is None:
        runs_done = set()
    for i, datafilename in enumerate(os.listdir(ms_run_datadir)):
        if i % 100 == 0:
            print(i, f'Success: {len(data_to_enter)}, Failed: {len(failed_json_files)}')
        datafilepath = os.path.join(ms_run_datadir, datafilename)
        # os.listdir also returns sub-directories; opening one would raise.
        if not os.path.isfile(datafilepath):
            failed_json_files.append(['not a file', datafilename, ''])
            continue
        with open(datafilepath) as fil:
            try:
                dat = json.load(fil)
            except json.JSONDecodeError:
                failed_json_files.append(['json decode error', datafilename, ''])
                continue
        # Drop this key first so it does not count in the polarity-key check below.
        if 'polarity_1_sers' in dat:
            del dat['polarity_1_sers']
        if dat['SampleID'] in runs_done:
            failed_json_files.append(['SampleID in done', datafilename, dat])
            continue
        # A list here (e.g. ['']) means there is no sample info. This must be
        # checked before SampleInfo is subscripted like a dict below.
        if isinstance(dat['SampleInfo'], list):
            failed_json_files.append(['no sample info',datafilename, dat])
            continue
        sample_table = dat['SampleInfo']['SampleTable']
        analysis_header = sample_table['AnalysisHeader']
        if any(b in analysis_header['@FileName'] for b in banned_run_dirs):
            failed_json_files.append(['banned',datafilename, dat])
            continue
        # The ID is marked as seen before the remaining checks, so a later
        # file with the same SampleID is reported as a duplicate even if this
        # one is rejected further down.
        runs_done.add(dat['SampleID'])
        if 'polarity_1' not in dat:
            failed_json_files.append(['no polarity',datafilename, dat])
            continue
        lc_method = None
        ms_method = None
        for propdic in sample_table['SampleTableProperties']['Property']:
            if propdic['@Name'] == 'HyStar_LC_Method_Name':
                lc_method = propdic['@Value']
            if propdic['@Name'] == 'HyStar_MS_Method_Name':
                ms_method = propdic['@Value']
        # The run list may refer to a run by sample ID, sample ID + '.d', or data path.
        sample_names = {
            sample_table['Sample']['@SampleID'],
            sample_table['Sample']['@SampleID']+'.d',
            sample_table['Sample']['@DataPath'],
        }
        samplerow = runlist[runlist['Raw file'].isin(sample_names)]
        if (lc_method is None) or (ms_method is None):
            failed_json_files.append(['lc or ms method is none', datafilename, ''])
            continue
        if sum(1 for k in dat if 'polarity' in k) > 1:
            failed_json_files.append(['too much polarity in keys', datafilename, ''])
            continue
        if samplerow.shape[0] == 0:
            # No run list entry: fill every annotation with a placeholder.
            samplerow = pd.Series(index = samplerow.columns, data = ['No data' for c in samplerow.columns])
        else:
            samplerow = samplerow.iloc[0]
        instrument = 'TimsTOF 1'
        # Everything from the first '+' on is cut off before parsing the timestamp.
        runtime = datetime.strptime(
            analysis_header['@CreationDateTime'].split('+')[0],
            '%Y-%m-%dT%H:%M:%S'
        ).strftime(time_format)
        polarity = dat['polarity_1']
        ms_run_row = [
            dat['SampleID'],
            analysis_header['@SampleID'],
            samplerow['Sample name'],
            analysis_header['@FileName'],
            runtime,
            runtime.split()[0],
            instrument,
            samplerow['Who'],
            samplerow['Sample type'],
            dat['DataType'],
            lc_method,
            ms_method,
            dat.get('NumPrecursors', 'No precursor data'),
            samplerow['Bait name'],
            samplerow['Bait / other uniprot or ID'],
            samplerow['Bait mutation'],
            len(pd.Series(polarity['tic df']['Series'])),
            samplerow['Cell line / material'],
            samplerow['Project'],
            samplerow['Notes'],
            samplerow['tag']
        ]
        for dataname in ['bpc filtered df', 'tic df', 'bpc unfiltered df']:
            chrom = polarity[dataname]
            ms_run_row.extend([
                chrom['auc'],
                chrom['intercepts'],
                chrom['peaks_per_timepoint'],
                chrom['mean_intensity'],
                chrom['max_intensity'],
                json.dumps(chrom['Series']),
                chrom['trace'],
                json.dumps(chrom['intercept_dict']),
            ])
            for smooth in ['smooth3','smooth6','smooth12','smooth20','smooth30']:
                ms_run_row.append(chrom[f'trace_{smooth}'])

        data_to_enter.append(ms_run_row)
    # The statement text is identical for every row, so build it once.
    add_str = f'INSERT INTO ms_runs ({", ".join([c.split()[0] for c in ms_cols])}) VALUES ({", ".join(["?" for _ in ms_cols])})'
    ms_runs_insert_sql = [[add_str, data] for data in data_to_enter]
    return (ms_runs_insert_sql, failed_json_files, data_to_enter)

In [7]:
# Column definitions ("name SQL-type") for the run-level fields of ms_runs.
# The order matches the order of values in each row built by get_mstable_insert.
ms_cols = [
    'run_id TEXT PRIMARY KEY',
    'run_name TEXT NOT NULL',
    'sample_name TEXT NOT NULL',
    'file_name TEXT NOT NULL',
    'run_time TEXT NOT NULL',
    'run_date TEXT NOT NULL',
    'instrument TEXT NOT NULL',
    'author TEXT NOT NULL',
    'sample_type TEXT NOT NULL',
    'run_type TEXT NOT NULL',
    'lc_method TEXT NOT NULL',
    'ms_method TEXT NOT NULL',
    'num_precursors INTEGER NOT NULL',
    'bait TEXT',
    'bait_uniprot TEXT',
    'bait_mutation TEXT',
    'chromatogram_max_time INTEGER NOT NULL',
    'cell_line_or_material TEXT',
    'project TEXT',
    'author_notes TEXT',
    'bait_tag TEXT'
]
# SQL type of each per-chromatogram field; insertion order is the column order.
keytypes = {
    'auc': 'REAL NOT NULL',
    'intercepts': 'INTEGER NOT NULL',
    'avg_peaks_per_timepoint': 'REAL NOT NULL',
    'mean_intensity': 'INTEGER NOT NULL',
    'max_intensity': 'INTEGER NOT NULL',
    'json': 'TEXT NOT NULL',
    'trace': 'TEXT NOT NULL', 
    'intercept_json': 'TEXT NOT NULL'
}
# Bare column names: the first whitespace-separated token of each definition.
# (Indexing with n[0] would only give the first character.)
dfcols = [n.split()[0] for n in ms_cols]
for typ in ['MSn_filtered','TIC','MSn_unfiltered']:
    for key in ['auc','intercepts','avg_peaks_per_timepoint','mean_intensity','max_intensity', 'json','trace', 'intercept_json']:
        ms_cols.append(f'{typ.lower()}_{key.lower()} {keytypes[key]}')
        dfcols.append(f'{typ.lower()}_{key.lower()}')
    for smooth in ['smooth3','smooth6','smooth12','smooth20','smooth30']:
        # Double quotes inside the f-string keep this valid before Python 3.12.
        ms_cols.append(f'{typ.lower()}_trace_{smooth.lower()} {keytypes["trace"]}')
        dfcols.append(f'{typ.lower()}_trace_{smooth.lower()}')
# NOTE(review): absolute, machine-specific path; the cell only runs where this mount exists.
rundata_dir = '/media/kmsaloka/Expansion/20241118_parse/ms_runs'
rlist = pd.read_excel('combined runlist.xlsx')
new_dat, new_failed_fils, plain_data = get_mstable_insert(
    ms_cols, 
    rundata_dir,
    rlist,
    parameters['Config']['Time format']
)

0 Success: 0, Failed: 0
100 Success: 100, Failed: 0
200 Success: 200, Failed: 0
300 Success: 300, Failed: 0
400 Success: 400, Failed: 0
500 Success: 499, Failed: 1
600 Success: 599, Failed: 1
700 Success: 699, Failed: 1
800 Success: 799, Failed: 1
900 Success: 899, Failed: 1
1000 Success: 999, Failed: 1
1100 Success: 1099, Failed: 1
1200 Success: 1199, Failed: 1
1300 Success: 1299, Failed: 1
1400 Success: 1399, Failed: 1
1500 Success: 1499, Failed: 1
1600 Success: 1599, Failed: 1
1700 Success: 1699, Failed: 1
1800 Success: 1799, Failed: 1
1900 Success: 1899, Failed: 1
2000 Success: 1999, Failed: 1
2100 Success: 2099, Failed: 1
2200 Success: 2199, Failed: 1
2300 Success: 2299, Failed: 1
2400 Success: 2399, Failed: 1
2500 Success: 2499, Failed: 1
2600 Success: 2599, Failed: 1
2700 Success: 2699, Failed: 1
2800 Success: 2796, Failed: 4
2900 Success: 2893, Failed: 7
3000 Success: 2993, Failed: 7
3100 Success: 3093, Failed: 7
3200 Success: 3193, Failed: 7
3300 Success: 3293, Failed: 7
3400 

In [18]:

# Bare column names from the ms_cols definitions, followed by any
# per-chromatogram column that is not in that list yet.
dfcols = [n.split()[0] for n in ms_cols]
for typ in ['MSn_filtered','TIC','MSn_unfiltered']:
    prefix = typ.lower()
    expected = [
        f'{prefix}_{key.lower()}'
        for key in ['auc','intercepts','avg_peaks_per_timepoint','mean_intensity','max_intensity', 'json','trace', 'intercept_json']
    ]
    expected += [
        f'{prefix}_trace_{smooth.lower()}'
        for smooth in ['smooth3','smooth6','smooth12','smooth20','smooth30']
    ]
    for colname in expected:
        if colname not in dfcols:
            dfcols.append(colname)
# One DataFrame row per parsed run, in the column order of dfcols.
ndf = pd.DataFrame(plain_data, columns=dfcols)

In [None]:

# Read the whole ms_runs table; the connection is closed even if the read fails.
db_conn = db_functions.create_connection(db_file)
try:
    current = db_functions.get_full_table_as_pd(db_conn, 'ms_runs')
finally:
    db_conn.close()

In [None]:

# Sync the freshly parsed runs (ndf) into the ms_runs table:
# runs missing from the table are added, changed values are updated.

# Both frames are keyed by run_id for the comparison. ndf is built with a
# default integer index, so run_id has to be moved from a column to the index.
new_runs = ndf.set_index('run_id') if 'run_id' in ndf.columns else ndf
# NOTE(review): assumes `current` exposes run_id either as a column or as its index - confirm.
existing_runs = current.set_index('run_id') if 'run_id' in current.columns else current

# Bare column names (ms_cols entries also carry the SQL type).
# NOTE(review): presumably add_record expects plain column names - verify against db_functions.
insert_cols = [c.split()[0] for c in ms_cols]
value_cols = insert_cols[1:]  # everything except run_id

additions = []
for index, row in new_runs[~new_runs.index.isin(existing_runs.index)].iterrows():
    additions.append([insert_cols, [index] + [row[c] for c in value_cols]])

# NOTE(review): `differences` was not defined in any visible cell. It is
# derived here as {column: [run_ids whose value differs from the table]},
# treating "missing in both" as equal - confirm this is the intended rule.
shared_ids = new_runs.index.intersection(existing_runs.index)
differences = {}
for colname in value_cols:
    if colname not in existing_runs.columns:
        continue
    new_vals = new_runs.loc[shared_ids, colname]
    old_vals = existing_runs.loc[shared_ids, colname]
    changed = (new_vals != old_vals) & ~(new_vals.isna() & old_vals.isna())
    if changed.any():
        differences[colname] = list(changed[changed].index)

modifications = []
for colname, indexes in differences.items():
    for index in indexes:
        modifications.append([index, colname, new_runs.loc[index, colname]])

db_conn = db_functions.create_connection(db_file)
try:
    for index, colname, newval in modifications:
        db_functions.modify_record(db_conn, 'ms_runs','run_id',index,[colname],[newval])
    for addcols, add_data in additions:
        db_functions.add_record(db_conn, 'ms_runs', addcols, add_data)
    db_conn.commit()
finally:
    # Close the connection even when a write fails; nothing is committed in that case.
    db_conn.close()

In [6]:

# Read the whole known_interactions table; the connection is closed even if the read fails.
db_conn = db_functions.create_connection(db_file)
try:
    df = db_functions.get_full_table_as_pd(db_conn, 'known_interactions')
finally:
    db_conn.close()

In [28]:
# Keep only interactions where both partners appear in the SAINTexpress prey
# list, add gene names for both partners, and export the pairs twice
# (tab-separated and comma-separated).
tomap = pd.read_csv(os.path.join('external','SAINTexpress','list.txt'),sep='\t')
# UniProt ID -> gene name; for a repeated ID the last entry wins.
tomap2 = dict(zip(tomap['Prey'], tomap['PreyGene']))
both_in_prey_list = df['uniprot_id_a'].isin(tomap['Prey']) & df['uniprot_id_b'].isin(tomap['Prey'])
# .copy() so the new columns are written to an independent frame, not to a slice of the original.
df = df[both_in_prey_list].copy()
df['gene1'] = df['uniprot_id_a'].map(tomap2)
df['gene2'] = df['uniprot_id_b'].map(tomap2)
df['interaction2'] = df['uniprot_id_a']+' '+df['uniprot_id_b']
saint_pairs = df[['interaction','interaction2']].rename(columns={'interaction': 'GOID','interaction2':'EntrezGeneID'})
saint_pairs.to_csv('SAINT_test.tsv',sep='\t',index=False)
# Same content with a comma separator (the file name still ends in .tsv).
saint_pairs.to_csv('SAINT_test2.tsv',sep=',',index=False)

In [33]:
# Look up the row(s) for one specific pair: uniprot_id_a P62736 with uniprot_id_b A5A3E0.
df[(df['uniprot_id_a']=='P62736') & (df['uniprot_id_b']=='A5A3E0')]

Unnamed: 0,interaction,uniprot_id_a,uniprot_id_b,uniprot_id_a_noiso,uniprot_id_b_noiso,source_database,isoform_a,isoform_b,experimental_role_interactor_a,interaction_detection_method,...,interaction_type,experimental_role_interactor_b,annotation_interactor_b,biological_role_interactor_a,publication_count,notes,update_time,interaction2,gene1,gene2
1263634,P62736_-_A5A3E0,P62736,A5A3E0,P62736,A5A3E0,BIOGRID,,,,Cross-Linking-MS (XL-MS),...,physical,,,,1,Experiment throughput from BioGRID: High Throu...,BioGRID:2024-11-21,P62736 A5A3E0,ACTA2,POTEF


In [31]:
# Show the interaction key next to the space-separated 'interaction2' column.
df[['interaction','interaction2']]

Unnamed: 0,interaction,interaction2
23,Q99961_-_P50570,Q99961 P50570
25,Q99961_-_Q96B97,Q99961 Q96B97
27,Q99961_-_Q99961,Q99961 Q99961
33,Q99961_-_Q96FW1,Q99961 Q96FW1
37,Q99961_-_O14672,Q99961 O14672
...,...,...
2671770,Q9Y6Y8_-_Q15436,Q9Y6Y8 Q15436
2671771,Q9Y6Y8_-_Q15437,Q9Y6Y8 Q15437
2671786,Q9Y6Y8_-_Q9Y2G8,Q9Y6Y8 Q9Y2G8
2671787,Q9Y6Y8_-_Q9Y547,Q9Y6Y8 Q9Y547


In [8]:
# Show the interaction key together with the two UniProt ID columns.
# NOTE(review): the execution count (In [8]) suggests this was run before the
# filtering cell above it in the file - confirm the intended cell order.
df[['interaction','uniprot_id_a','uniprot_id_b']]

Unnamed: 0,interaction,uniprot_id_a,uniprot_id_b
0,O43426_-_Q6ZQ03,O43426,Q6ZQ03
1,O43426_-_Q9Z0W5,O43426,Q9Z0W5
2,O43426_-_P62994,O43426,P62994
3,O43426_-_P17427,O43426,P17427
4,O43426_-_P18484,O43426,P18484
...,...,...,...
2673547,Q9ZWS9_-_Q9SAZ5,Q9ZWS9,Q9SAZ5
2673548,Q9ZWS9_-_Q9ZNV8,Q9ZWS9,Q9ZNV8
2673549,Q9ZWS9_-_Q9ZNV9,Q9ZWS9,Q9ZNV9
2673550,Q9ZWS9_-_Q9ZVB5,Q9ZWS9,Q9ZVB5


In [7]:
# Display df as it stands when this cell runs.
# NOTE(review): the execution count (In [7]) suggests this was run right after
# loading known_interactions, before the filtering cell - confirm the intended cell order.
df

Unnamed: 0,interaction,uniprot_id_a,uniprot_id_b,uniprot_id_a_noiso,uniprot_id_b_noiso,source_database,isoform_a,isoform_b,experimental_role_interactor_a,interaction_detection_method,...,biological_role_interactor_b,annotation_interactor_a,confidence_value,interaction_type,experimental_role_interactor_b,annotation_interactor_b,biological_role_interactor_a,publication_count,notes,update_time
0,O43426_-_Q6ZQ03,O43426,Q6ZQ03,O43426,Q6ZQ03,IntAct,,,"psi-mi:""MI:0496""(bait);psi-mi:""MI:0496""(bait)","psi-mi:""MI:0096""(pull down);psi-mi:""MI:0084""(p...",...,"psi-mi:""MI:0499""(unspecified role);psi-mi:""MI:...","comment:""Stoichiometry: 1.0"";comment:""Stoichio...",intact-miscore:0.56;intact-miscore:0.56,"psi-mi:""MI:0407""(direct interaction);psi-mi:""M...","psi-mi:""MI:0498""(prey);psi-mi:""MI:0498""(prey)","comment:""Stoichiometry: 1.0"";comment:""Stoichio...","psi-mi:""MI:0499""(unspecified role);psi-mi:""MI:...",1,,IntAct:2024-11-21
1,O43426_-_Q9Z0W5,O43426,Q9Z0W5,O43426,Q9Z0W5,IntAct,,,"psi-mi:""MI:0496""(bait);psi-mi:""MI:0496""(bait)","psi-mi:""MI:0096""(pull down);psi-mi:""MI:0084""(p...",...,"psi-mi:""MI:0499""(unspecified role);psi-mi:""MI:...","comment:""Stoichiometry: 1.0"";comment:""Stoichio...",intact-miscore:0.56;intact-miscore:0.56,"psi-mi:""MI:0407""(direct interaction);psi-mi:""M...","psi-mi:""MI:0498""(prey);psi-mi:""MI:0498""(prey)","comment:""Stoichiometry: 1.0"";comment:""Stoichio...","psi-mi:""MI:0499""(unspecified role);psi-mi:""MI:...",1,,IntAct:2024-11-21
2,O43426_-_P62994,O43426,P62994,O43426,P62994,IntAct,,,"psi-mi:""MI:0496""(bait);psi-mi:""MI:0496""(bait)","psi-mi:""MI:0096""(pull down);psi-mi:""MI:0084""(p...",...,"psi-mi:""MI:0499""(unspecified role);psi-mi:""MI:...","comment:""Stoichiometry: 1.0"";comment:""Stoichio...",intact-miscore:0.56;intact-miscore:0.56,"psi-mi:""MI:0407""(direct interaction);psi-mi:""M...","psi-mi:""MI:0498""(prey);psi-mi:""MI:0498""(prey)","comment:""Stoichiometry: 1.0"";comment:""Stoichio...","psi-mi:""MI:0499""(unspecified role);psi-mi:""MI:...",1,,IntAct:2024-11-21
3,O43426_-_P17427,O43426,P17427,O43426,P17427,IntAct,,,"psi-mi:""MI:0496""(bait)","psi-mi:""MI:0096""(pull down)",...,"psi-mi:""MI:0499""(unspecified role)","comment:""Stoichiometry: 1.0""",intact-miscore:0.44,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0498""(prey)","comment:""Stoichiometry: 1.0""","psi-mi:""MI:0499""(unspecified role)",1,,IntAct:2024-11-21
4,O43426_-_P18484,O43426,P18484,O43426,P18484,IntAct,,,"psi-mi:""MI:0496""(bait)","psi-mi:""MI:0096""(pull down)",...,"psi-mi:""MI:0499""(unspecified role)","comment:""Stoichiometry: 1.0""",intact-miscore:0.40,"psi-mi:""MI:0915""(physical association)","psi-mi:""MI:0498""(prey)","comment:""Stoichiometry: 1.0""","psi-mi:""MI:0499""(unspecified role)",1,,IntAct:2024-11-21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2673547,Q9ZWS9_-_Q9SAZ5,Q9ZWS9,Q9SAZ5,Q9ZWS9,Q9SAZ5,IntAct;BIOGRID,,,"psi-mi:""MI:0498""(prey);psi-mi:""MI:0496""(bait)","psi-mi:""MI:0018""(two hybrid);psi-mi:""MI:0018""(...",...,"psi-mi:""MI:0499""(unspecified role);psi-mi:""MI:...",,intact-miscore:0.55;intact-miscore:0.55,"psi-mi:""MI:0915""(physical association);psi-mi:...","psi-mi:""MI:0496""(bait);psi-mi:""MI:0498""(prey)",,"psi-mi:""MI:0499""(unspecified role);psi-mi:""MI:...",3,Experiment throughput from BioGRID: High Throu...,IntAct:2024-11-21__BioGRID:2024-11-21
2673548,Q9ZWS9_-_Q9ZNV8,Q9ZWS9,Q9ZNV8,Q9ZWS9,Q9ZNV8,IntAct;BIOGRID,,,"psi-mi:""MI:0496""(bait)","psi-mi:""MI:0018""(two hybrid)__Two-hybrid",...,"psi-mi:""MI:0499""(unspecified role)",,intact-miscore:0.37,"psi-mi:""MI:0915""(physical association)__physical","psi-mi:""MI:0498""(prey)",,"psi-mi:""MI:0499""(unspecified role)",2,Experiment throughput from BioGRID: High Throu...,IntAct:2024-11-21__BioGRID:2024-11-21
2673549,Q9ZWS9_-_Q9ZNV9,Q9ZWS9,Q9ZNV9,Q9ZWS9,Q9ZNV9,IntAct;BIOGRID,,,"psi-mi:""MI:0498""(prey);psi-mi:""MI:0496""(bait)","psi-mi:""MI:0018""(two hybrid);psi-mi:""MI:0018""(...",...,"psi-mi:""MI:0499""(unspecified role);psi-mi:""MI:...",,intact-miscore:0.55;intact-miscore:0.55,"psi-mi:""MI:0915""(physical association);psi-mi:...","psi-mi:""MI:0496""(bait);psi-mi:""MI:0498""(prey)",,"psi-mi:""MI:0499""(unspecified role);psi-mi:""MI:...",3,Experiment throughput from BioGRID: High Throu...,IntAct:2024-11-21__BioGRID:2024-11-21
2673550,Q9ZWS9_-_Q9ZVB5,Q9ZWS9,Q9ZVB5,Q9ZWS9,Q9ZVB5,IntAct;BAR,,,"psi-mi:""MI:0499""(unspecified role)","psi-mi:""MI:2277""(Cr-two hybrid)__Two-hybrid",...,"psi-mi:""MI:0499""(unspecified role)",,intact-miscore:0.37,"psi-mi:""MI:0915""(physical association)__physical","psi-mi:""MI:0499""(unspecified role)",,"psi-mi:""MI:0499""(unspecified role)",1,Experiment throughput from BioGRID: High Throu...,IntAct:2024-11-21__BioGRID:2024-11-21
