In [1]:
import sqlite3
import os
import pandas as pd
import json
import numpy as np
from datetime import datetime
from components.parsing import parse_parameters
from components.text_handling import replace_accent_and_special_characters
from dateutil.parser import parse
from plotly import graph_objects as go
from plotly import io as pio
from components import text_handling
import xmltodict
from components.api_tools.annotation import intact
from components.api_tools.annotation import biogrid
parameters = parse_parameters('parameters.json')
from components import db_functions
rlist = pd.read_excel('combined runlist.xlsx')
db_file = os.path.join('data','db','proteogyver.db')

# Snapshot what the database currently holds: the two registry tables
# (control_sets / crapome_sets) plus every table they point to, keyed by
# table name.
db_conn = db_functions.create_connection(db_file)
try:
    current = db_functions.get_full_table_as_pd(db_conn, 'control_sets')
    current_crap = db_functions.get_full_table_as_pd(db_conn, 'crapome_sets')
    current_control_sets = {
        cname: db_functions.get_full_table_as_pd(db_conn, cname)
        for cname in current['control_table_name']
    }
    current_crapomes = {
        cname: db_functions.get_full_table_as_pd(db_conn, cname)
        for cname in current_crap['crapome_table_name']
    }
finally:
    # Release the connection even when one of the reads above fails, so a
    # broken run does not leave the database file held open by the kernel.
    db_conn.close()

In [2]:
# Locations of the build inputs: JSON files sit directly in `datadir`, the
# additional GFP control runs sit in one sub-directory per sample set.
dbdir = os.path.join('data','db')
datadir = os.path.join(dbdir,'db build files')
crapome = None
controls = None
jsons = {}
sets = {}
# Load every JSON file found in datadir, keyed as 'data_<file name>'.
for f in os.listdir(datadir):
    if f.endswith('.json'):
        with open(os.path.join(datadir,f)) as fil:
            jsons[f'data_{f}'] = json.load(fil)
additional_controls_dir = os.path.join(datadir,'gfp control')
new_control_sets = {}
# Keys are '<set directory> <experiment value>' (experiment values come from
# the last column of Sample_Information.xlsx); values are the names under
# which the resulting control sets are grouped in `sets`.
overall_setnames = {
    '202411 GD BioID': 'VL GFP MAC3 6h BioID',
    '202411 GD_loc BioID': 'VL GFP MAC3 6h BioID loc',
}
for setdir in os.listdir(additional_controls_dir):
    set_path = os.path.join(additional_controls_dir, setdir)
    if not os.path.isdir(set_path):
        # Stray files in the control folder are not sample sets; trying to
        # read Sample_Information.xlsx beneath them would raise.
        continue
    sampleinfo = pd.read_excel(os.path.join(set_path, 'Sample_Information.xlsx'))
    data = pd.read_csv(os.path.join(set_path, 'reprint.spc.tsv'),sep='\t', index_col='PROTID').drop(columns=['GENEID','PROTLEN'])
    data = data.rename(columns={c: replace_accent_and_special_characters(c,'_') for c in data.columns})
    # Drop rows that have no usable protein identifier.
    data = data[(data.index.notna()) & (~data.index.isin({'na','NA'}))]
    # 'na' placeholders have to become NaN *before* the numeric cast; casting
    # first would raise on them and make the replacement unreachable. Float is
    # used because NaN cannot live in an integer column. Zero counts are
    # treated as missing.
    data = data.replace('na',np.nan).astype(float).replace(0, np.nan).reset_index()

    # Make run column names unique against everything merged so far. This is
    # done once per set directory and before the merge below, regardless of
    # whether any experiment of this directory is registered; otherwise a
    # clashing name would be silently suffixed with _x/_y by pandas and the
    # column names already recorded in `jsons` would stop matching.
    if controls is not None:
        taken = set(controls.columns) | set(data.columns)
        replace_names = {}
        for c in data.columns:
            if c == 'PROTID' or c not in controls.columns:
                continue
            i = 1
            while f'{c}_{i}' in taken:
                i += 1
            replace_names[c] = f'{c}_{i}'
            taken.add(f'{c}_{i}')
        data = data.rename(columns=replace_names)

    expcol = sampleinfo.columns[-1]
    for exp in sampleinfo[expcol].unique():
        setname = f'{setdir} {exp}'
        if setname not in overall_setnames:
            continue
        big_setname = overall_setnames[setname]
        setname = replace_accent_and_special_characters(setname,'_')
        sets.setdefault(big_setname, []).append(setname)
        # Every run column of this directory is assigned to the set
        # (column 0 is PROTID).
        jsons['data_control sets.json'][setname] = list(data.columns[1:])
        jsons['data_crapome sets.json'][setname] = list(data.columns[1:])
        print(f'added {setname}')
    if crapome is None:
        crapome = data
        controls = data
    else:
        crapome = crapome.merge(data,left_on='PROTID',right_on='PROTID',how='outer')
        controls = controls.merge(data,left_on='PROTID',right_on='PROTID',how='outer')


added 202411_gd_loc_bioid
added 202411_gd_bioid


In [3]:
def get_overall_table(dodf, columns):
    """Summarise per-run values into one statistics row per protein.

    Args:
        dodf: DataFrame with a 'PROTID' column and one numeric column per run.
            Zeros are treated as missing values.
        columns: Names for the eight output columns, in the order
            id, number of runs with a value, fraction of runs with a value,
            sum, mean, min, max, standard deviation.

    Returns:
        DataFrame with one row per protein that has at least one value.
        Runs without any value are ignored when computing the fraction.
        The standard deviation is -1 where pandas cannot compute it
        (e.g. a single observation).

    Raises:
        KeyError: if `dodf` has no 'PROTID' column.
    """
    # Index by PROTID instead of dropping it: iterrows() yields the index
    # label, so dropping the column would put the positional row number
    # into the id column rather than the protein identifier.
    set_df = (
        dodf.set_index('PROTID')
        .replace(0, np.nan)
        .dropna(how='all', axis=0)
        .dropna(how='all', axis=1)
    )
    nruns = set_df.shape[1]
    set_data = []
    for protid, row in set_df.iterrows():
        identified_in = row.notna().sum()
        stdval = row.std()
        if pd.isna(stdval):
            stdval = -1
        set_data.append([
            protid,
            identified_in,
            identified_in / nruns,
            row.sum(),
            row.mean(),
            row.min(),
            row.max(),
            stdval,
        ])
    return pd.DataFrame(columns=columns, data=set_data)
# Layout of the per-protein summary tables: each entry pairs an output column
# name with the SQL type used for it when the table is created.
overall_table_schema = (
    ('protein_id', 'TEXT PRIMARY KEY'),
    ('identified_in', 'INTEGER NOT NULL'),
    ('frequency', 'REAL NOT NULL'),
    ('spc_sum', 'INTEGER NOT NULL'),
    ('spc_avg', 'REAL NOT NULL'),
    ('spc_min', 'INTEGER NOT NULL'),
    ('spc_max', 'INTEGER NOT NULL'),
    ('spc_stdev', 'REAL NOT NULL'),
)
columns = [column_name for column_name, _ in overall_table_schema]
types = [sql_type for _, sql_type in overall_table_schema]
# Accumulators filled below:
#   new_overall_rows / crapome_new_overall_rows - registry rows for sets that
#       are not in the database yet
#   new_tables  - DataFrames to write, keyed by target table name
#   drop_tables - existing tables that will be replaced
new_overall_rows = []
crapome_new_overall_rows = []
new_tables = {}
drop_tables = []
for big_setname, small_setnames in sets.items():
    # NOTE(review): table names derive from big_setname only, so a second
    # entry in small_setnames would overwrite the first one's tables and
    # queue duplicate registry rows - confirm one entry per set is intended.
    for setname in small_setnames:
        col_names = jsons['data_control sets.json'][setname]
        # Restrict to this set's runs and discard proteins / runs that carry
        # no value at all.
        data = (
            controls[['PROTID'] + col_names]
            .set_index('PROTID')
            .dropna(how='all', axis=0)
            .dropna(how='all', axis=1)
            .reset_index()
        )
        registry_label = f'{big_setname} ({len(col_names)} runs)'

        # Control set: a per-protein summary table plus the per-run table
        # whose run columns get a 'CS_' prefix.
        control_table = f'control_{big_setname.lower()}'.replace(' ','_')
        if control_table in current_control_sets:
            # NOTE(review): the existing registry row is left untouched here,
            # so its run count is not refreshed - verify this is acceptable.
            drop_tables.extend([control_table, control_table+'_overall'])
        else:
            new_overall_rows.append(
                [control_table, registry_label, len(col_names), 0, 0, control_table]
            )
        new_tables[control_table+'_overall'] = get_overall_table(data, columns)
        prefixed_runs = {
            c: 'CS_'+text_handling.replace_accent_and_special_characters(c,'_')
            for c in data.columns[1:]
        }
        new_tables[control_table] = data.rename(columns=prefixed_runs)

        # Crapome set: only the per-protein summary table is stored.
        crapome_table = control_table.replace('control','crapome')
        if crapome_table in current_crapomes:
            drop_tables.append(crapome_table)
        else:
            crapome_new_overall_rows.append(
                [crapome_table, registry_label, len(col_names), 0, 0, crapome_table]
            )
        new_tables[crapome_table] = get_overall_table(data, columns)
        
# Column names of the control_sets and crapome_sets registry tables; both
# share one layout and differ only in the 'control' / 'crapome' prefix.
registry_field_templates = ['{}_set', '{}_set_name', 'runs', 'is_disabled', 'is_default', '{}_table_name']
control_cols = [template.format('control') for template in registry_field_templates]
crapome_cols = [template.format('crapome') for template in registry_field_templates]
# NOTE(review): `exts` is not referenced in the visible cells - presumably the
# SQL types matching the registry columns above; confirm before relying on it.
exts = [
    'TEXT PRIMARY KEY',
    'TEXT NOT NULL',
    'INTEGER NOT NULL',
    'INTEGER NOT NULL',
    'INTEGER NOT NULL',
    'TEXT NOT NULL',
]

# Queue the registry insertions as [table name, column names, values]:
# all control_sets rows first, then all crapome_sets rows.
additions = [['control_sets', control_cols, record] for record in new_overall_rows]
additions += [['crapome_sets', crapome_cols, record] for record in crapome_new_overall_rows]
# Turn every prepared DataFrame into one CREATE TABLE statement and one
# parameterised INSERT per row.
table_create_sql = []
data_insert_sql = []
for tablename, data_table in new_tables.items():
    table_columns = list(data_table.columns)

    # Summary tables (any name containing 'overall' or 'crapome') use the
    # fixed `types` list; per-run tables get a text key followed by REAL
    # columns.
    if ('overall' in tablename) or ('crapome' in tablename):
        column_types = types
    else:
        detailed_control_types = ['TEXT PRIMARY KEY'] + ['REAL'] * (len(table_columns) - 1)
        column_types = detailed_control_types

    definition_lines = [f'CREATE TABLE IF NOT EXISTS  {tablename} (']
    for position, column_name in enumerate(table_columns):
        definition_lines.append(f'    {column_name} {column_types[position]},')

    insert_statement = (
        f'INSERT INTO {tablename} ({", ".join(table_columns)}) '
        f'VALUES ({", ".join(["?" for _ in table_columns])})'
    )
    for _, record in data_table.iterrows():
        data_insert_sql.append([insert_statement, tuple(record.values)])

    # Every column line ends with a comma; stripping removes the one after
    # the last column before the statement is closed.
    create_str = '\n'.join(definition_lines).strip(',')
    create_str += '\n);'

    table_create_sql.append(create_str)

In [4]:
# Apply the prepared changes: drop tables that are being replaced, register
# the new sets, then create and fill the new tables.
db_conn = db_functions.create_connection(db_file)
try:
    if drop_tables:
        for tablename in drop_tables:
            db_functions.drop_table(db_conn, tablename)
        db_conn.commit()

    for tablename, column_names, values in additions:
        db_functions.add_record(db_conn, tablename, column_names, values)

    cursor = db_conn.cursor()
    for create_table_str in table_create_sql:
        cursor.execute(create_table_str)
    for insert_str, insert_data in data_insert_sql:
        cursor.execute(insert_str, insert_data)
    db_conn.commit()
finally:
    # Close the connection on failure as well, so an aborted run does not
    # keep the database file held open by the kernel.
    # NOTE(review): the drops are committed before the inserts, so a failure
    # after that point leaves the dropped tables missing - confirm whether
    # this step should be made atomic.
    db_conn.close()
    

In [5]:


# Re-read the database to inspect its state: the list of tables, both
# registry tables and every table they reference, keyed by table name.
tables = db_functions.list_tables(db_file)
db_conn = db_functions.create_connection(db_file)
try:
    current = db_functions.get_full_table_as_pd(db_conn, 'control_sets')
    current_crap = db_functions.get_full_table_as_pd(db_conn, 'crapome_sets')
    current_control_sets = {
        cname: db_functions.get_full_table_as_pd(db_conn, cname)
        for cname in current['control_table_name']
    }
    current_crapomes = {
        cname: db_functions.get_full_table_as_pd(db_conn, cname)
        for cname in current_crap['crapome_table_name']
    }
finally:
    # Always release the connection, including when a registered table is
    # missing and one of the reads above raises.
    db_conn.close()

In [8]:
# Display the control_sets registry table as last read from the database.
current

Unnamed: 0,control_set,control_set_name,runs,is_disabled,is_default,control_table_name
0,control_vl_gfp_mac3_10min_ap,VL GFP MAC3 10min AP (63 runs),62,0,1,control_vl_gfp_mac3_10min_ap
1,control_vl_gfp_mac3_10min_bioid,VL GFP MAC3 10min BioID (65 runs),64,0,1,control_vl_gfp_mac3_10min_bioid
2,control_vl_gfp_mac2_18h_ap,VL GFP MAC2 18h AP (17 runs),16,0,0,control_vl_gfp_mac2_18h_ap
3,control_vl_gfp_mac2_18h_bioid,VL GFP MAC2 18h BioID (21 runs),20,0,0,control_vl_gfp_mac2_18h_bioid
4,control_vl_gfp_mac_24h_ap,VL GFP MAC 24h AP (37 runs),36,0,1,control_vl_gfp_mac_24h_ap
5,control_vl_gfp_mac_24h_ap_nls,VL GFP MAC 24h AP NLS (18 runs),17,0,1,control_vl_gfp_mac_24h_ap_nls
6,control_vl_gfp_mac_24h_bioid,VL GFP MAC 24h BioID (39 runs),38,0,1,control_vl_gfp_mac_24h_bioid
7,control_vl_gfp_mac_24h_bioid_nls,VL GFP MAC 24h BioID NLS (22 runs),21,0,1,control_vl_gfp_mac_24h_bioid_nls
8,control_vl_gfp_mac3_6h_bioid_loc,VL GFP MAC3 6h BioID loc (27 runs),27,0,0,control_vl_gfp_mac3_6h_bioid_loc
9,control_vl_gfp_mac3_6h_bioid,VL GFP MAC3 6h BioID (33 runs),33,0,0,control_vl_gfp_mac3_6h_bioid


In [9]:
# Display the stored table of one existing control set, looked up by its
# table name.
current_control_sets['control_vl_gfp_mac3_10min_bioid']

Unnamed: 0,PROTID,CS_18381_liu_mac3_c_gfp1_bio1_s1_c12_1_spc,CS_18382_liu_mac3_c_gfp2_bio1_s1_d1_1_spc,CS_18383_liu_mac3_c_gfp3_bio1_s1_d2_1_spc,CS_18385_liu_mac3_c_gfp4_bio1_s1_d4_1_spc,CS_18386_liu_mac3_c_gfp5_bio1_s1_d5_1_spc,CS_18387_liu_mac3_c_gfp6_bio1_s1_d6_1_spc,CS_18389_liu_mac3_c_gfp7_bio1_s1_d8_1_spc,CS_18390_liu_mac3_c_gfp8_bio1_s1_d9_1_spc,CS_18391_liu_mac3_c_gfp9_bio1_s1_d10_1_spc,...,CS_suc_n_gfp_bio10_s1_c2_1_4443,CS_suc_n_gfp_bio1_s3_h1_1_3637,CS_suc_n_gfp_bio2_s3_h2_1_3638,CS_suc_n_gfp_bio3_s3_h3_1_3639,CS_suc_n_gfp_bio4_s3_h4_1_3640,CS_suc_n_gfp_bio5_s3_h5_1_3641,CS_suc_n_gfp_bio6_s3_h6_1_3642,CS_suc_n_gfp_bio7_s3_h7_1_3643,CS_suc_n_gfp_bio8_s3_h8_1_3644,CS_suc_n_gfp_bio9_s1_c1_1_4442
0,A0A024RBG1,,,2.0,,,2.0,,,,...,,,,,,,,,,
1,A0A0B4J2D5,5.0,1.0,3.0,,4.0,3.0,3.0,2.0,,...,,,,,,,,,,
2,A0A3B3IU46,,,,,1.0,,,,,...,,,,,,,,,,
3,A0A804HLA8,,,,,,,,,,...,,,,,,,,,,
4,A0AV96,,,,,,,,,,...,,,,,2.0,,,1.0,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6626,Q9Y6X9,1.0,1.0,3.0,1.0,1.0,2.0,2.0,1.0,,...,,,,,1.0,,,,,
6627,Q9Y6Y0,1.0,3.0,,,1.0,,1.0,2.0,2.0,...,,,,,,1.0,,,,
6628,Q9Y6Y8,8.0,9.0,11.0,5.0,8.0,12.0,10.0,9.0,8.0,...,,4.0,2.0,1.0,2.0,1.0,1.0,2.0,3.0,
6629,SH-TAG,47.0,54.0,54.0,44.0,52.0,50.0,47.0,39.0,49.0,...,34.0,30.0,33.0,32.0,31.0,37.0,33.0,37.0,32.0,55.0


In [10]:
# Show only the value rows of the queued registry insertions; each entry of
# `additions` is [table name, column names, values].
[record for _table, _column_names, record in additions]

[['control_vl_gfp_mac3_6h_bioid_loc',
  'VL GFP MAC3 6h BioID loc (27 runs)',
  27,
  0,
  0,
  'control_vl_gfp_mac3_6h_bioid_loc'],
 ['control_vl_gfp_mac3_6h_bioid',
  'VL GFP MAC3 6h BioID (33 runs)',
  33,
  0,
  0,
  'control_vl_gfp_mac3_6h_bioid'],
 ['crapome_vl_gfp_mac3_6h_bioid_loc',
  'VL GFP MAC3 6h BioID loc (27 runs)',
  27,
  0,
  0,
  'crapome_vl_gfp_mac3_6h_bioid_loc'],
 ['crapome_vl_gfp_mac3_6h_bioid',
  'VL GFP MAC3 6h BioID (33 runs)',
  33,
  0,
  0,
  'crapome_vl_gfp_mac3_6h_bioid']]

In [11]:
# Display the table names returned by db_functions.list_tables(db_file).
tables

['control_sets',
 'crapome_sets',
 'proteins',
 'control_vl_gfp_mac3_10min_ap_overall',
 'control_vl_gfp_mac3_10min_ap',
 'control_vl_gfp_mac3_10min_bioid_overall',
 'control_vl_gfp_mac3_10min_bioid',
 'control_vl_gfp_mac2_18h_ap_overall',
 'control_vl_gfp_mac2_18h_ap',
 'control_vl_gfp_mac2_18h_bioid_overall',
 'control_vl_gfp_mac2_18h_bioid',
 'control_vl_gfp_mac_24h_ap_overall',
 'control_vl_gfp_mac_24h_ap',
 'control_vl_gfp_mac_24h_ap_nls_overall',
 'control_vl_gfp_mac_24h_ap_nls',
 'control_vl_gfp_mac_24h_bioid_overall',
 'control_vl_gfp_mac_24h_bioid',
 'control_vl_gfp_mac_24h_bioid_nls_overall',
 'control_vl_gfp_mac_24h_bioid_nls',
 'crapome_vl_gfp_mac3_10min_ap',
 'crapome_vl_gfp_mac3_10min_bioid',
 'crapome_vl_gfp_mac2_18h_ap',
 'crapome_vl_gfp_mac2_18h_bioid',
 'crapome_vl_gfp_mac_24h_ap',
 'crapome_vl_gfp_mac_24h_ap_nls',
 'crapome_vl_gfp_mac_24h_bioid',
 'crapome_vl_gfp_mac_24h_bioid_nls',
 'crapome_nesvilab',
 'contaminants',
 'ms_runs',
 'known_interactions',
 'msmicrosco

: 