In [1]:
import sqlite3
import os
import pandas as pd
import json
import numpy as np
from datetime import datetime

In [2]:
jsons = {}
for root, _, files in os.walk('data'):
    if '.ipynb' in root: continue
    for f in files:
        if '.json' in f:
            with open(os.path.join(root,f)) as fil:
                tpkey = '-'.join(root.split(os.sep))
                tpkey += f'_{f}'
                jsons[tpkey] = json.load(fil)

In [3]:
crapome = pd.read_csv(os.path.join('data','crapome table.tsv'),sep='\t')
controls = pd.read_csv(os.path.join('data','control table.tsv'),sep='\t')

In [4]:
sets = {
    'VL GFP MAC3': [
             'VL GFP MAC3-N AP-MS',
             'VL GFP MAC3-N BioID'
    ],
    'VL GFP MAC2': [
             'VL GFP MAC2-C AP-MS',
             'VL GFP MAC2-N AP-MS',
             'VL GFP MAC2-C BioID',
             'VL GFP MAC2-N BioID'
    ],
    'VL GFP MAC': [
            'VL GFP MAC-C AP-MS',
            'VL GFP MAC-MED-NLS AP-MS',
            'VL GFP MAC-MYC-NLS AP-MS',
            'VL GFP MAC-NLS AP-MS',
            'VL GFP MAC-N AP-MS',
            'VL GFP MAC-C BioID',
            'VL GFP MAC-MED-NLS BioID',
            'VL GFP MAC-MYC-NLS BioID',
            'VL GFP MAC-NLS BioID',
            'VL GFP MAC-N BioID'
    ],
    'Nesvilab': ['nesvilab']
}

In [5]:
crapome_tables = {}
columns = [
    'protein_id',
    'identified_in',
    'frequency',
    'spc_sum',
    'spc_avg',
    'spc_min',
    'spc_max',
    'spc_stdev']
types = [
    'TEXT PRIMARY KEY',
    'INTEGER NOT NULL',
    'REAL NOT NULL',
    'INTEGER NOT NULL',
    'REAL NOT NULL',
    'INTEGER NOT NULL',
    'INTEGER NOT NULL',
    'REAL NOT NULL'
]
crapome_entries = []
for setname, setcols in sets.items():
    all_cols = ['PROTID']
    defa = 1
    if 'MAC2' in setname: defa = 0
    tablename = f'crapome_{setname}'.lower().replace(' ','_')
    for sc in setcols:
        all_cols.extend(jsons['data_crapome sets.json'][sc])
    all_cols = sorted(list(set(all_cols)))
    set_df = crapome[all_cols]
    set_df.index = set_df['PROTID']
    set_df = set_df.drop(columns=['PROTID']).replace(0,np.nan).dropna(how='all',axis=0).dropna(how='all',axis=1)
    nruns = set_df.shape[1]
    set_data = []
    for protid, row in set_df.iterrows():
        stdval = row.std()
        if pd.isna(stdval):
            stdval = -1
        set_data.append([protid, row.notna().sum(), row.notna().sum()/nruns,row.sum(), row.mean(), row.min(), row.max(), stdval])
    crapome_tables[tablename] = pd.DataFrame(columns=columns, data=set_data)
    crapome_entries.append([tablename, setname, nruns, 0, defa, tablename])
control_tables = {}
control_entries = []
for setname, setcols in sets.items():
    if setname == 'Nesvilab': continue
    all_cols = ['PROTID']
    defa = 1
    if 'MAC2' in setname: defa = 0
    tablename = f'control_{setname}'.lower().replace(' ','_')
    for sc in setcols:
        all_cols.extend(jsons['data_control sets.json'][sc])
    all_cols = sorted(list(set(all_cols)))
    set_df = controls[all_cols]
    set_df.index = set_df['PROTID']
    set_df = set_df.drop(columns=['PROTID']).replace(0,np.nan).dropna(how='all',axis=0).dropna(how='all',axis=1)
    nruns = set_df.shape[1]
    set_data = []
    for protid, row in set_df.iterrows():
        stdval = row.std()
        if pd.isna(stdval):
            stdval = -1
        set_data.append([protid, row.notna().sum(), row.notna().sum()/nruns,row.sum(), row.mean(), row.min(), row.max(), stdval])
    control_tables[tablename] = pd.DataFrame(columns=columns, data=set_data)
    control_entries.append([tablename, setname, nruns, 0, defa, tablename])


In [26]:
control_cols = ['control_set','control_set_name','runs','is_disabled','is_default','control_table_name']
crapome_cols = ['crapome_set','crapome_set_name','runs','is_disabled','is_default','crapome_table_name']
exts = ['TEXT PRIMARY KEY','TEXT NOT NULL','INTEGER NOT NULL','INTEGER NOT NULL','INTEGER NOT NULL','TEXT NOT NULL']

control_table_str =  [
        f'CREATE TABLE control_sets (',
    ]
for i, c in enumerate(control_cols):
    control_table_str.append(f'    {c} {exts[i]},',)
control_table_str = '\n'.join(control_table_str).strip(',')
control_table_str += '\n);'

crapome_table_str =  [
        f'CREATE TABLE crapome_sets (',
    ]
for i, c in enumerate(crapome_cols):
    crapome_table_str.append(f'    {c} {exts[i]},',)
crapome_table_str = '\n'.join(crapome_table_str).strip(',')
crapome_table_str += '\n);'

prot_cols = [
    'uniprot_id',
    'is_reviewed',
    'gene_name',
    'entry_name',
    'all_gene_names',
    'organism',
    'length',
    'sequence',
    'is_latest',
    'entry_source',
    'update_time'
]
prot_exts = [
    'TEXT PRIMARY KEY',
    'INTEGER NOT NULL',
    'TEXT NOT NULL',
    'TEXT NOT NULL',
    'TEXT NOT NULL',
    'TEXT NOT NULL',
    'INTEGER NOT NULL',
    'TEXT NOT NULL',
    'INTEGER NOT NULL',
    'TEXT NOT NULL',
    'TEXT NOT NULL'
]

prot_table_str =  [
        f'CREATE TABLE proteins (',
    ]
for i, c in enumerate(prot_cols):
    prot_table_str.append(f'    {c} {prot_exts[i]},',)
prot_table_str = '\n'.join(prot_table_str).strip(',')
prot_table_str += '\n);'

table_create_sql = [control_table_str, crapome_table_str, prot_table_str]

insert_sql = []

for vals in control_entries:
    tablename = vals[0]
    create_str = [
        f'CREATE TABLE {tablename} (',
    ]
    for i, c in enumerate(columns):
        create_str.append(f'    {c} {types[i]},',)
    create_str = '\n'.join(create_str).strip(',')
    create_str += '\n);'
    table_create_sql.append(create_str)
    add_str = [f'INSERT INTO control_sets ({", ".join(control_cols)}) VALUES ({", ".join(["?" for _ in control_cols])})', vals]
    insert_sql.append(add_str)
    for _, row in control_tables[tablename].iterrows():
        add_str = [f'INSERT INTO {tablename} ({", ".join(columns)}) VALUES ({", ".join(["?" for _ in columns])})', tuple(row.values)]
        insert_sql.append(add_str)
print(len(insert_sql))

for vals in crapome_entries:
    tablename = vals[0]
    create_str = [
        f'CREATE TABLE {tablename} (',
    ]
    for i, c in enumerate(columns):
        create_str.append(f'    {c} {types[i]},',)
    create_str = '\n'.join(create_str).strip(',')
    create_str += '\n);'
    table_create_sql.append(create_str)
    add_str = [f'INSERT INTO crapome_sets ({", ".join(crapome_cols)}) VALUES({", ".join(["?" for _ in crapome_cols])})', vals]
    insert_sql.append(add_str)
    for _, row in crapome_tables[tablename].iterrows():
        add_str = [f'INSERT INTO {tablename} ({", ".join(columns)}) VALUES ({", ".join(["?" for _ in columns])})', tuple(row.values)]
        insert_sql.append(add_str)
print(len(insert_sql))

prot = pd.read_csv('uniprotkb_taxonomy_id_7711_AND_reviewed_2023_09_04.tsv',sep='\t',index_col = 'Entry')
for protid, row in prot.iterrows():
    gn = row['Gene Names (primary)']
    if pd.isna(gn):
        gn = row['Entry Name']
    gns = row['Gene Names']
    if pd.isna(gns):
        gns = row['Entry Name']
    data = [
        protid,
        int(row['Reviewed']=='reviewed'),
        gn,
        row['Entry Name'],
        gns,
        row['Organism'],
        row['Length'],
        row['Sequence'],
        1,
        'uniprot_initial_download',
        datetime.today().strftime('%Y-%m-%d')
    ]
    add_str = f'INSERT INTO proteins ({", ".join(prot_cols)}) VALUES ({", ".join(["?" for _ in prot_cols])})'
    insert_sql.append([add_str, data])
print(len(insert_sql))

17964
66552
153399


In [12]:
# Connect to the database (create it if it doesn't exist)
conn = sqlite3.connect('proteogyver.db')
# Create a cursor object
cursor = conn.cursor()

In [13]:
start = datetime.now()
for create_table_str in table_create_sql:
    cursor.execute(create_table_str)
for insert_str, insert_data in insert_sql:
    cursor.execute(insert_str, insert_data)
print((datetime.now() - start).seconds)

0


In [22]:
tictable_create = ['CREATE TABLE IF NOT EXISTS ms_runs (']
tic_cols = [
    'run_id TEXT PRIMARY KEY',
    'sample_id TEXT NOT NULL',
    'run_name TEXT NOT NULL',
    'run_time TEXT NOT NULL',
    'instrument TEXT NOT NULL',
    'author TEXT NOT NULL',
    'sample_type TEXT NOT NULL',
    'run_type TEXT NOT NULL',
    'bait TEXT',
    'bait_uniprot TEXT',
    'tic_max_time INTEGER NOT NUL',
]
for col in tic_cols:
    tictable_create.append(f'    {col},')
    
for col in  [
    'tic_auc REAL NOT NULL',
    'tic_intercepts INTEGER NOT NULL',
    'avg_peaks_per_timepoint REAL NOT NULL',
    'mean_intensity INTEGER NOT NULL',
    'max_intensity INTEGER NOT NULL',
    'ticdata_json TEXT NOT NULL',
    'ticdata_smoothed_json TEXT NOT NULL'
    ]:
    tictable_create.append(f'    BPC_{col},')
    tictable_create.append(f'    msn_{col},')
    tictable_create.append(f'    FullScan_{col},')
tictable_create = '\n'.join(tictable_create).strip(',')
tictable_create += '\n);'

In [23]:
print(tictable_create)

CREATE TABLE IF NOT EXISTS ms_runs (
    run_id TEXT PRIMARY KEY,
    sample_id TEXT NOT NULL,
    run_name TEXT NOT NULL,
    run_time TEXT NOT NULL,
    instrument TEXT NOT NULL,
    author TEXT NOT NULL,
    sample_type TEXT NOT NULL,
    run_type TEXT NOT NULL,
    bait TEXT,
    bait_uniprot TEXT,
    tic_max_time INTEGER NOT NUL,
    BPC_tic_auc REAL NOT NULL,
    msn_tic_auc REAL NOT NULL,
    FullScan_tic_auc REAL NOT NULL,
    BPC_tic_intercepts INTEGER NOT NULL,
    msn_tic_intercepts INTEGER NOT NULL,
    FullScan_tic_intercepts INTEGER NOT NULL,
    BPC_avg_peaks_per_timepoint REAL NOT NULL,
    msn_avg_peaks_per_timepoint REAL NOT NULL,
    FullScan_avg_peaks_per_timepoint REAL NOT NULL,
    BPC_mean_intensity INTEGER NOT NULL,
    msn_mean_intensity INTEGER NOT NULL,
    FullScan_mean_intensity INTEGER NOT NULL,
    BPC_max_intensity INTEGER NOT NULL,
    msn_max_intensity INTEGER NOT NULL,
    FullScan_max_intensity INTEGER NOT NULL,
    BPC_ticdata_json TEXT NOT NULL,
 

## TODO
- mergetä tähän tic skripti
- käytä pelkkiä .xy filuja

In [None]:
datadir = 

In [14]:


# Commit changes and close the connection
conn.commit()
conn.close()
