In [1]:
import os
import subprocess, shlex
import simplejson as json
import pandas as pd
from dashboard.data_load import load_mouse_blastp_results, load_esmfold
from dashboard import util
from pathlib import Path

import sqlite3



In [2]:
from dashboard.etl import DATA_DIR, CACHE_DIR


In [3]:
db_address = DATA_DIR.joinpath('autoimmune_expression_atlas_v1.db')
df = util.query_de_transcripts('ENST00000510048.1', db_address).fillna(0.01)

In [4]:
  
with sqlite3.connect(db_address) as sqliteConnection:
    available_studies_df = pd.read_sql("SELECT * FROM transcript_de LIMIT 20", sqliteConnection)

available_studies_df

Unnamed: 0,velia_study,contrast
0,ERP106487,CROHNSDISEASE_VS_CONTROL
1,ERP106487,ULCERATIVECOLITIS_VS_CONTROL
2,ERP106487,ULCERATIVECOLITIS_VS_CROHNSDISEASE
3,GSE102371,TYPE_1_DIABETES_vs_CONTROL
4,GSE110914,PRE_TYPE_1_DIABETES_vs_CONTROL
5,GSE110914,TYPE_1_DIABETES_vs_CONTROL
6,GSE110914,TYPE_1_DIABETES_vs_PRE_TYPE_1_DIABETES
7,GSE112087,SYSTEMIC_LUPUS_ERYTHEMATOSUS_vs_CONTROL
8,GSE120178,RHEUMATOID_ARTHRITIS_vs_CONTROL
9,GSE122459,SYSTEMIC_LUPUS_ERYTHEMATOSUS_vs_CONTROL


In [6]:
metadata = {k:v for k, v in df[['contrast', 'velia_study']].values}
metadata

{'SRP100787': 'ULCERATIVE_COLITIS_vs_ULCERATIVE_COLITIS_REMISSION'}

In [7]:
df[['velia_study', 'contrast']]

Unnamed: 0,velia_study,contrast
0,SRP100787,CROHNS_DISEASE_vs_CONTROL
1,SRP100787,ULCERATIVE_COLITIS_vs_CONTROL
2,SRP100787,ULCERATIVE_COLITIS_vs_ULCERATIVE_COLITIS_REMIS...


In [11]:
cache_dir = Path('/home/ubuntu/repos/dashboard/cache/')

In [8]:
output_prefix = CACHE_DIR.joinpath('protein_data')

In [13]:
subprocess.run(shlex.split(f'docker run  --rm -v {output_prefix}:/data -v /efs/databases/blast:/db ncbi/blast blastp -task blastp-fast -outfmt 15 -db /db/mouse.protein.faa -query /data/protein_tools_input.fasta -max_target_seqs 20 -out /data/blastp.results.json'))


CompletedProcess(args=['docker', 'run', '--rm', '-v', '/home/ubuntu/repos/dashboard/cache/protein_data:/data', '-v', '/efs/databases/blast:/db', 'ncbi/blast', 'blastp', '-task', 'blastp-fast', '-outfmt', '15', '-db', '/db/mouse.protein.faa', '-query', '/data/protein_tools_input.fasta', '-max_target_seqs', '20', '-out', '/data/blastp.results.json'], returncode=0)

In [22]:
cmd = f'docker run --gpus all -it -v /home/ubuntu/repos/protein_tools:/opt/openfold -v {output_prefix}:/data 328315166908.dkr.ecr.us-west-2.amazonaws.com/esmfold:latest python /opt/openfold/run_batch_fasta.py /data/protein_tools_input.fasta /data/esmfold.jsonlines'
subprocess.run(shlex.split(cmd))

the input device is not a TTY


CompletedProcess(args=['docker', 'run', '--gpus', 'all', '-it', '-v', '/home/ubuntu/repos/protein_tools:/opt/openfold', '-v', '/home/ubuntu/repos/dashboard/cache/protein_data:/data', '328315166908.dkr.ecr.us-west-2.amazonaws.com/esmfold:latest', 'python', '/opt/openfold/run_batch_fasta.py', '/data/protein_tools_input.fasta', '/data/esmfold.jsonlines'], returncode=1)

In [23]:
print(cmd)

docker run --gpus all -it -v /home/ubuntu/repos/protein_tools:/opt/openfold -v /home/ubuntu/repos/dashboard/cache/protein_data:/data 328315166908.dkr.ecr.us-west-2.amazonaws.com/esmfold:latest python /opt/openfold/run_batch_fasta.py /data/protein_tools_input.fasta /data/esmfold.jsonlines


In [17]:
esmfold = load_esmfold()


2023-12-04 20:56:16.680 
  command:

    streamlit run /opt/conda/envs/veliadash_updated/lib/python3.11/site-packages/ipykernel_launcher.py [ARGUMENTS]
2023-12-04 20:56:16.681 No runtime found, using MemoryCacheStorageManager


In [19]:
len(esmfold.keys())

13

In [20]:
esmfold.keys()

dict_keys(['MVQGSCWAAGKRGCTS', 'MISAHCDLCFLGSRILLPQPPK', 'MRSRVHTPRGGEALNRETA', 'MPRINREGRRWLFSCLIVFLLSELESHEGEQQKTTN', 'MRFFRLTFKCFVDCF', 'MILQSLLFLQRLLMISTKPAVVLLWPLLKKVENTLMQHVHPNLPA', 'MQHPGEPTCILLARCLECNTTCESENLPRPPAMD', 'MTHNSGSGPGTSNIIK', 'MLEGGFRRKMILCILSLHPNF', 'MANDRRRAQTGPCYDLSWSGSD', 'MQEVSRKGRTPGFEQNFGL', 'MHYLIKRRMDLLFLQLVFQELI', 'MAEIRTLHFAACALRPQNLLYPT'])

In [14]:
with open(cache_dir.joinpath('protein_data', 'blastp.results.json'), 'r') as fopen:
    blastp = json.load(fopen)
    blastp = blastp['BlastOutput2']