#### Import Statements

In [1]:
import cobra
import riptide
import pandas
import time
import os
import glob
import shutil
import mygene

#### Get Sites

In [2]:
def getSiteNames():
    """Return the site name of every .txt file in the txtFiles directory.

    A site name is the part of the file's base name that precedes its first
    "." character. The names come back in whatever order glob.glob yields
    the matching paths.
    """
    pattern = "/gpfs/gpfs0/scratch/vjz3qz/sp23-project/txtFiles/*.txt"
    # Keep what follows the last "/" and, of that, what precedes the first ".".
    return [match.split("/")[-1].split(".")[0] for match in glob.glob(pattern)]


# Module-level list of site names; the trailing bare name displays it
# as the cell output.
sites = getSiteNames()
sites

['brca-t',
 'esophagus_muc',
 'coad-t',
 'kidney',
 'bladder',
 'chol-t',
 'kirp',
 'kirc',
 'cervix',
 'esca-t',
 'thca-t',
 'esophagus_gas',
 'prostate',
 'cesc-t',
 'thyroid',
 'breast',
 'coad',
 'chol',
 'cesc',
 'ucec-t',
 'kich',
 'prad',
 'read-t',
 'esophagus_mus',
 'lihc',
 'lihc-t',
 'read',
 'stad-t',
 'stad',
 'liver',
 'lusc-t',
 'stomach',
 'hnsc-t',
 'ucec',
 'hnsc',
 'kirc-t',
 'colon',
 'luad',
 'prad-t',
 'esca',
 'ucs-t',
 'brca',
 'kich-t',
 'kirp-t',
 'lusc',
 'luad-t',
 'blca',
 'blca-t',
 'lung',
 'salivary',
 'uterus']

#### Make File Tree

In [None]:
# Build the output tree: one directory per site under NormalizedCancerData,
# each holding a TSVFiles and a RiptideOutputs sub-directory.
path="/gpfs/gpfs0/scratch/vjz3qz/sp23-project/NormalizedCancerData/"
for site in sites:
    # makedirs creates any missing parent (the root and the site directory),
    # and exist_ok=True lets the cell be re-run without FileExistsError.
    os.makedirs(path+site+"/TSVFiles", exist_ok=True)
    os.makedirs(path+site+"/RiptideOutputs", exist_ok=True)

#### Remove an old File Tree

In [None]:
# Delete the entire NormalizedCancerData directory and everything below it.
# rmtree's default arguments are used, so any failure (including a missing
# directory) is raised rather than ignored.
path = "/gpfs/gpfs0/scratch/vjz3qz/sp23-project/NormalizedCancerData/"
shutil.rmtree(path)

#### Entrez to Ensembl

In [4]:
def _ensemblFromHit(hit):
    """Return the Ensembl gene ID held in one mygene hit, or "Not ENSG"."""
    ensembl = hit.get('ensembl')
    # When a gene has several Ensembl entries the field is a list of dicts;
    # the first entry is the one used.
    if isinstance(ensembl, list):
        ensembl = ensembl[0] if ensembl else None
    if isinstance(ensembl, dict) and 'gene' in ensembl:
        return ensembl['gene']
    return "Not ENSG"


def geneConverter(entrezIDs): #input a list of entrez ID's
    """Translate Entrez gene IDs into Ensembl gene IDs through mygene.

    Parameters:
        entrezIDs: iterable of Entrez IDs (strings or ints), one per gene.

    Returns:
        A list with exactly one entry per element of entrezIDs, in the same
        order. An entry is the Ensembl gene ID, or the string "Not ENSG" when
        mygene reports no usable Ensembl ID for that query.
    """
    mg = mygene.MyGeneInfo()
    out = mg.querymany(entrezIDs, scopes='entrezgene', fields='ensembl.gene', species='human')
    # querymany can return several hits for one query term ("dup hits"), so
    # the hit list is not guaranteed to line up with entrezIDs. Key the hits
    # on the query they answer instead of relying on list position.
    ensemblByQuery = {}
    for hit in out:
        query = str(hit.get('query'))
        # Keep the first hit that actually carries an Ensembl ID.
        if ensemblByQuery.get(query, "Not ENSG") == "Not ENSG":
            ensemblByQuery[query] = _ensemblFromHit(hit)
    return [ensemblByQuery.get(str(entrezID), "Not ENSG") for entrezID in entrezIDs]

#### Create TSV files for all sites

In [5]:
def createTSV(site):
    """Write one two-column TSV per data column of a site's expression table.

    Reads txtFiles/<site>.txt as a tab-separated table without a header,
    discards its first column and first row, replaces the remaining first
    column (the IDs handed to geneConverter) with the converted IDs, and then
    writes <site><k>.tsv into NormalizedCancerData/<site>/TSVFiles for every
    other column, numbered from 0. Each output file holds the ID column and
    one data column, with no header and no index.

    Files are written through absolute paths, so the process working
    directory is not changed.

    Parameters:
        site: site name, i.e. the base name of the input .txt file.
    """
    file = pandas.read_csv("/gpfs/gpfs0/scratch/vjz3qz/sp23-project/txtFiles/"+site+".txt", sep="\t", header=None, low_memory=False)
    # Discard the first column, then the first row, of the raw table.
    file.drop(file.columns[[0]], axis=1, inplace=True)
    file.drop([0], axis=0, inplace=True)

    ensemblIDs = list(geneConverter(file.iloc[:,0].values))
    # Assign the converted IDs row by row, by position. Building the Series on
    # the table's own index avoids label misalignment that would leave the
    # last row without an ID. If the converter returns a different number of
    # IDs than there are rows, only the overlapping prefix is filled.
    matched = min(len(ensemblIDs), len(file))
    idColumn = file.columns[0]
    file[idColumn] = pandas.Series(ensemblIDs[:matched], index=file.index[:matched], dtype=object)
    #file.insert(loc = 0,column="EnsemblID",value = l)

    outDir = "/gpfs/gpfs0/scratch/vjz3qz/sp23-project/NormalizedCancerData/"+site+"/TSVFiles"
    os.makedirs(outDir, exist_ok=True)
    for i in range(1,len(file.columns)):
        # Pair the ID column with the i-th data column.
        df=file.iloc[:,[0,i]]
        df.to_csv(os.path.join(outDir, site+str(i-1)+".tsv"), sep="\t",header=False, index=False)

In [6]:
# Generate the per-sample TSV files for every site, reporting each one as it
# completes.
for site in sites:
    createTSV(site)
    print(f"{site} finished")

querying 1-1000...done.
querying 1001-2000...done.
querying 2001-3000...done.
querying 3001-4000...done.
querying 4001-5000...done.
querying 5001-6000...done.
querying 6001-7000...done.
querying 7001-8000...done.
querying 8001-9000...done.
querying 9001-10000...done.
querying 10001-11000...done.
querying 11001-12000...done.
querying 12001-13000...done.
querying 13001-14000...done.
querying 14001-15000...done.
querying 15001-16000...done.
querying 16001-17000...done.
querying 17001-18000...done.
querying 18001-19000...done.
querying 19001-19738...done.
Finished.
174 input query terms found dup hits:
	[('84220', 3), ('24150', 4), ('84920', 2), ('5554', 2), ('389852', 2), ('8339', 5), ('4831', 2), ('6
1186 input query terms found no hit:
	['0', '0', '0', '0', '0', '0', '0', '388289', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
Pass "returnall=True" to return complete lists of duplicate or missing query terms.
brca-t finished
querying 1-1000...done.
querying 1001-2000...done.
qu

#### Load and Optimize Model

In [3]:
# Read the Human-GEM model from its SBML file into the module-level `model`.
model = cobra.io.read_sbml_model(
    "/gpfs/gpfs0/scratch/vjz3qz/sp23-project/Human-GEM/model/Human-GEM.xml"
)
# Select GLPK as the solver before optimizing.
model.solver = "glpk"
# Last expression of the cell, so its result is shown as the cell output.
model.slim_optimize()

187.35362997658086

#### Run Riptide

In [None]:
def readTranscription(site, TSVLength): #string site, int length
    """Load the first TSVLength TSV files of a site with riptide.

    Files are read from NormalizedCancerData/<site>/TSVFiles/<site><i>.tsv for
    i = 0 .. TSVLength-1, and the objects returned by
    riptide.read_transcription_file are collected in that order.
    """
    tsvDir = "/gpfs/gpfs0/scratch/vjz3qz/sp23-project/NormalizedCancerData/"+site+"/TSVFiles/"
    return [
        riptide.read_transcription_file(tsvDir+site+str(index)+".tsv")
        for index in range(TSVLength)
    ]

In [None]:
def saveRiptideOutput(site, riptideFiles, TSVLength, start=0):
    """Contextualize the model for each transcriptome and save the results.

    For every index i from start up to len(riptideFiles)-1, prints the current
    time, runs riptide.contextualize on the module-level `model` with
    riptideFiles[i], and saves the result under
    NormalizedCancerData/<site>/RiptideOutputs/<site><i>.

    Parameters:
        site: site name used for the directory and for the output names.
        riptideFiles: transcriptomes as returned by readTranscription.
        TSVLength: unused; kept so existing callers keep working.
        start: index of the first transcriptome to process (default 0), so a
            run can resume after files that were already handled or skipped.
    """
    # Outputs belong in the site's RiptideOutputs directory, which sits next
    # to TSVFiles in the file tree rather than inside it.
    outDir = "/gpfs/gpfs0/scratch/vjz3qz/sp23-project/NormalizedCancerData/"+site+"/RiptideOutputs/"
    pending = range(start, len(riptideFiles))
    if pending:
        # Only touch the filesystem when there is something to save.
        os.makedirs(outDir, exist_ok=True)
    for i in pending:
        # Timestamp each run so progress is visible in the cell output.
        print(time.ctime())
        tempFile=riptide.contextualize(model=model,transcriptome=riptideFiles[i])
        riptide.save_output(riptide_obj=tempFile,path=outDir+site+str(i))

In [None]:
def runRiptide(site, TSVLength, start):
    """Read a site's transcription files, then contextualize and save them.

    The files are loaded with readTranscription(site, TSVLength) and handed,
    together with the same site, TSVLength and start, to saveRiptideOutput.
    """
    saveRiptideOutput(site, readTranscription(site, TSVLength), TSVLength, start)

In [None]:
# Run the riptide pipeline for every site.
# TSVLength=15 is a placeholder value; start=0 begins at the first file.
# No objective fraction is passed anywhere, so riptide's own default applies
# (0.8 according to the original note).
for site in sites:
    runRiptide(site, TSVLength=15, start=0)