In [1]:
import requests
from core import config as cfg
import pandas as pd
import numpy as np

In [2]:
import time
from tqdm import tqdm
import os

# Request ATG9A

##  Request data by the name of genes from '/genes/'

In [112]:
# These parameters can be modified to match any search criteria following
# the rules outlined in the Wiki: https://wiki.thebiogrid.org/doku.php/orcs:webservice
# In this example, we've chosen to only receive scores for the genes ATG9A
# and also to limit to onlyhuman results. Also, we only want scores considered to be significant
# hits.
request_url = cfg.BASE_URL + "/genes/"
params = {
    "accesskey": cfg.ACCESS_KEY,
    "name": "ATG9A",
    #"geneID": "79065",#IDENTIFIER_ID
    #"organismID": "9606",
    "hit": "yes",
    "format": "json"
}
#Inorder to get data from the server, we need to use http as a protocal between the server and 
#our client. To make a http request using python, we can choose the Request library.
r = requests.get( request_url, params = params )

#Extracting the data in jason format
scores = r.json( )
print( "Number of Scores Found: " + str(len(scores)) )



Number of Scores Found: 74


In [113]:
# Step through all the scores and build a unique set of screens
# as well as build a two dimentional structure for storing results
# where the identifier ID is the outer level and the screen ID is the
# inner level. This will allow us to lookup genes and then screens for that
# gene later on.
screen_ids = set( )
genes = {}
count=0

for score in scores :
    screen_ids.add( score['SCREEN_ID'] )
   
    if score['IDENTIFIER_ID'] not in genes :
        genes[score['IDENTIFIER_ID']] = {}

    genes[score['IDENTIFIER_ID']][score['SCREEN_ID']] = score


In [114]:
print( "Number of Unique Screen IDs Found: " + str(len(screen_ids)) )
print( "Number of Genes Found: " + str(len(genes)) )


Number of Unique Screen IDs Found: 74
Number of Genes Found: 2


## Request data by the screen number from '/screens/'

In [80]:
# Make a new request for annotation about all the new screens
request_url = cfg.BASE_URL + "/screens/"

# These parameters can be modified to match any search criteria following
# the rules outlined in the Wiki: https://wiki.thebiogrid.org/doku.php/orcs:webservice
# In this example, we are passing in a list of actual screen IDs we want
params = {
    "accesskey": cfg.ACCESS_KEY,
     "name": "ATG9A",
    "screenID": "|".join( screen_ids ),
    "format": "json"
}

r = requests.get( request_url, params = params )
screens = r.json( )
print( "Number of Screen Details Retrieved: " + str(len(screens)) )


Number of Screen Details Retrieved: 74


In [81]:
# Step through each screen and build a nice index of screens by screen_id
screen_lookup = {}
for screen in screens :
    screen_lookup[screen['SCREEN_ID']] = screen


### Print out a score and its associated screen annotation for ATG9A and screen 590

In [82]:
# Print out a score and its associated screen annotation
# for gene 79065 and screen 590'
print('Print out a score and its associated screen annotation for gene 79065 and screen 590')
print( genes['79065']['590'] )



Print out a score and its associated screen annotation for gene 79065 and screen 590
{'SCREEN_ID': '590', 'IDENTIFIER_ID': '79065', 'IDENTIFIER_TYPE': 'gene', 'OFFICIAL_SYMBOL': 'ATG9A', 'ALIASES': 'APG9L1|MGD3208|mATG9|FLJ22169', 'ORGANISM_ID': '9606', 'ORGANISM_OFFICIAL': 'Homo sapiens', 'SCORE.1': '4.484726063', 'SCORE.2': '2.93e-60', 'SCORE.3': '-', 'SCORE.4': '-', 'SCORE.5': '-', 'HIT': 'YES', 'SOURCE': 'BioGRID ORCS'}


In [83]:
print(screen_lookup['590'] )

{'SCREEN_ID': '590', 'SOURCE_ID': '28877469', 'SOURCE_TYPE': 'pubmed', 'AUTHOR': 'Goodwin JM (2017)', 'SCREEN_NAME': '6-PMID28877469', 'SCORES_SIZE': '1120', 'FULL_SIZE': '1120', 'FULL_SIZE_AVAILABLE': 'Yes', 'NUMBER_OF_HITS': '71', 'ANALYSIS': 'DESeq2', 'SIGNIFICANCE_INDICATOR': 'Score Significance', 'SIGNIFICANCE_CRITERIA': 'Score.1 (Log2) > 1', 'THROUGHPUT': 'High Throughput', 'SCREEN_TYPE': 'Phenotype Screen', 'SCREEN_FORMAT': 'Pool', 'EXPERIMENTAL_SETUP': 'Timecourse', 'DURATION': '7 Days', 'CONDITION_NAME': '-', 'CONDITION_DOSAGE': '-', 'MOI': '-', 'LIBRARY': 'CRISPRn Minipool (Goodwin, 2017)', 'LIBRARY_TYPE': 'CRISPRn', 'METHODOLOGY': 'Knockout', 'ENZYME': 'CAS9', 'CELL_LINE': 'H4 neuroglioma cell', 'CELL_TYPE': 'glioma cell line', 'PHENOTYPE': 'protein/peptide accumulation', 'SCORE_COL_COUNT': '2', 'SCORE.1_TYPE': 'Log2', 'SCORE.2_TYPE': 'p-Value', 'SCORE.3_TYPE': '-', 'SCORE.4_TYPE': '-', 'SCORE.5_TYPE': '-', 'ORGANISM_ID': '9606', 'ORGANISM_OFFICIAL': 'Homo sapiens', 'NOTES':

### Information from the dataframe

In [84]:
# Convert from dic to pandas table
screen_lookup_df = pd.DataFrame.from_dict( screen_lookup, orient='index' )
genes_df = pd.DataFrame.from_dict({(i,j): genes[i][j] 
                           for i in genes.keys() 
                           for j in genes[i].keys()},
                       orient='index')

In [85]:
genes_df.columns

Index(['SCREEN_ID', 'IDENTIFIER_ID', 'IDENTIFIER_TYPE', 'OFFICIAL_SYMBOL',
       'ALIASES', 'ORGANISM_ID', 'ORGANISM_OFFICIAL', 'SCORE.1', 'SCORE.2',
       'SCORE.3', 'SCORE.4', 'SCORE.5', 'HIT', 'SOURCE'],
      dtype='object')

In [86]:
screen_lookup_df.columns

Index(['SCREEN_ID', 'SOURCE_ID', 'SOURCE_TYPE', 'AUTHOR', 'SCREEN_NAME',
       'SCORES_SIZE', 'FULL_SIZE', 'FULL_SIZE_AVAILABLE', 'NUMBER_OF_HITS',
       'ANALYSIS', 'SIGNIFICANCE_INDICATOR', 'SIGNIFICANCE_CRITERIA',
       'THROUGHPUT', 'SCREEN_TYPE', 'SCREEN_FORMAT', 'EXPERIMENTAL_SETUP',
       'DURATION', 'CONDITION_NAME', 'CONDITION_DOSAGE', 'MOI', 'LIBRARY',
       'LIBRARY_TYPE', 'METHODOLOGY', 'ENZYME', 'CELL_LINE', 'CELL_TYPE',
       'PHENOTYPE', 'SCORE_COL_COUNT', 'SCORE.1_TYPE', 'SCORE.2_TYPE',
       'SCORE.3_TYPE', 'SCORE.4_TYPE', 'SCORE.5_TYPE', 'ORGANISM_ID',
       'ORGANISM_OFFICIAL', 'NOTES', 'SOURCE'],
      dtype='object')

In [87]:
selected_genes_df = genes_df[['IDENTIFIER_ID','SCREEN_ID','ALIASES','OFFICIAL_SYMBOL','HIT']]
selected_genes_df.head()

Unnamed: 0,Unnamed: 1,IDENTIFIER_ID,SCREEN_ID,ALIASES,OFFICIAL_SYMBOL,HIT
79065,27,79065,27,APG9L1|MGD3208|mATG9|FLJ22169,ATG9A,YES
79065,29,79065,29,APG9L1|MGD3208|mATG9|FLJ22169,ATG9A,YES
79065,141,79065,141,APG9L1|MGD3208|mATG9|FLJ22169,ATG9A,YES
79065,159,79065,159,APG9L1|MGD3208|mATG9|FLJ22169,ATG9A,YES
79065,149,79065,149,APG9L1|MGD3208|mATG9|FLJ22169,ATG9A,YES


In [88]:
screen_lookup_df_selected = screen_lookup_df[['SCREEN_ID','SIGNIFICANCE_CRITERIA','CELL_LINE','ENZYME','PHENOTYPE','NUMBER_OF_HITS','ANALYSIS','NOTES']]
screen_lookup_df_selected.head()

Unnamed: 0,SCREEN_ID,SIGNIFICANCE_CRITERIA,CELL_LINE,ENZYME,PHENOTYPE,NUMBER_OF_HITS,ANALYSIS,NOTES
24,24,-,K-562 cell,CAS9,toxin resistance,895,CasTLE,authors used a cut-off of FDR<= 10% to determi...
27,27,Score.1 (Bayes Factor) > 3.2,143B,CAS9,viability,2197,BAGEL,Genes with a Bayes Factor (BF) above the thres...
29,29,Score.1 (Bayes Factor) > 3.57,DLD-1 cell,CAS9,viability,1893,BAGEL,Genes with a Bayes Factor (BF) above the thres...
81,81,Score.1 (Log2) < -1,BA/F3 cell,CAS9,viability,3426,Log2 Fold Change (L2FC),
82,82,Score.1 (Log2) < -1,BA/F3 cell,CAS9,viability,4759,Log2 Fold Change (L2FC),


In [89]:
screen_lookup_df_selected[screen_lookup_df_selected['SCREEN_ID'] == '590']

Unnamed: 0,SCREEN_ID,SIGNIFICANCE_CRITERIA,CELL_LINE,ENZYME,PHENOTYPE,NUMBER_OF_HITS,ANALYSIS,NOTES
590,590,Score.1 (Log2) > 1,H4 neuroglioma cell,CAS9,protein/peptide accumulation,71,DESeq2,targeted mini-pool screen for genes whose depl...


In [92]:
summary = pd.merge(selected_genes_df,screen_lookup_df_selected,on='SCREEN_ID',how='left')

In [94]:
summary[summary['SCREEN_ID']=='590']

Unnamed: 0,IDENTIFIER_ID,SCREEN_ID,ALIASES,OFFICIAL_SYMBOL,HIT,SIGNIFICANCE_CRITERIA,CELL_LINE,ENZYME,PHENOTYPE,NUMBER_OF_HITS,ANALYSIS,NOTES
64,79065,590,APG9L1|MGD3208|mATG9|FLJ22169,ATG9A,YES,Score.1 (Log2) > 1,H4 neuroglioma cell,CAS9,protein/peptide accumulation,71,DESeq2,targeted mini-pool screen for genes whose depl...


# Gathering Data from Biogrid

## Request by screen_id from /screens(getting notes info)

In [55]:
request_url = cfg.BASE_URL + "/screens/"
screenID_list=[str(x) for x in range(1,1020)]

params = {
    "accesskey": cfg.ACCESS_KEY,
    "screenID": "|".join(screenID_list),
    "format": "json"
}

r = requests.get( request_url, params = params )
screens_1 = r.json( )
print( "Number of Screens Found: " + str(len(screens)) )

with open('/Users/xinyutang/Desktop/biogridData/screens.json', 'w') as f:
    json.dump(screens, f)


Number of Screens Found: 561


In [99]:
screens =pd.read_json ('/Users/xinyutang/Desktop/biogridData/DataPreprocessing/screens.json')

In [12]:
screens_after650 = np.load('/Users/xinyutang/Desktop/biogridData/DataPreprocessing/screens652.npy',allow_pickle=True)

## Request single screen from /screen

In [21]:
def get_screen(screenID):
    
    screen_id = screenID
    request_url = cfg.BASE_URL + "/screen/" + str(screen_id)

    params = {
        "accesskey": cfg.ACCESS_KEY,
        "format": "json"
    }

    r = requests.get( request_url, params = params )
    screen = r.json( )
    data = {}
    
    for row in screen:
        data[row['IDENTIFIER_ID']] = row


    dataset = pd.DataFrame.from_dict( data, orient='index' )
    
    return dataset

In [5]:
screenID_list=[int(x) for x in range(1,1200)]
df_list=[]

for i in tqdm(screenID_list):
    time.sleep(3)
    try:
        df = get_screen(i)
        df_list.append(df)
        
    except:
            pass
        
screens = pd.concat(df_list,axis=0)
screens = screens.reset_index(drop=True)
screens.to_pickle(('/Users/xinyutang/Desktop/biogridData/DataPreprocessing/single_screen.pkl'))

100%|██████████| 1199/1199 [1:18:22<00:00,  3.92s/it]


In [101]:
screens=pd.read_pickle(('/Users/xinyutang/Desktop/biogridData/DataPreprocessing/single_screen.pkl'))

In [16]:
single_screen=np.load('/Users/xinyutang/Desktop/biogridData/DataPreprocessing/single_screen650.npy',allow_pickle=True)

In [27]:
single_screen[0:1]

IndexError: too many indices for array

### Web crawling on screen page 

## Request by gene_id from /genes

In [4]:
def extractGenes(lower_bound,upper_bound):
    geneID_list=[str(x) for x in range(lower_bound,upper_bound)]

    request_url = cfg.BASE_URL + "/genes/"
    params = {
        "accesskey": cfg.ACCESS_KEY,
    #     "name": "ATG9A",
        "geneID": "|".join(geneID_list),
        #"organismID": "9606",
    #     "hit": "yes",
        "format": "json"
    }
    r = requests.get( request_url, params = params )

    genes = r.json( )
    print( "Number of Scores Found: " + str(len(scores)) )

    with open(f'/Users/xinyutang/Desktop/biogridData/genes{parameter}.json', 'w') as f:
        json.dump(genes, f)


## Load genes

In [7]:
directory = '/Users/xinyutang/Desktop/biogridData'

genes_list=[]
for file in tqdm(os.listdir(directory)):
    time.sleep(3)
    filename = os.fsdecode(file)
    if filename.startswith("gene"): 
        path = os.path.join(directory, filename)
        genes =pd.read_json (path)
        
        try:
            df = genes[['SCREEN_ID','IDENTIFIER_ID','OFFICIAL_SYMBOL','SCORE.1','ALIASES','HIT']]
            df = df.sort_values('SCREEN_ID',ascending='True')
            row_list=[]

            genes_list.append(df)
        
        except Exception:
            pass
    else:
        continue

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rows['SCORE.1.RANK'] = rows['SCORE.1'].rank(ascending=0)
100%|██████████| 790/790 [45:23<00:00,  3.45s/it]


In [8]:
allgenes = pd.concat(genes_list,axis=0)
allgenes = allgenes.sort_values('SCREEN_ID').reset_index(drop=True)
allgenes.to_pickle('/Users/xinyutang/Desktop/biogridData/DataPreprocessing/allGenes.pkl')

In [45]:
allgenes1 = pd.read_pickle('/Users/xinyutang/Desktop/biogridData/DataPreprocessing/allGenes_GENES0.pkl')
allgenes2 = pd.read_pickle('/Users/xinyutang/Desktop/biogridData/DataPreprocessing/allGenes_GENES400000.pkl')

In [62]:
genes_ID_1 = allgenes1[['IDENTIFIER_ID','OFFICIAL_SYMBOL']]
genes_ID_1 = genes_ID_1.drop_duplicates('IDENTIFIER_ID',keep='first').sort_values('IDENTIFIER_ID')
genes_ID_2 = allgenes2[['IDENTIFIER_ID','OFFICIAL_SYMBOL']]
genes_ID_2 = genes_ID_2.drop_duplicates('IDENTIFIER_ID',keep='first').sort_values('IDENTIFIER_ID')
allGenesID = pd.concat([genes_ID_1,genes_ID_2],axis=0).sort_values('IDENTIFIER_ID')
allGenesID.to_pickle('/Users/xinyutang/Desktop/biogridData/GENEID/allGenesID.pkl')

In [83]:
allGenesID[allGenesID['IDENTIFIER_ID']==79065]

Unnamed: 0,IDENTIFIER_ID,OFFICIAL_SYMBOL
14806,79065,ATG9A


In [96]:
screens

Unnamed: 0,SCREEN_ID,IDENTIFIER_ID,IDENTIFIER_TYPE,OFFICIAL_SYMBOL,ALIASES,ORGANISM_ID,ORGANISM_OFFICIAL,SCORE.1,SCORE.2,SCORE.3,SCORE.4,SCORE.5,HIT,SOURCE
0,1,29974,gene,A1CF,ACF|ACF64|ACF65|APOBEC1CF|ASP,9606,Homo sapiens,0,-,-,-,-,NO,BioGRID ORCS
1,1,8086,gene,AAAS,AAA|AAASb|ADRACALA|ADRACALIN|ALADIN|GL003,9606,Homo sapiens,0,-,-,-,-,NO,BioGRID ORCS
2,1,22848,gene,AAK1,KIAA1048|DKFZp686K16132,9606,Homo sapiens,0,-,-,-,-,NO,BioGRID ORCS
3,1,26574,gene,AATF,BFR2|CHE-1|CHE1|DED,9606,Homo sapiens,0.115,-,-,-,-,NO,BioGRID ORCS
4,1,9625,gene,AATK,AATYK|AATYK1|LMR1|LMTK1|PPP1R77|p35BP|KIAA0641,9606,Homo sapiens,0,-,-,-,-,NO,BioGRID ORCS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8490166,686,100287478,gene,USP17L21,-,9606,Homo sapiens,114,-,-,-,-,YES,BioGRID ORCS
8490167,686,6845,gene,VAMP7,SYBL1|TI-VAMP|TIVAMP|VAMP-7,9606,Homo sapiens,13,-,-,-,-,YES,BioGRID ORCS
8490168,686,55062,gene,WIPI1,ATG18|ATG18A|WIPI49|FLJ10055,9606,Homo sapiens,25,-,-,-,-,YES,BioGRID ORCS
8490169,686,653220,gene,XAGE1B,CT12.1|CT12.1A|CT12.1B|CTP9|GAGED2|XAGE-1|XAGE...,9606,Homo sapiens,93,-,-,-,-,YES,BioGRID ORCS


## Visualize these data and give basic summary 

In [9]:
allGenes=pd.read_pickle('/Users/xinyutang/Desktop/biogridData/DataPreprocessing/allGenes.pkl')
screens =pd.read_json ('/Users/xinyutang/Desktop/biogridData/screens.json')

In [21]:
screens_selected = screens[['SCREEN_ID','PHENOTYPE','SCREEN_TYPE','NOTES','THROUGHPUT','ENZYME','CELL_LINE','METHODOLOGY','ORGANISM_ID','SIGNIFICANCE_CRITERIA','SIGNIFICANCE_INDICATOR','FULL_SIZE','EXPERIMENTAL_SETUP']]
summary_selected = pd.merge(allGenes,screens_selected,on='SCREEN_ID',how='left')
summary_selected = summary_selected[['SCREEN_ID','SCREEN_TYPE','IDENTIFIER_ID','OFFICIAL_SYMBOL','PHENOTYPE','NOTES','CELL_LINE','METHODOLOGY','THROUGHPUT','ENZYME','ORGANISM_ID','SCORE.1.RANK','HIT','ALIASES','FULL_SIZE']]

In [22]:
summary_selected.shape

(8127561, 15)

In [27]:
summary_selected = summary_selected[['SCREEN_ID','SCREEN_TYPE','IDENTIFIER_ID','OFFICIAL_SYMBOL','PHENOTYPE','NOTES','CELL_LINE','METHODOLOGY','THROUGHPUT','ENZYME','ORGANISM_ID','SCORE.1.RANK','HIT','ALIASES','FULL_SIZE']]
summary_selected['Rank'] =  summary_selected['SCORE.1.RANK'].astype(str)+'/'+summary_selected['FULL_SIZE'].astype(str)

In [31]:
ATG9A_screen590 = summary_selected[(summary_selected['SCREEN_ID']==590)&(summary_selected['IDENTIFIER_ID']==79065)]

In [32]:
ATG9A_screen590

Unnamed: 0,SCREEN_ID,SCREEN_TYPE,IDENTIFIER_ID,OFFICIAL_SYMBOL,PHENOTYPE,NOTES,CELL_LINE,METHODOLOGY,THROUGHPUT,ENZYME,ORGANISM_ID,SCORE.1.RANK,HIT,ALIASES,FULL_SIZE,Rank
7873424,590,Phenotype Screen,79065,ATG9A,protein/peptide accumulation,targeted mini-pool screen for genes whose depl...,H4 neuroglioma cell,Knockout,High Throughput,CAS9,9606,1.0,YES,APG9L1|MGD3208|mATG9|FLJ22169,1120,1.0/1120


In [30]:
summary_selected.IDENTIFIER_ID.nunique()

50394