In [None]:
# FETCH GENE DESCRIPTIONS FOR EACH CIRCADIAN GENE FROM THE ENSEMBL REST API

In [2]:
import pandas as pd
import requests, sys


In [44]:
def scrape_ensembl(genes,df_0):
    """Fetch Ensembl xref descriptions for gene symbols and merge them into ``df_0``.

    One GET request per symbol is sent to the Ensembl REST endpoint
    ``/xrefs/name/human/<symbol>``
    (https://rest.ensembl.org/documentation/info/xref_name). The
    ``display_id`` and ``description`` of the second entry of each JSON
    response are collected.

    Parameters
    ----------
    genes : iterable of str
        Gene symbols to look up.
    df_0 : pandas.DataFrame
        Table to annotate; must contain a ``GeneName`` column.

    Returns
    -------
    pandas.DataFrame
        Inner merge of ``df_0`` with the fetched ``GeneName``/``Description``
        pairs on ``GeneName``, sorted by ``GeneName``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a non-OK status.
    IndexError
        If a response holds fewer than two xref entries.
    """
    server = "https://rest.ensembl.org"
    headers = {"Content-Type": "application/json"}

    records = []
    for gene in genes:
        ext = "/xrefs/name/human/{}?".format(gene)

        r = requests.get(server+ext, headers=headers)
        # Raises for every status that makes `r.ok` false; nothing to do otherwise.
        r.raise_for_status()

        decoded = r.json()
        if len(decoded) < 2:
            raise IndexError(
                "Ensembl returned {} xref entries for {!r}; expected at least 2".format(
                    len(decoded), gene))

        # The second xref entry is the one this analysis has always used.
        xref = decoded[1]
        # Keep the two fields as a tuple: joining and re-splitting them on '; '
        # would break on descriptions that themselves contain '; ', and would
        # turn a missing description into the literal string 'None'.
        records.append((xref['display_id'], xref['description']))

    df = pd.DataFrame(records, columns=['GeneName','Description'])

    return pd.merge(df_0,df,on='GeneName').sort_values(by='GeneName')

In [3]:
# Load the circadian gene list and the candidate-gene table (both tab-delimited)
circadian_genes = pd.read_table('../data/circadian_genes.list')
candidate_genes = pd.read_table('../data/circadian_genes_candidate.tab')


In [48]:
# Look up a description for every gene symbol and merge it into the table.
# NOTE(review): the result is bound back to `circadian_genes`, so re-running
# this cell passes the already-annotated table into scrape_ensembl again.
circadian_genes = scrape_ensembl(
    circadian_genes['GeneName'].tolist(),
    circadian_genes,
)


In [50]:
# Collapse the per-source evidence columns into one comma-separated 'Evidence'
# string per gene, then keep only the columns needed downstream.
# stack() turns each row's source columns into a long Series; dropna() removes
# the empty cells explicitly, because the reimplemented DataFrame.stack
# (future_stack=True, the only behaviour from pandas 3.0) no longer drops them
# and ', '.join would then fail on float NaN. Rows with no evidence at all end
# up with NaN in 'Evidence' through index alignment.
candidate_genes = (
    candidate_genes
    .assign(
        Evidence=lambda frame: (
            frame[['Biosystems','CGDB','GWAS','GO','McMahon']]
            .stack()
            .dropna()
            .groupby(level=0)
            .agg(', '.join)
        )
    )
    .loc[:, ['Gene','Evidence','Confidence']]
    # Non-mutating rename so the key column matches `GeneName` used for merging.
    .rename(columns={'Gene':'GeneName'})
)


In [51]:
# Attach the evidence columns to the circadian genes; the inner join keeps only
# genes present in both tables.
circadian_evidence = circadian_genes.merge(
    candidate_genes,
    how='inner',
    on='GeneName',
)


In [53]:
# SAVE TO EXCEL
# Writes the merged table to an Excel workbook without the DataFrame index.
# NOTE(review): the input files are read from '../data/' but this writes under
# 'data/' relative to the working directory — confirm the output location.
circadian_evidence.to_excel('data/table_1.xlsx', index=False)

In [4]:
# Display the candidate-gene table.
# NOTE(review): the saved output shows the raw per-source columns (Biosystems,
# CGDB, ...), so it was presumably captured before the Evidence cell ran; on a
# top-to-bottom run this cell would show the reduced table instead — confirm.
candidate_genes

Unnamed: 0,Gene,Biosystems,CGDB,GWAS,GO,McMahon,Confidence
0,PER3,biosystems,cgdb,gwas,go,mcmahon,High
1,UTS2,biosystems,,,,,Low
2,ID3,biosystems,,,go,,Medium
3,HCRTR1,biosystems,,,,,Low
4,HDAC1,biosystems,,,go,,Medium
...,...,...,...,...,...,...,...
2401,NKX2-2,,,,,mcmahon,High
2402,NTN1,,,,,mcmahon,High
2403,OPN5,,,,,mcmahon,High
2404,RHO,,,,,mcmahon,High
