# WS_ch05E.ipynb
# WESmith 12/09/22
## WS created this notebook to follow along with the code in the book:
## 'Bioinformatics with Python Cookbook' by Tiago Antao
## Each recipe will have its own notebook, suffixed by A, B, etc.

# RECIPE: 
# FINDING ORTHOLOGUES WITH THE ENSEMBL REST API
## also see book code in Chapter05/Orthology.py

In [None]:
import requests
import yaml

In [None]:
# Base URL of the Ensembl REST service, passed as `server` to every request
# in this notebook. HTTPS is used so queries are not sent in clear text.
ensembl_server = 'https://rest.ensembl.org'
# WS can go there in a browser also to learn about commands

In [None]:
def print_d(d):
    """Print *d* as block-style YAML.

    WS convenience function: makes a nested dict readable by dumping it
    with ``default_flow_style=False`` instead of printing its repr.
    """
    rendered = yaml.dump(d, default_flow_style=False)
    print(rendered)

In [None]:
# Small nested dict used to try out print_d.
dd = {
    'a': 1,
    'b': {
        'c': 100,
        'd': {'e': 'hi'},
        'f': 1000,
    },
}
print_d(dd)

In [None]:
# Scratch cell: str.format drops a run of five spaces into the placeholder.
space = ' '
print('hi{}you'.format(space * 5))

### CREATE SUPPORT FUNCTION FOR A WEB REQUEST

In [None]:
def do_request(server, service, *args, **kwargs):
    """Send a GET request to a REST endpoint and return the decoded JSON.

    The URL is ``server/service`` followed by one ``/``-separated path
    segment per positional argument; keyword arguments become the query
    string.

    Args:
        server: Base URL of the REST server.
        service: Endpoint path below the server, e.g. ``'info/species'``.
        *args: Extra path segments. ``None`` entries are skipped; any other
            value is converted with ``str`` so non-string ids work too.
        **kwargs: Query-string parameters, handed to ``requests.get`` as
            ``params``.

    Returns:
        The response body as parsed by ``Response.json()``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond in time.
    """
    url_params = ''.join('/' + str(a) for a in args if a is not None)
    req = requests.get('%s/%s%s' % (server, service, url_params),
                       params=kwargs,
                       headers={'Content-Type': 'application/json'},
                       # Fail instead of hanging forever on a stalled server.
                       timeout=60)
    # Does nothing for a successful response, so no separate `ok` check.
    req.raise_for_status()
    return req.json()

### 2) CHECK AVAILABLE SPECIES ON THE SERVER

In [None]:
# 315 species when run on 12/09/22
# WS: requests <server>/info/species; the 'species' entry of the response is
# inspected in the following cells
answer = do_request(ensembl_server, 'info/species')

In [None]:
type(answer['species'])  # WS: a list of dicts (one dict per species)

In [None]:
# WS: fields of the first species record
answer['species'][0].keys()

In [None]:
# WS: print each species' position in the list next to its common name
for i, sp in enumerate(answer['species']):
    print(i, sp['common_name'])

### 3) FIND HGNC DATABASES RELATED TO HUMAN DATA

In [None]:
# Requests <server>/info/external_dbs/homo_sapiens with the query parameter
# filter='HGNC%'.
# NOTE(review): '%' presumably acts as a wildcard in the filter pattern;
# confirm against the Ensembl REST documentation.
ext_dbs = do_request(ensembl_server, 'info/external_dbs', 
                     'homo_sapiens', filter='HGNC%')
print(ext_dbs)

In [None]:
# WS: not clear what this info is saying: just that an HGNC db exists?
# Each entry is pretty-printed separately so its fields are readable.
for k in ext_dbs: # WS list of dicts
    print_d(k)

### 4) RETRIEVE ENSEMBL ID FOR THE LCT GENE

In [None]:
# Requests <server>/lookup/symbol/homo_sapiens/LCT. Note that this rebinds
# `answer`, which held the species list until now.
answer = do_request(ensembl_server, 'lookup/symbol', 
                    'homo_sapiens', 'LCT')

In [None]:
# WS: show the whole lookup record for LCT
print_d(answer)

In [None]:
# Keep the record's 'id' field; it is the identifier used by the sequence,
# xrefs and homology requests further down.
lct_id = answer['id']
lct_id

In [None]:
# Gene length from the record's coordinates.
# NOTE(review): the +1 assumes 'start' and 'end' are both inclusive — confirm
# against the length of the sequence fetched below.
gene_length = answer['end'] - answer['start'] + 1
gene_length

In [None]:
# Requests <server>/sequence/id/<lct_id>; the response is a dict with a 'seq'
# entry plus metadata fields.
lct_seq = do_request(ensembl_server, 'sequence/id', lct_id)

In [None]:
# WS: fields of the sequence record
lct_seq.keys()

In [None]:
# WS: print every field of the sequence record except the sequence itself,
# which is too large to display (javascript error: see below)
for k, v in lct_seq.items():
    if k != 'seq':
        print('{:20s}: {}'.format(k, v))

In [None]:
# WS: the sequence string was 49335 bases long, consistent with the metadata
len(lct_seq['seq'])

In [None]:
# WS: show only the first 200 bases; the full string is too large to display
# and gives 'javascript error: too much recursion'
lct_seq['seq'][0:200]

# _____________________________________________

### WS TODO) 
### GET THE GFF FILE FOR THIS LACTASE GENE AND DISPLAY THE EXONs, CDSs, UTRs
### SEE WS_ch05D.ipynb AND WS_experiments.ipynb

# _____________________________________________

### 6) INSPECT OTHER DATABASES KNOWN TO ENSEMBL FOR THIS GENE

In [None]:
# Requests <server>/xrefs/id/<lct_id>; the result is iterated as a list of
# cross-reference records in the next cell.
lct_xrefs = do_request(ensembl_server, 'xrefs/id', lct_id)

In [None]:
# Print only the display name of the database behind each cross-reference;
# the commented-out lines show the full record instead.
for xref in lct_xrefs:
    print(xref['db_display_name'])
    #print(xref)
    #print_d(xref)  # WS

In [None]:
# Same xrefs/id request as above, now with the query parameters
# external_db='GO' and all_levels='1'.
refs = do_request(ensembl_server, 'xrefs/id', lct_id, 
                  external_db='GO', all_levels='1')

In [None]:
#print(lct_id, refs)
# One line per returned record: its display id, linkage types and description.
for k in refs: # WS
    #print_d(k) # WS  too many entries to print all
    print(k['display_id'], k['linkage_types'], k['description'])

### 7) GET THE ORTHOLOGUES FOR THIS GENE ON THE HORSE GENOME

In [None]:
# Requests <server>/homology/id/<lct_id> with the query parameters
# type='orthologues' and sequence='none'. The response is not limited to the
# horse here; the next cell filters on the target species.
hom_response = do_request(ensembl_server, 'homology/id', lct_id, 
                          type='orthologues', sequence='none')

In [None]:
#print(hom_response['data'][0]['homologies'])
homologies = hom_response['data'][0]['homologies']
# Reset first, so a value left over from an earlier run of this cell can never
# be mistaken for the result of this search.
horse_id = None
for homology in homologies:
    #print(homology['target']['species'])
    if homology['target']['species'] != 'equus_caballus':
        continue
    print_d(homology) # WS function
    #print(homology['taxonomy_level'])
    # If several horse entries are returned, the last one is kept.
    horse_id = homology['target']['id']

# Fail here with a clear message rather than later with a NameError.
if horse_id is None:
    raise LookupError('no equus_caballus orthologue in the homology response')

In [None]:
# WS: id of the horse orthologue picked in the previous cell
horse_id

### 8) GET ENSEMBL RECORD FOR horse_id

In [None]:
# Requests <server>/lookup/id/<horse_id> to get the record for the horse gene.
horse_req = do_request(ensembl_server, 'lookup/id', horse_id)

In [None]:
# Show the whole lookup record for the horse gene.
print_d(horse_req) # WS function

### WS TODO) GET THE GENE SEQUENCE, GFF FILE AND PLOT GENE
### COMPARE TO HUMAN