# WS_ch05E.ipynb
# WESmith 12/09/22
## WS created this notebook to follow along with the code in the book:
## 'Bioinformatics with Python Cookbook' by Tiago Antao
## Each recipe will have its own notebook, suffixed by A, B, etc.

# RECIPE: 
# FINDING ORTHOLOGUES WITH THE ENSEMBL REST API
## also see book code in Chapter05/Orthology.py

In [146]:
import pprint as pp

import requests
import yaml

In [164]:
# Base URL of the Ensembl REST service, reached over HTTPS so requests are
# not sent in clear text.
# WS: the same URL can be opened in a browser to learn about the commands.
ensembl_server = 'https://rest.ensembl.org'

In [162]:
def print_d(d):
    """WS convenience function: print a (possibly nested) dict as block-style YAML."""
    as_yaml = yaml.dump(d, default_flow_style=False)
    print(as_yaml)

In [163]:
# Small nested dict used to try out print_d
dd = {
    'a': 1,
    'b': {
        'c': 100,
        'd': {'e': 'hi'},
        'f': 1000,
    },
}
print_d(dd)

a: 1
b:
  c: 100
  d:
    e: hi
  f: 1000



In [144]:
# Same nested dict through the stdlib pretty-printer, for comparison with print_d.
# `pp` is the stdlib `pprint` module (imported as `pp` in the imports cell);
# without that import this cell fails with NameError on a fresh kernel.
ptest = pp.PrettyPrinter(depth=4)
ptest.pprint(dd)

{'a': 1, 'b': {'c': 100, 'd': {'e': 'hi'}, 'f': 1000}}


In [130]:
# Quick check: str.format with a run of five repeated spaces
space = ' '
print('hi{}you'.format(space * 5))

hi     you


### CREATE SUPPORT FUNCTION FOR A WEB REQUEST

In [4]:
def do_request(server, service, *args, **kwargs):
    """Send a GET request to a REST endpoint and return the decoded JSON body.

    The URL is built as ``<server>/<service>[/<arg>...]``.

    Parameters
    ----------
    server : str
        Base URL of the REST server (no trailing slash).
    service : str
        Endpoint path, e.g. ``'lookup/symbol'``.
    *args
        Extra path components, appended in order; ``None`` entries are
        skipped and other values are converted with ``str``.
    **kwargs
        Passed to ``requests.get`` as URL query parameters.

    Returns
    -------
    The JSON-decoded response body.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If connecting or reading exceeds the timeout below.
    """
    url_params = ''.join('/' + str(a) for a in args if a is not None)
    req = requests.get('%s/%s%s' % (server, service, url_params),
                       params=kwargs,
                       headers={'Content-Type': 'application/json'},
                       # (connect, read) seconds: without a timeout a stalled
                       # connection would block the kernel indefinitely
                       timeout=(10, 120))
    # raise_for_status() is a no-op on success, so no separate `ok` check is needed
    req.raise_for_status()
    return req.json()

### 2) CHECK AVAILABLE SPECIES ON THE SERVER

In [9]:
# Fetch the species catalogue from the server
# (WS: 315 species when run on 12/09/22)
answer = do_request(
    ensembl_server,
    'info/species',
)

In [14]:
# The species records sit under the 'species' key of the response
type(answer['species']) # list of dicts

list

In [15]:
# Inspect the fields available on one species record (the first one)
answer['species'][0].keys()

dict_keys(['display_name', 'strain_collection', 'assembly', 'common_name', 'name', 'strain', 'taxon_id', 'accession', 'aliases', 'groups', 'division', 'release'])

In [16]:
# List every species by position and common name
for idx, species in enumerate(answer['species']):
    print(idx, species['common_name'])

0 gelada
1 Drill
2 mouse
3 Eurasian red squirrel
4 california sea lion
5 Macaque
6 lyretail cichlid
7 Australian saltwater crocodile
8 horned golden-line barbel
9 Paramormyrops kingsleyae
10 silver-eye
11 Chinook salmon
12 Indian peafowl
13 Black snub-nosed monkey
14 narwhal
15 Crab-eating macaque
16 Collared flycatcher
17 Abingdon island giant tortoise
18 Lesser Egyptian jerboa
19 mouse
20 Upper Galilee mountains blind mole rat
21 Siberian musk deer
22 pig
23 American mink
24 Goat
25 Indian glassy fish
26 American black bear
27 pig
28 sea lamprey
29 lion
30 rabbit
31 Golden snub-nosed monkey
32 round goby
33 spiny chromis
34 Norway rat
35 Arctic ground squirrel
36 huchen
37 naked mole-rat
38 Philippine tarsier
39 helmeted guineafowl
40 red-bellied piranha
41 platypus
42 emu
43 small-eared galago
44 Daurian ground squirrel
45 northern pike
46 European shrew
47 pig
48 golden-collared manakin
49 large flying fox
50 Kakapo
51 tongue sole
52 pig
53 Canada lynx
54 Indian medaka
55 pike-perc

### 3) FIND HGNC DATABASES RELATED TO HUMAN DATA

In [17]:
# Ask which external databases matching 'HGNC%' are registered for human
ext_dbs = do_request(
    ensembl_server, 'info/external_dbs', 'homo_sapiens',
    filter='HGNC%',
)
print(ext_dbs)

[{'description': None, 'display_name': 'HGNC Symbol', 'name': 'HGNC', 'release': '1'}, {'name': 'HGNC_trans_name', 'display_name': 'Transcript name', 'release': '1', 'description': 'transcript name from HGNC'}]


In [155]:
# WS: not clear what this tells us beyond the fact that HGNC databases exist
for db_entry in ext_dbs:  # WS: ext_dbs is a list of dicts
    print_d(db_entry)

description: null
display_name: HGNC Symbol
name: HGNC
release: '1'

description: transcript name from HGNC
display_name: Transcript name
name: HGNC_trans_name
release: '1'



### 4) RETRIEVE ENSEMBL ID FOR THE LCT GENE

In [27]:
# Look up the human LCT gene by its symbol
answer = do_request(
    ensembl_server,
    'lookup/symbol',
    'homo_sapiens',
    'LCT',
)

In [156]:
# Show the lookup record as YAML (WS helper print_d)
print_d(answer)

assembly_name: GRCh38
biotype: protein_coding
canonical_transcript: ENST00000264162.7
db_type: core
description: lactase [Source:HGNC Symbol;Acc:HGNC:6530]
display_name: LCT
end: 135837184
id: ENSG00000115850
logic_name: ensembl_havana_gene_homo_sapiens
object_type: Gene
seq_region_name: '2'
source: ensembl_havana
species: homo_sapiens
start: 135787850
strand: -1
version: 10



In [43]:
# Keep the gene id from the lookup record; the later requests are keyed on it
lct_id = answer['id']
lct_id

'ENSG00000115850'

In [68]:
# Gene span in bases; start and end are both inclusive, hence the +1
gene_length = 1 + (answer['end'] - answer['start'])
gene_length

49335

In [45]:
# Download the sequence record for the LCT gene id
lct_seq = do_request(
    ensembl_server,
    'sequence/id',
    lct_id,
)

In [49]:
# Fields present in the sequence record
lct_seq.keys()

dict_keys(['version', 'molecule', 'id', 'desc', 'query', 'seq'])

In [80]:
# WS: show every field of the record except the sequence itself
for field, value in lct_seq.items():
    if field != 'seq':  # the full sequence causes a javascript error: see below
        print('{:20s}: {}'.format(field, value))

version             : 10
molecule            : dna
id                  : ENSG00000115850
desc                : chromosome:GRCh38:2:135787850:135837184:-1
query               : ENSG00000115850


In [69]:
# The sequence is a string 49335 bases long: consistent with the length
# computed from the start/end metadata (gene_length)
len(lct_seq['seq'])

49335

In [71]:
# WS: the whole sequence is too large to display; doing so gives
# 'javascript error: too much recursion', so show only the first 200 bases
lct_seq['seq'][:200]

'AACAGTTCCTAGAAAATGGAGCTGTCTTGGCATGTAGTCTTTATTGCCCTGCTAAGTTTTTCATGCTGGGGGTCAGACTGGGAGTCTGATAGAAATTTCATTTCCACCGCTGGTCCTCTAACCAATGACTTGCTGCACAACCTGAGTGGTCTCCTGGGAGACCAGAGTTCTAACTTTGTAGCAGGGGACAAAGACATGTA'

# _____________________________________________

### WS TODO) 
### GET THE GFF FILE FOR THIS LACTASE GENE AND DISPLAY THE EXONs, CDSs, UTRs
### SEE WS_ch05D.ipynb AND WS_experiments.ipynb

# _____________________________________________

### 6) INSPECT OTHER DATABASES KNOWN TO ENSEMBL FOR THIS GENE

In [72]:
# Fetch the cross-references recorded for the LCT gene id
lct_xrefs = do_request(
    ensembl_server,
    'xrefs/id',
    lct_id,
)

In [92]:
# Display name of the database behind each cross-reference
# (WS: print_d(entry) shows the full record if more detail is needed)
for xref_entry in lct_xrefs:
    print(xref_entry['db_display_name'])

LRG display in Ensembl gene
Expression Atlas
NCBI gene (formerly Entrezgene)
HGNC Symbol
MIM gene
MIM morbid
Reactome gene
Reactome gene
Reactome gene
Reactome gene
Reactome gene
Reactome gene
Reactome gene
UniProtKB Gene Name
UniProtKB Gene Name
WikiGene


In [82]:
# Cross-references again, this time restricted to the GO external database
refs = do_request(
    ensembl_server, 'xrefs/id', lct_id,
    external_db='GO',
    all_levels='1',
)

In [101]:
# WS: too many entries to dump in full with print_d, so show three fields each
for go_ref in refs:
    print(go_ref['display_id'], go_ref['linkage_types'], go_ref['description'])

GO:0000016 ['IDA', 'IEA', 'IMP'] lactase activity
GO:0000016 ['IEA'] lactase activity
GO:0000016 ['IBA'] lactase activity
GO:0003824 ['IEA'] catalytic activity
GO:0004336 ['ISS'] galactosylceramidase activity
GO:0004348 ['ISS'] glucosylceramidase activity
GO:0004553 ['IEA'] hydrolase activity, hydrolyzing O-glycosyl compounds
GO:0004553 ['IEA'] hydrolase activity, hydrolyzing O-glycosyl compounds
GO:0004553 ['IEA'] hydrolase activity, hydrolyzing O-glycosyl compounds
GO:0005886 ['TAS'] plasma membrane
GO:0005886 ['IEA'] plasma membrane
GO:0005887 ['TAS'] integral component of plasma membrane
GO:0005975 ['IEA'] carbohydrate metabolic process
GO:0005975 ['IEA'] carbohydrate metabolic process
GO:0005990 ['IDA'] lactose catabolic process
GO:0008152 ['IEA'] metabolic process
GO:0008422 ['IDA', 'ISS'] beta-glucosidase activity
GO:0016020 ['IEA'] membrane
GO:0016021 ['IEA'] integral component of membrane
GO:0016324 ['IEA'] apical plasma membrane
GO:0016787 ['IEA'] hydrolase activity
GO:001679

### 7) GET THE ORTHOLOGUES FOR THIS GENE ON THE HORSE GENOME

In [102]:
# Request the orthologues of the LCT gene, without the aligned sequences
hom_response = do_request(
    ensembl_server, 'homology/id', lct_id,
    type='orthologues',
    sequence='none',
)

In [157]:
# Pick the horse (equus_caballus) entry out of the orthologue list and keep
# the id of its target gene.
homologies = hom_response['data'][0]['homologies']
# Reset first: otherwise a value left over from an earlier run would survive
# silently when no horse entry is present.
horse_id = None
for homology in homologies:
    if homology['target']['species'] != 'equus_caballus':
        continue
    print_d(homology) # WS function
    horse_id = homology['target']['id']
# Fail here, with a clear message, rather than later with a NameError/None id
assert horse_id is not None, 'no equus_caballus orthologue found in hom_response'

dn_ds: null
method_link_type: ENSEMBL_ORTHOLOGUES
source:
  cigar_line: 351M2D1576M
  id: ENSG00000115850
  perc_id: 85.8329
  perc_pos: 92.2678
  protein_id: ENSP00000264162
  species: homo_sapiens
  taxon_id: 9606
target:
  cigar_line: 1929M
  id: ENSECAG00000018594
  perc_id: 85.7439
  perc_pos: 92.1721
  protein_id: ENSECAP00000016483
  species: equus_caballus
  taxon_id: 9796
taxonomy_level: Boreoeutheria
type: ortholog_one2one



In [114]:
# Target gene id of the horse orthologue found in the loop above
horse_id

'ENSECAG00000018594'

### 8) GET ENSEMBL RECORD FOR horse_id

In [158]:
# Look up the record for the horse gene by its id
horse_req = do_request(
    ensembl_server,
    'lookup/id',
    horse_id,
)

In [160]:
# Show the horse gene record as YAML, for comparison with the human LCT record
print_d(horse_req) # WS function

assembly_name: EquCab3.0
biotype: protein_coding
canonical_transcript: ENSECAT00000020097.3
db_type: core
description: lactase [Source:VGNC Symbol;Acc:VGNC:19613]
display_name: LCT
end: 19724999
id: ENSECAG00000018594
logic_name: ensembl
object_type: Gene
seq_region_name: '18'
source: ensembl
species: equus_caballus
start: 19678126
strand: -1
version: 3



### WS TODO) GET THE GENE SEQUENCE, GFF FILE AND PLOT GENE
### COMPARE TO HUMAN