In [11]:
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

import numpy as np
import pandas as pd
import scipy.io
import collections
import requests
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time
import re
import random as rn
from bs4 import BeautifulSoup
import nsaba.nsaba as nsaba

In [5]:
def fetch_entrez_ids(term, id_num, timeout=10):
    """Scrape GeneCards for the Entrez Gene IDs of the top hits for ``term``.

    Runs a GeneCards keyword search in a headless PhantomJS browser, follows
    the link of every gene symbol listed in the results table, and extracts
    the number following "Entrez Gene:" from the second ``gc-subsection``
    element of each gene page. Each ID is printed as soon as it is found.

    Parameters
    ----------
    term : str
        Search keyword (gene symbol or name). It is URL-encoded before being
        placed in the query string.
    id_num : int
        Value sent as the ``pageSize`` query parameter (presumably the
        maximum number of hits returned by the search).
    timeout : int or float, optional
        Seconds to wait for the expected element on each page (default 10).

    Returns
    -------
    list
        Entrez Gene IDs as digit strings, in search-result order.

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        If the results table or a gene page's subsections do not appear
        within ``timeout`` seconds.
    ValueError
        If a gene page has no second ``gc-subsection`` or that subsection
        contains no "Entrez Gene: <digits>" entry.
    """
    # Function-scope import so the cell works on both Python 2 and Python 3.
    try:
        from urllib.parse import quote
    except ImportError:
        from urllib import quote

    search_url = ''.join([
        "http://www.genecards.org/Search/Keyword?startPage=0&queryString=",
        quote(term, safe=''), "&pageSize=", str(id_num)])
    # Require at least one digit so an empty ID can never be captured.
    entrez_pattern = re.compile(r"Entrez\sGene:\s([0-9]+)")

    # Phantom JS driver
    driver = webdriver.PhantomJS()
    try:
        driver.get(search_url)
        # until() returns the located element, so a timeout surfaces as a
        # TimeoutException instead of being masked by a second lookup.
        search_table = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.ID, "searchResults")))

        # Collect every link before navigating away: the result elements
        # belong to the search page and cannot be used after driver.get().
        gene_urls = [
            symbol.find_element(By.TAG_NAME, "a").get_attribute("href")
            for symbol in search_table.find_elements(By.CLASS_NAME, "gc-gene-symbol")
        ]

        entrez_ids = []
        for gene_url in gene_urls:
            driver.get(gene_url)
            WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.CLASS_NAME, "gc-subsection")))
            subsections = driver.find_elements(By.CLASS_NAME, "gc-subsection")
            if len(subsections) < 2:
                raise ValueError(
                    "Expected at least two 'gc-subsection' elements on %s, found %d"
                    % (gene_url, len(subsections)))

            match = entrez_pattern.search(subsections[1].text)
            if match is None:
                raise ValueError("No 'Entrez Gene' ID found on %s" % gene_url)

            entrez_ids.append(match.group(1))
            print(entrez_ids[-1])

        return entrez_ids
    finally:
        # quit() (not close()) also terminates the PhantomJS process, and the
        # finally clause guarantees that happens when scraping fails midway.
        driver.quit()

In [6]:
# Search terms sent to GeneCards, one lookup per term.
# NOTE(review): several entries are names/aliases rather than official symbols;
# in the recorded run distinct terms resolved to the same ID (e.g. 'Homer1a' and
# 'Homer1b', 'alpha-neurexin' and 'beta-neurexin', 'GABAaR' and 'GABRB3').
genes = ['MECP2', 'Staufen', 'Pumilio', 'PSD-95', 'Homer1b', 'SHANK', 'beta-neurexin',
          'NLGN1', 'MEF2', 'NPAS4', 'FMRP', 'Homer1a', 'gephyrin', 'alpha-neurexin',
          'NLGN2', 'GABAaR','GRIA1', 'GRIA2', 'GRIA3', 'GRIA4', 'GABRB1', 'GABRB2', 
          'GABRB3', 'GABRD', 'GABRE', 'GABRG1', 'GABRG2', 'GABRG3', 'GABRP', 'GABRQ',
          'GABRR1', 'GABRR2', 'GABRR3', 'GABARAP', 'GABARAPL1', 'GABARAPL2', 'GABARAPL3',
          'GABRA1', 'GABRA2', 'GABRA3', 'GABRA4', 'GABRA5', 'GABRA6']

entrez_ids = []
for gene in genes:
    hits = fetch_entrez_ids(gene, 1)
    # Later cells pair genes[i] with entrez_ids[i]; a search returning zero or
    # several IDs would silently shift every following pair, so fail loudly.
    if len(hits) != 1:
        raise ValueError("Expected exactly one Entrez ID for %r, got %r" % (gene, hits))
    entrez_ids.extend(hits)

4204
27067
9698
22839
9456
81858
9378
22871
4205
266743
2332
9456
10243
9378
57555
2562
2890
2891
2892
2893
2560
2561
2562
2563
2564
2565
2566
2567
2568
55879
2569
2570
200959
11337
23710
11345
23766
2554
2555
2556
2557
2558
2559


In [8]:
# Show each search term next to the Entrez ID scraped for it (paired by position).
list(zip(genes, entrez_ids))

[('MECP2', u'4204'),
 ('Staufen', u'27067'),
 ('Pumilio', u'9698'),
 ('PSD-95', u'22839'),
 ('Homer1b', u'9456'),
 ('SHANK', u'81858'),
 ('beta-neurexin', u'9378'),
 ('NLGN1', u'22871'),
 ('MEF2', u'4205'),
 ('NPAS4', u'266743'),
 ('FMRP', u'2332'),
 ('Homer1a', u'9456'),
 ('gephyrin', u'10243'),
 ('alpha-neurexin', u'9378'),
 ('NLGN2', u'57555'),
 ('GABAaR', u'2562'),
 ('GRIA1', u'2890'),
 ('GRIA2', u'2891'),
 ('GRIA3', u'2892'),
 ('GRIA4', u'2893'),
 ('GABRB1', u'2560'),
 ('GABRB2', u'2561'),
 ('GABRB3', u'2562'),
 ('GABRD', u'2563'),
 ('GABRE', u'2564'),
 ('GABRG1', u'2565'),
 ('GABRG2', u'2566'),
 ('GABRG3', u'2567'),
 ('GABRP', u'2568'),
 ('GABRQ', u'55879'),
 ('GABRR1', u'2569'),
 ('GABRR2', u'2570'),
 ('GABRR3', u'200959'),
 ('GABARAP', u'11337'),
 ('GABARAPL1', u'23710'),
 ('GABARAPL2', u'11345'),
 ('GABARAPL3', u'23766'),
 ('GABRA1', u'2554'),
 ('GABRA2', u'2555'),
 ('GABRA3', u'2556'),
 ('GABRA4', u'2557'),
 ('GABRA5', u'2558'),
 ('GABRA6', u'2559')]

In [10]:
# Display the scraped IDs as integers. The result is only shown, not assigned,
# so `entrez_ids` itself still holds the strings returned by the scraper.
list(map(int, entrez_ids))

[4204,
 27067,
 9698,
 22839,
 9456,
 81858,
 9378,
 22871,
 4205,
 266743,
 2332,
 9456,
 10243,
 9378,
 57555,
 2562,
 2890,
 2891,
 2892,
 2893,
 2560,
 2561,
 2562,
 2563,
 2564,
 2565,
 2566,
 2567,
 2568,
 55879,
 2569,
 2570,
 200959,
 11337,
 23710,
 11345,
 23766,
 2554,
 2555,
 2556,
 2557,
 2558,
 2559]

In [2]:
# Input locations. These are absolute paths on the author's machine and must be
# adjusted before the notebook can be re-run elsewhere.
# `path` is the MATLAB .mat file read with scipy.io.loadmat below.
path = '/Users/Torben/Documents/richard_xyz/subIndLoc.mat'
# Directory handed to Nsaba.ns_load.
ns_path = "/Users/Torben/Documents/ABI analysis/current_data_new/"
# Directory handed to Nsaba.aba_load.
aba_path = '/Users/Torben/Documents/ABI analysis/normalized_microarray_donor9861/'
# Both loaders are called on the Nsaba class itself, before an instance exists.
nsaba.Nsaba.ns_load(ns_path)
nsaba.Nsaba.aba_load(aba_path)
# `dat` is the loaded .mat content; a later cell reads dat['coor_all'] from it.
dat = scipy.io.loadmat(path)
# Instance used by the following cells for gene-expression lookups.
N = nsaba.Nsaba()
# Disabled alternative: load gene-expression data from a pickle instead.
#N.load_ge_pickle(pkl_file='/Users/Torben/Documents/ABI analysis/normalized_microarray_donor9861/Nsaba_ABA_ge.pkl')

This may take a minute or two ...
database.txt loaded.
features.txt loaded.
Nsaba.ns['mni_coords'] initialized.

This may take a minute or two ...
SampleAnnot.csv loaded.
MicroarrayExpression.csv loaded.
Probes.csv loaded.
Nsaba.aba['mni_coords'] initialized.



In [3]:
# Request gene-expression data for the scraped Entrez IDs. The return value is
# discarded, so this presumably populates `N` in place — confirm.
# NOTE(review): `entrez_ids` holds strings here (the int conversion shown in an
# earlier cell is never assigned) — confirm which type get_aba_ge expects.
N.get_aba_ge(entrez_ids)

In [4]:
# Build one [coordinate, gene1 expression, gene2 expression] entry for every
# column of dat['coor_all'].
# NOTE(review): gene1 and gene2 are not defined in any cell visible here; they
# must already exist in the kernel for this cell to run.
all_coords = []
coor_all = dat['coor_all']
for sub in range(coor_all.shape[1]):
    temp_coords = tuple(coor_all[:, sub])
    # One lookup per gene at this coordinate, with the same search settings.
    ge_gene1 = N.coords_to_ge([temp_coords], [gene1], search_radii=10, k=1)
    ge_gene2 = N.coords_to_ge([temp_coords], [gene2], search_radii=10, k=1)
    all_coords.append([temp_coords, ge_gene1, ge_gene2])

In [5]:
# Persist the per-coordinate expression estimates; the target has no directory
# component, so it is written relative to the current working directory.
# NOTE(review): each entry mixes a tuple with arrays that can be empty, so this
# is presumably stored as an object array — confirm it reloads as expected.
np.save('ei_gene_expression_coords_donor9861_closest_point_estimation',all_coords)

In [82]:
# Count the coordinates for which the first gene's lookup returned a value
# (entry[1] is a 1-row array whose row is empty when nothing was found).
# Every entry is inspected; the previous index loop stopped one short and
# never looked at the last coordinate.
total = sum(1 for entry in all_coords if len(entry[1][0]) > 0)
print(total)

318


In [7]:
# Inspect the collected entries: [coordinate tuple, gene1 array, gene2 array].
# Arrays of shape (1, 0) mark coordinates for which coords_to_ge returned no value.
all_coords

[[(-50.0, 29.0, 33.0),
  array([], shape=(1, 0), dtype=float64),
  array([], shape=(1, 0), dtype=float64)],
 [(-54.0, 24.0, 25.0), array([[ 9.0334133]]), array([[ 8.80621525]])],
 [(-58.0, 20.0, 17.0), array([[ 9.25418032]]), array([[ 8.53468022]])],
 [(-59.0, 15.0, 9.0), array([[ 9.25418032]]), array([[ 8.53468022]])],
 [(-58.0, 8.0, 0.0), array([[ 9.13367406]]), array([[ 8.99115028]])],
 [(-59.0, 2.0, -7.0), array([[ 9.13367406]]), array([[ 8.99115028]])],
 [(-14.0, 13.0, -22.0), array([[ 9.23924859]]), array([[ 8.0557026]])],
 [(-16.0, 9.0, -27.0),
  array([], shape=(1, 0), dtype=float64),
  array([], shape=(1, 0), dtype=float64)],
 [(-18.0, 2.0, -34.0), array([[ 9.27513104]]), array([[ 8.51183483]])],
 [(-26.0, -2.0, -36.0), array([[ 9.10227015]]), array([[ 8.76778328]])],
 [(-36.0, -6.0, -36.0), array([[ 9.38836475]]), array([[ 8.69642316]])],
 [(-47.0, -8.0, -33.0), array([[ 9.65416816]]), array([[ 8.81064315]])],
 [(-58.0, -67.0, 24.0),
  array([], shape=(1, 0), dtype=float64),
