In [441]:
#!pip install selenium
#!pip install bs4
#!pip install fnmatch

In [438]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import InvalidElementStateException
import time
import pprint
import re
import csv
import fnmatch
import os

In [167]:
BASE_URL = 'https://scholar.google.com/citations'

In [442]:
def get_users(): 
    
    #links from spreadsheet, next time we can grab from wixs in future
    sla_authors = ['https://scholar.google.com/citations?user=v9rIqPQAAAAJ&hl=en',
    'https://scholar.google.com/citations?hl=en&user=igIKOZcAAAAJ&view_op=list_works&sortby=pubdate',
    'https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q=timothy+malloy+ucla&btnG=',
    'https://scholar.google.com/citations?user=gkygJ18AAAAJ&hl=en&oi=ao',
    'https://scholar.google.com/citations?user=1AG9uzsAAAAJ&hl=en',
    'https://scholar.google.com/citations?user=Rl5xhgwAAAAJ&hl=en&oi=ao',
    'https://scholar.google.com/citations?user=AfHMFccAAAAJ&hl=en&oi=ao',
    'https://scholar.google.com/citations?user=VXvek_AAAAAJ&hl=en&oi=sra',
    'https://scholar.google.com/citations?user=9CC9gHoAAAAJ&hl=en&oi=ao',
    'https://scholar.google.com/citations?user=rOvEiAYAAAAJ&hl=en',
    'https://scholar.google.com/citations?user=GVYNtNsAAAAJ&hl=en',
    'https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q=CW+Wong+ucla&btnG=',
    'https://scholar.google.com/citations?user=ceCfTvcAAAAJ&hl=en&oi=ao',
    'https://scholar.google.com/citations?user=kvSIKM8AAAAJ&hl=en&oi=ao']

    user_id_regex = re.compile("user=(.{12})&")

    user_id = [m.group(1) for a in sla_authors for m in [user_id_regex.search(a)] if m]
    #[au in au for sla_authors]

    #print(user_id)
    return user_id

In [455]:
def get_full_user_page(user):
    """
    Will click to show all of the articles.
    
    :param user: an ID for a google scholar user
    :returns: returns a list of articles
    
    """
    
    url = BASE_URL + "?user=" + user
    
    driver.get(url)

    pages_remaining = True

    while pages_remaining:

        try:
            #Checks if there are more pages with links
            driver.find_element_by_id('gsc_bpf_more').click()         
            time.sleep(5)
            #print('pages_remaining ', pages_remaining)
        except InvalidElementStateException:
            gscholar_citations = driver.page_source
            pages_remaining = False
            #print('pages_remaining ', pages_remaining)
    
    #driver.quit()
    
    return gscholar_citations


In [444]:
def get_author_citations_links(page):
    """
    Will take a google sholar profile page and convert to BeautifulSoup object, 
    then extract the author & citation links. Then returns as a list. 
    
    :param page: a google scholar author page
    :returns: returns a list that contains the name of the author and the citation links
    
    """
    #make a soup object
    gscholar_soup = BeautifulSoup(page, 'lxml')
    
   
    author = gscholar_soup.find(id="gsc_prf_in").get_text()
    #author_cite_links.append(author)
    citation_links = [link['data-href'] for link in gscholar_soup.select("a.gsc_a_at")]
    
    #get the IDs from the author_cite_lists 
    citation_id_regex = re.compile("citation_for_view=(.{25})")
    citation_ids = [m.group(1) for c in citation_links for m in [citation_id_regex.search(c)] if m]
    
    return author, citation_ids

In [457]:
def get_citations(author, cids):
    """
    Takes a list that contains the author and her citation page links. Opens and scrapes
    citation pages and builds a dictionary of the author and citations. Returns the dict.
    
    :param links: a list containing author and another list of citation pages.
    :returns: returns a dictionary with the author and their citations.
    """
        
    author_bib = []
    
    for l in cids: 
        print(l)
        #citation_for_view=igIKOZcAAAAJ:tntj4plCNvAC
        search_params = {'view_op': 'view_citation',
                         'h1': 'en',
                         'citation_for_view':l}
        cite_page = requests.get(BASE_URL, params=search_params)
        #print(cite_page.url)
        cpage_soup = BeautifulSoup(cite_page.text, 'lxml')
        #link & title = a gsc_vcd_title_link
        #authors = div gsc_vcd_value
        #date = div gsc_vcd_value
        #need to filter out patents and filter in by a group of words
        if cpage_soup.select_one('div#gsc_ocd_view').text in ['Investors', 'Conference']:
            continue 
        else:
            citation = {}
            citation['title'] = cpage_soup.select_one("a.gsc_vcd_title_link").text
            citation['url'] = cpage_soup.select_one("a.gsc_vcd_title_link")['href']
            citation['sla_author'] = author
            citation['compact_authors'] = cpage_soup.select('div.gsc_vcd_merged_snippet div')[1].text.replace(u'\xa0', u'')
            citation['authors'] = cpage_soup.select('div.gsc_vcd_value')[0].text
            citation['date'] = cpage_soup.select('div.gsc_vcd_value')[1].text
            citation['publication'] = cpage_soup.select('div.gsc_vcd_value')[2].text
        
        time.sleep(3)

        author_bib.append(citation)
    return author_bib

In [446]:
def write_out_csv(citations):
    '''Takes a list of dictionaries and writes out as a csv file'''
    
    filename = "sla_bib.csv"
    
    if os.path.exists(filename):
        append_write = 'a' # append if already exists
    else:
        append_write = 'w' # make a new file if not
    
    with open('sla-bibliographies.csv', append_write) as sb:

        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        
        for d in cites:
            writer.writerow(d)

### run the code below
1. start webdriver, it will stay active till you driver.quit() 
  * TODO: run headless 
2. run loop that will grab uids from list

In [461]:
driver = webdriver.Firefox()

In [None]:
for uid in get_users()[:2]:
    print(uid)
    page = get_full_user_page(uid)
    #show_all_articles on profile page
    author, citation_ids = get_author_citations_links(page)
    citations = get_citations(author, citation_ids)
    write_out_csv(citations)

v9rIqPQAAAAJ
v9rIqPQAAAAJ:u5HHmVD_uO8C
v9rIqPQAAAAJ:u-x6o8ySG0sC
v9rIqPQAAAAJ:d1gkVwhDpl0C
v9rIqPQAAAAJ:9yKSN-GCB0IC
v9rIqPQAAAAJ:2osOgNQ5qMEC
v9rIqPQAAAAJ:UeHWp8X0CEIC
v9rIqPQAAAAJ:qjMakFHDy7sC
v9rIqPQAAAAJ:LkGwnXOMwfcC
v9rIqPQAAAAJ:IjCSPb-OGe4C
v9rIqPQAAAAJ:zYLM7Y9cAGgC
v9rIqPQAAAAJ:Y0pCki6q_DkC
v9rIqPQAAAAJ:Tyk-4Ss8FVUC
v9rIqPQAAAAJ:W7OEmFMy1HYC
v9rIqPQAAAAJ:YsMSGLbcyi4C
v9rIqPQAAAAJ:eQOLeE2rZwMC
v9rIqPQAAAAJ:lmc2jWPfTJgC
v9rIqPQAAAAJ:WF5omc3nYNoC
v9rIqPQAAAAJ:3fE2CSJIrl8C
v9rIqPQAAAAJ:0EnyYjriUFMC
v9rIqPQAAAAJ:ufrVoPGSRksC
v9rIqPQAAAAJ:_FxGoFyzp5QC
v9rIqPQAAAAJ:Se3iqnhoufwC
v9rIqPQAAAAJ:UebtZRa9Y70C
v9rIqPQAAAAJ:roLk4NBRz8UC
v9rIqPQAAAAJ:5nxA0vEk-isC
v9rIqPQAAAAJ:hqOjcs7Dif8C
v9rIqPQAAAAJ:pqnbT2bcN3wC
v9rIqPQAAAAJ:MXK_kJrjxJIC
v9rIqPQAAAAJ:8k81kl-MbHgC
v9rIqPQAAAAJ:kNdYIx-mwKoC
v9rIqPQAAAAJ:Zph67rFs4hoC
v9rIqPQAAAAJ:KlAtU1dfN6UC
v9rIqPQAAAAJ:YOwf2qJgpHMC
v9rIqPQAAAAJ:4TOpqqG69KYC
v9rIqPQAAAAJ:URolC5Kub84C
v9rIqPQAAAAJ:_kc_bZDykSQC
v9rIqPQAAAAJ:ULOm3_A8WrAC
v9rIqPQAAAAJ:zLWjf1WUPmwC

In [460]:
driver.quit()

In [432]:
cites = get_citations(author, citation_list[:10])

In [209]:
author, citation_list = get_author_citations_links(page)

In [312]:
keywords_to_include = ['renewable energy', 'electric vehicles', 'battery', 'energy storage', 'EV', 'solar power',
'urban', 'North America', 'avian', 'biodiversity', 'environment', 'climate', 'solar energy', 'thermal energy', 
'electronics', 'photonics', 'batteries', 'solar', 'energy', 'urban*', 'development'] 

include_words = '''los angeles, california, environment, water
renewable, energy, algal, biofuels, biomass, fuel
climate, california, biodiversity, north america*, native
avian, biodiversity, environment, climate
transportation networks, ZEV, zero emission vehicles, public transit
thermal, battery, storage, energy, solar, wind
solar, film, transparent, power, energy, photovoltaic
aerosols, air pollution, ultrafine particles, air quality, PM2.5, PM10, ambient'''

In [433]:
cites

[{'authors': 'N Shyamsundar, Rajit Gadh',
  'compact_authors': 'N Shyamsundar, R Gadh- Computer-aided design, 2001',
  'date': '2001/8/1',
  'publication': 'Computer-aided design',
  'sla_author': 'Rajit Gadh',
  'title': 'Internet-based collaborative product design with assembly features and virtual design spaces',
  'url': 'https://www.sciencedirect.com/science/article/pii/S0010448501000690'},
 {'authors': 'N Shyamsundar, Rajit Gadh',
  'compact_authors': 'N Shyamsundar, R Gadh- Computer-Aided Design, 2002',
  'date': '2002/9/1',
  'publication': 'Computer-Aided Design',
  'sla_author': 'Rajit Gadh',
  'title': 'Collaborative virtual prototyping of product assemblies over the Internet',
  'url': 'https://www.sciencedirect.com/science/article/pii/S0010448501002044'},
 {'authors': 'Rajit Gadh, Tushar H Dani',
  'compact_authors': 'R Gadh, TH Dani- US Patent 6,629,065, 2003',
  'date': '2003/9/30',
  'publication': 'US',
  'sla_author': 'Rajit Gadh',
  'title': 'Methods and apparata for

In [434]:
fieldnames = list(cites[0].keys())
with open('cites-test.csv', 'w', newline='') as csvfile:
    #fieldnames = ['first_name', 'last_name']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    for d in cites:
        writer.writerow(d)

In [435]:
cat cites-test.csv

compact_authors,title,publication,date,sla_author,url,authors
"N Shyamsundar, R Gadh- Computer-aided design, 2001",Internet-based collaborative product design with assembly features and virtual design spaces,Computer-aided design,2001/8/1,Rajit Gadh,https://www.sciencedirect.com/science/article/pii/S0010448501000690,"N Shyamsundar, Rajit Gadh"
"N Shyamsundar, R Gadh- Computer-Aided Design, 2002",Collaborative virtual prototyping of product assemblies over the Internet,Computer-Aided Design,2002/9/1,Rajit Gadh,https://www.sciencedirect.com/science/article/pii/S0010448501002044,"N Shyamsundar, Rajit Gadh"
"R Gadh, TH Dani- US Patent 6,629,065, 2003",Methods and apparata for rapid computer-aided design of objects in virtual reality and other environments,US,2003/9/30,Rajit Gadh,https://patents.google.com/patent/US6629065B1/en,"Rajit Gadh, Tushar H Dani"
"H Ramamurthy, BS Prabhu, R Gadh, AM Madni- IEEE sensors journal, 2007",Wireless industrial monitoring and control using a smart 

In [430]:
authors = cpage_soup.select('div.gsc_vcd_value')[0].text
date = cpage_soup.select('div.gsc_vcd_value')[1].text
publication = cpage_soup.select('div.gsc_vcd_value')[2].text

print(authors, date, publication)

N Shyamsundar, Rajit Gadh 2001/8/1 Computer-aided design


In [213]:
import re
regex = re.compile('th.s')
l = ['this', 'is', 'just', 'a', 'test']

In [214]:
matches = [string for string in l if re.match(regex, string)]

In [215]:
matches

['this']

In [291]:
search_params = {'view_op': 'view_citation',
                         'h1': 'en',
                         'citation_for_view':'igIKOZcAAAAJ:tntj4plCNvAC'}
cpage = requests.get(BASE_URL, params=search_params)

In [286]:
cpage = requests.get('https://scholar.google.com/citations?view_op=view_citation&hl=en&citation_for_view=igIKOZcAAAAJ:tntj4plCNvAC')

In [292]:
cpage.url

'https://scholar.google.com/citations?citation_for_view=igIKOZcAAAAJ%3Atntj4plCNvAC&view_op=view_citation&h1=en'

In [314]:
c_page_soup = BeautifulSoup(cpage.text, 'lxml')

In [406]:
if cpage_soup.select_one('div#gsc_ocd_view').text in ['Investor', 'Conference']:
    print('yes')

In [315]:
c_page_soup.select('a.gsc_vcd_title_link')[0].text

'Ge/Si nanowire heterostructures as high-performance field-effect transistors'

In [295]:
c_page_soup.select_one('a.gsc_vcd_title_link')['href']

'https://www.nature.com/articles/nature04796'

In [None]:
https://scholar.google.com/citations?view_op=view_citation&hl=en&user=igIKOZcAAAAJ&citation_for_view=igIKOZcAAAAJ:tntj4plCNvAC

In [343]:
'N Shyamsundar, R Gadh\xa0- Computer-aided design, 2001'.replace(u'\xa0', u'')

'N Shyamsundar, R Gadh- Computer-aided design, 2001'

In [351]:
bibdict = {'Authors': 'SC-Y Lu, M Shpitalni, Rajit Gadh',
  'Issue': '2',
  'Journal': 'CIRP Annals',
  'Pages': '471-495',
  'Publication date': '1999/1/1',
  'Volume': '48',
  'compact_authors': 'SCY Lu, M Shpitalni, R Gadh- CIRP Annals, 1999',
  'sla_author': 'Rajit Gadh',
  'title': 'Virtual and augmented reality technologies for product realization',
  'url': 'https://www.sciencedirect.com/science/article/pii/S0007850607632296'}

In [357]:
w = csv.writer(open("output.csv", "w"))
for key, val in bibdict.items():
    w.writerow([key, val])

In [358]:
!ls

README.md           geckodriver.log     practice.ipynb      sla-citations.ipynb
example.html        output.csv          sla-authors.txt


In [359]:
!cat output.csv

url,https://www.sciencedirect.com/science/article/pii/S0007850607632296
compact_authors,"SCY Lu, M Shpitalni, R Gadh- CIRP Annals, 1999"
Publication date,1999/1/1
Volume,48
Issue,2
sla_author,Rajit Gadh
title,Virtual and augmented reality technologies for product realization
Pages,471-495
Authors,"SC-Y Lu, M Shpitalni, Rajit Gadh"
Journal,CIRP Annals


In [356]:
type(bibdict)

dict

In [361]:
csv.

Object `tell` not found.


In [362]:
import csv
toCSV = [{'name':'bob','age':25,'weight':200},
         {'name':'jim','age':31,'weight':180}]
keys = toCSV[0].keys()

In [370]:
list(keys)

['weight', 'age', 'name']