In [19]:
!git clone https://github.com/dimitryzub/scrape-google-scholar.git

fatal: destination path 'scrape-google-scholar' already exists and is not an empty directory.


In [20]:
!pip install -r scrape-google-scholar/requirements.txt



## Import 

In [1]:
from selenium import webdriver
from selenium_stealth import stealth
from selenium.webdriver.chrome.service import Service
from selectolax.lexbor import LexborHTMLParser
import os, json
from typing import List, Dict, Callable
import time, random, re
import pandas as pd

import requests
import io
from io import BytesIO
from io import StringIO

import requests

import re
from bs4 import BeautifulSoup

import time 

In [2]:
def parse(parser: Callable, organic_results_data: Callable):
    '''
    Arugments:
    - parser:  Lexbor parser from scrape_google_scholar_organic_results() function.
    - organic_results_data: List to append data to. List origin location is scrape_google_scholar_organic_results() function. Line 104.
    
    This function parses data from Google Scholar Organic results and appends data to a List.
    
    It's used by scrape_google_scholar_organic_results().
    
    It returns nothing as it appends data to `organic_results_data`, 
    which appends it to `organic_results_data` List in the scrape_google_scholar_organic_results() function.
    '''
    
    for result in parser.css('.gs_r.gs_or.gs_scl'):
        try:
            title: str = result.css_first('.gs_rt').text()
        except: title = None

        try:
            title_link: str = result.css_first('.gs_rt a').attrs['href']
        except: title_link = None

        try:
            publication_info: str = result.css_first('.gs_a').text()
        except: publication_info = None

        try:
            snippet: str = result.css_first('.gs_rs').text()
        except: snippet = None

        try:
            # if Cited by is present in inline links, it will be extracted
            cited_by_link = ''.join([link.attrs['href'] for link in result.css('.gs_ri .gs_fl a') if 'Cited by' in link.text()])
        except: cited_by_link = None
        
        try:
            # if Cited by is present in inline links, it will be extracted and type cast it to integer
            cited_by_count = int(''.join([re.search(r'\d+', link.text()).group() for link in result.css('.gs_ri .gs_fl a') if 'Cited by' in link.text()]))
        except: cited_by_count = None
        
        try:
            pdf_file: str = result.css_first('.gs_or_ggsm a').attrs['href']
        except: pdf_file = None

        organic_results_data.append({
            'title': title,
            'title_link': title_link,
            'publication_info': publication_info,
            'snippet': snippet if snippet else None,
            'cited_by_link': f'https://scholar.google.com{cited_by_link}' if cited_by_link else None,
            'cited_by_count': cited_by_count if cited_by_count else None,
            'pdf_file': pdf_file
        })

def scrape_google_scholar_organic_results(
                                        query: str,
                                        pagination: bool = False,
                                        operating_system: str = 'Windows' or 'Linux',
                                        year_start: int = None,
                                        year_end: int = None,
                                        save_to_csv: bool = False, 
                                        save_to_json: bool = False
                                        ) -> List[Dict[str, str]]:
    '''
    Extracts data from Google Scholar Organic resutls page:
    - title: str
    - title_link: str
    - publication_info: str 
    - snippet: str
    - cited_by_link: str 
    - cited_by_count: int
    - pdf_file: str
    
    Arguments:
    - query: str. Search query. 
    - pagination: bool. Enables or disables pagination.
    - operating_system: str. 'Windows' or 'Linux', Checks for operating system to either run Windows or Linux verson of chromedriver
    
    Usage:
    data = scrape_google_scholar_organic_results(query='blizzard', pagination=False, operating_system='win') # pagination defaults to False 
    
    for organic_result in data:
        print(organic_result['title'])
        print(organic_result['pdf_file'])
    '''
    assert year_start and year_end
    if year_start or year_end:
        assert year_start and year_end
        assert year_start <= year_end

    # selenium stealth
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    options.add_experimental_option('useAutomationExtension', False)
    
    # checks for operating system to either run Windows or Linux verson of chromedriver
    # expects to have chromedriver near the runnable file
    if operating_system is None:
        raise Exception('Please provide your OS to `operating_system` argument: "Windows" or "Linux" for script to operate.')
    
    if operating_system.lower() == 'windows' or 'win':
        driver = webdriver.Chrome(options=options, service=Service(executable_path='chromedriver.exe'))
    
    if operating_system.lower() == 'linux': 
        driver = webdriver.Chrome(options=options, service=Service(executable_path='chromedriver'))
    
    stealth(driver,
        languages=['en-US', 'en'],
        vendor='Google Inc.',
        platform='Win32',
        webgl_vendor='Intel Inc.',
        renderer='Intel Iris OpenGL Engine',
        fix_hairline=True,
    )
    
    page_num: int = 0
    organic_results_data: list = []
    
    if pagination:
        while True:
            # parse all pages
            driver.get(f'https://scholar.google.com/scholar?q={query}&hl=en&gl=us&start={page_num}&as_ylo={year_start}&as_yhi={year_end}')
            parser = LexborHTMLParser(driver.page_source)
            
            parse(parser=parser, organic_results_data=organic_results_data)
            
            # pagination
            if parser.css_first('.gs_ico_nav_next'):  # checks for the "Next" page button
                page_num += 10                        # paginate to the next page
                time.sleep(random.randint(1, 2))      # sleep between paginations
            else:
                break
    else:
        # parse single, first page
        driver.get(f'https://scholar.google.com/scholar?q={query}&hl=en&gl=us&start={page_num}')
        parser = LexborHTMLParser(driver.page_source)
    
        parse(parser=parser, organic_results_data=organic_results_data)
        
    if save_to_csv:
        pd.DataFrame(data=organic_results_data).to_csv('google_scholar_organic_results_data.csv', 
                                                        index=False, encoding='utf-8')
    if save_to_json:
        pd.DataFrame(data=organic_results_data).to_json('google_scholar_organic_results_data.json', 
                                                        index=False, orient='records')
    driver.quit()
    return organic_results_data

## Add target column from neuromab database

In [3]:
def AddTargetCol(df,final_df):
            
    merged_df = final_df.merge(df[['Clone', 'Target']], left_on=['Antibody'],right_on=['Clone'], how='left')
    merged_df = merged_df.drop('Clone', axis=1)
   
    return merged_df

## Grab Neuromab DataFrame from site

In [4]:
data = requests.get("https://neuromab.ucdavis.edu/catalog-download.cfm").content
df = pd.read_csv(BytesIO(data))

## Grab Catalog df for query creation

In [5]:
df2 = pd.read_excel('/Users/HMans_MacBook_Pro/Library/Mobile Documents/com~apple~CloudDocs/Desktop/TrimmerLab/runresults.xlsx')

## Create URL

In [6]:
def CreatURL(query):
    
    query1 = query[0:8]
    end_pos = query.find('/', 9)
    query2= query[9:end_pos]
    query3= query[end_pos+1:]
    
    url = 'https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q=' + (query1 + '+' + query2 + '%' + '2F' + query3 + '&oq=neuromab+')

    return url

## Create Dataframe function 

In [7]:
def CreateDF(query,result_value,data):
    
    # Record the difference between search result # and data length 
    count = (int(result_value)) - (int({len(data)}.pop()))
    final_df = pd.DataFrame(data)
    
    # Get rid of extraneous strings from title 
    final_df['title'] = final_df['title'].str.replace('\[HTML\]\[HTML\]|\[PDF\]\[PDF\]|\[CITATION\]|\[C\]|\[BOOK\]\[BOOK\]|\[B\]|\[BOOK\]\[B\]', '')

    # Creat file for data 
    file_query = query.replace("/","_")
    file_query = file_query[0:8]+file_query[9:]
    filename = file_query + ".csv"
    directory = '/Users/HMans_MacBook_Pro/Library/Mobile Documents/com~apple~CloudDocs/Desktop/TrimmerLab/picklequeries'

    # Input Antibody name into table 
    final_df["Antibody"] = [query[9:]] * len(final_df)
    final_df = AddTargetCol(df,final_df)
    final_df.to_csv(os.path.join(directory, filename), index=False)
    
    # Input different result string based on difference in count 
    if count == 0:
        print(query + ' ' + 'All accounted for : DF saved to file')
        df2.loc[df2['Clone'] == query_name, 'Result'] = 'All accounted for : DF saved to file'

    elif (count < 4) and (count > 0) :
        print(query + ' ' +'There seems to be'+" "+str(count)+' '+"paper(s) w/o years")
        df2.loc[df2['Clone'] == query_name, 'Result'] = 'There seems to be'+" "+str(count)+' '+"paper(s) w/o years"

    else:
        print(query + ' ' +'Something went wrong with Search, too many papers not found')
        df2.loc[df2['Clone'] == query_name, 'Result'] = 'Something went wrong with Search, too many papers not found'

    return final_df

## Scholar Query : search into google scholar and grab data from query 

In [9]:
# Initialize a value to count 0 result pages 
zero_result_counter = 0

for query_name in df2.iloc[109:]['Clone']:
    data = {}
 
    # Set query for google scholar 
    query = 'neuromab' + ' ' + query_name

    # Grab data from google scholar 
    data = scrape_google_scholar_organic_results(query=query, pagination=True, operating_system="Windows", year_start=2005, year_end=2024)

    # create URL to grab results 
    url = CreatURL(query)
    url_info = requests.get(url)
    html = url_info.text

    # Create a BeautifulSoup object from the HTML string
    soup = BeautifulSoup(html, 'html.parser')
    result_elem = soup.select_one('#gs_ab_md .gs_ab_mdw')

    if result_elem:
        result_text = result_elem.get_text(strip=True)
        result_match = re.search(r'(\d+[\d,]*)\s+results', result_text)
        if result_match == None:
            result_match = re.search(r'(\d+[\d,]*)\s+result', result_text)

        if result_match:
            result_value = result_match.group(1)
            final_df = CreateDF(query,result_value,data)
        else:
            print(query + ' ' +"Search - No papers Available")
            df2.loc[df2['Clone'] == query_name, 'Result'] = "Search - No papers Available"
            
    else:
        print(query + ' ' +"No result element , we got dropped")
        df2.loc[df2['Clone'] == query_name, 'Result'] = "No result element , we got dropped"
        zero_result_counter = zero_result_counter + 1
        if zero_result_counter > 2:
            print('Going to bed on' + ' ' + query)
            time.sleep(3600)


KeyError: 'title'

In [9]:
df2.to_excel('/Users/HMans_MacBook_Pro/Library/Mobile Documents/com~apple~CloudDocs/Desktop/TrimmerLab/runresults.xlsx')

In [25]:
final_df

Unnamed: 0,title,title_link,publication_info,snippet,cited_by_link,cited_by_count,pdf_file,Antibody,Target
0,Metabolic regulation of Kv channels and cardia...,https://www.sciencedirect.com/science/article/...,"PJ Kilfoil, KC Chapalamadugu, X Hu, D Zhang… -...",… (NeuroMab; K17/70) primary antibodies in 200...,https://scholar.google.com/scholar?cites=27357...,17.0,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,K17/70,Kvbeta2/KCNAB2 K+ channel
1,Distribution of language‐related Cntnap2 prote...,https://onlinelibrary.wiley.com/doi/abs/10.100...,"MC Condro, SA White - Journal of Comparative N...","… NP_034728), conserved in zebra finch Neuroma...",https://scholar.google.com/scholar?cites=16321...,36.0,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,K17/70,Kvbeta2/KCNAB2 K+ channel
2,Stress-evoked tyrosine phosphorylation of sign...,https://www.jneurosci.org/content/30/31/10472....,"H Ohnishi, T Murata, S Kusakari, Y Hayashi… - ...",… Mouse mAbs to Kvβ2 (K17/70) and to Kv1.4 (K1...,https://scholar.google.com/scholar?cites=10583...,37.0,https://www.jneurosci.org/content/jneuro/30/31...,K17/70,Kvbeta2/KCNAB2 K+ channel
3,Selective loss of presynaptic potassium channe...,https://www.jneurosci.org/content/35/32/11433....,"MJ Kole, J Qian, MP Waase, TL Klassen… - Journ...","A specialized axonal ending, the basket cell “...",https://scholar.google.com/scholar?cites=18110...,22.0,https://www.jneurosci.org/content/jneuro/35/32...,K17/70,Kvbeta2/KCNAB2 K+ channel
4,K+ channel alterations in the progression of e...,https://www.sciencedirect.com/science/article/...,"PI Jukkola, AE Lovett-Racke, SS Zamvil, C Gu -...","… (clone #: K14/16, K17/70, and K13/31, respec...",https://scholar.google.com/scholar?cites=98209...,45.0,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,K17/70,Kvbeta2/KCNAB2 K+ channel
5,An αII spectrin-based cytoskeleton protects la...,https://www.jneurosci.org/content/37/47/11323....,"CYM Huang, C Zhang, DR Zollinger… - Journal of...","Axons must withstand mechanical forces, includ...",https://scholar.google.com/scholar?cites=14284...,53.0,https://www.jneurosci.org/content/jneuro/37/47...,K17/70,Kvbeta2/KCNAB2 K+ channel
6,Identification of voltage-gated K+ channel be...,https://www.sciencedirect.com/science/article/...,"C Bavassano, L Marvaldi, M Langeslag, B Sarg… ...",… Monoclonal anti-Kvβ2 antibody (clone K17/70)...,https://scholar.google.com/scholar?cites=12225...,23.0,https://www.sciencedirect.com/science/article/...,K17/70,Kvbeta2/KCNAB2 K+ channel
7,A toolbox of IgG subclass-switched recombinant...,https://elifesciences.org/articles/43322,"NP Andrews, JX Boeckman, CF Manning, JT Nguyen...",… We thank the current and former staff member...,https://scholar.google.com/scholar?cites=17050...,23.0,https://elifesciences.org/articles/43322.pdf,K17/70,Kvbeta2/KCNAB2 K+ channel
8,mTOR-dependent alterations of Kv1. 1 subunit ...,https://link.springer.com/content/pdf/10.1038/...,"LH Nguyen, AE Anderson - Scientific Reports, 2...",Cortical dysplasia (CD) is a common cause for ...,https://scholar.google.com/scholar?cites=68005...,25.0,https://link.springer.com/content/pdf/10.1038/...,K17/70,Kvbeta2/KCNAB2 K+ channel
9,Characterization of the axon initial segment ...,https://bmcbiol.biomedcentral.com/articles/10....,"A Duflocq, F Chareyre, M Giovannini… - BMC …, ...",The axon initial segment (AIS) plays a crucial...,https://scholar.google.com/scholar?cites=34662...,95.0,https://bmcbiol.biomedcentral.com/articles/10....,K17/70,Kvbeta2/KCNAB2 K+ channel


In [None]:
import os
import pandas as pd
df_list = []
path_to_files = ""
for file in os.listdir(path_to_files):
    query = pd.read_csv(f"{path_to_files}/{file})
    query["query"] = file.split(".")[0]
    df_list.append(pd.read_csv("file"))
final_df = pd.concat(df_list)