Google Scholar and Semantic Scholar scraping. 

In [1]:
# Packages
from bs4 import BeautifulSoup
import requests
import pandas as pd
import regex as re


# Request to Google Scholar.
# The search term is passed through `params` so that requests URL-encodes it;
# the original URL embedded a literal, never-filled "{...}" placeholder in `q=`.
search_query = 'Computational Propaganda'
response = requests.get(
    'https://scholar.google.com/scholar',
    params={'hl': 'en', 'as_sdt': '0,5', 'q': search_query, 'btnG': ''},
    timeout=30,  # requests applies no timeout by default; do not hang the kernel
)
# Stop here on an HTTP error status instead of silently parsing an error page
# into empty result tables in the cells below.
response.raise_for_status()
url = response.url  # URL of the response that was actually received

# Parse the HTML content
soup = BeautifulSoup(response.content, 'html.parser')

In [12]:
# Get the title of the article
results1 = []
for entry in soup.find_all("h3", attrs={"class": "gs_rt"}):
    # A heading without an <a> used to raise AttributeError on `entry.a.text`.
    # Fall back to the heading's own text so the entry still produces a row and
    # the row positions keep matching the other per-result tables.
    if entry.a is not None:
        title = entry.a.text
    else:
        title = entry.get_text(" ", strip=True)
    print(title)
    results1.append({"Title": title})
# Naming the column explicitly keeps "Title" present even when nothing matched.
dataframe1 = pd.DataFrame(results1, columns=["Title"])
dataframe1

Computational propaganda worldwide: Executive summary
A survey on computational propaganda detection
Computational propaganda in Germany: A cautionary tale
Political communication, computational propaganda, and autonomous agents: Introduction
Computational propaganda: Political parties, politicians, and political manipulation on social media
Computational propaganda in the United States of America: Manufacturing consensus online
Computational propaganda in Brazil: Social bots during elections
Computational propaganda and political big data: Moving toward a more critical research agenda
Computational propaganda in Russia: The origins of digital misinformation
Computational propaganda: If you make it trend, you make it true


Unnamed: 0,Title
0,Computational propaganda worldwide: Executive ...
1,A survey on computational propaganda detection
2,Computational propaganda in Germany: A caution...
3,"Political communication, computational propaga..."
4,"Computational propaganda: Political parties, p..."
5,Computational propaganda in the United States ...
6,Computational propaganda in Brazil: Social bot...
7,Computational propaganda and political big dat...
8,Computational propaganda in Russia: The origin...
9,Computational propaganda: If you make it trend...


In [13]:
# Get Authors, year, journal
# Each "gs_a" byline is assumed to have the shape
#   "<authors> - <venue>, <year> - <source>"
# with sections separated by a hyphen surrounded by whitespace
# (NOTE(review): confirm against the live page markup).

# Section separator: a hyphen with whitespace on both sides. `\s` also matches a
# non-breaking space, and hyphens inside names or domains are left alone.
byline_separator = re.compile(r'\s+-\s+')
# A plausible publication year: a standalone 4-digit number starting with 19 or 20.
year_pattern = re.compile(r'\b(?:19|20)[0-9]{2}\b')

results2 = []
for entry in soup.find_all("div", attrs={"class": "gs_a"}):
    byline = entry.get_text().strip()
    sections = byline_separator.split(byline)
    has_sections = len(sections) > 1
    # Take the LAST year-like number: the previous '[0-9]+' lookup returned the
    # first run of digits anywhere in the byline, whatever it was.
    years_found = year_pattern.findall(byline)
    results2.append({
        # First section = authors, last section = source. The previous
        # '[^-]+$' lookup cut the source at its last hyphen.
        "Author": sections[0] if has_sections else None,
        "Year": years_found[-1] if years_found else None,
        "Journal": sections[-1] if has_sections else None,
    })

# Same columns, in the same order, as before: Author, Year, Journal.
dataframe2 = pd.DataFrame(results2, columns=["Author", "Year", "Journal"])

In [14]:
# Get number of citations for each paper
results3 = []
for entry in soup.find_all("div", class_="gs_ri"):
    footer = entry.find("div", class_="gs_fl")
    footer_links = footer.find_all("a") if footer is not None else []

    # Locate the citation link by what it is rather than by position: the
    # previous fixed index [2] raised IndexError on short footers and could
    # read the digits of whatever other link happened to sit at that position.
    # NOTE(review): assumes the citation link has "cites=" in its href or text
    # starting with "Cited by" -- confirm against the live page markup.
    citation_count = 0
    for link in footer_links:
        link_text = link.get_text().strip()
        if "cites=" in link.get("href", "") or link_text.startswith("Cited by"):
            digits = re.search('[0-9]+', link_text)
            if digits:
                citation_count = int(digits.group())
            break
    results3.append({"Number of citation": citation_count})

# Counts are stored as integers, with 0 for papers without a citation link.
# The previous `.str.get(0)` clean-up turned the integer 0 fallback into NaN.
dataframe3 = pd.DataFrame(results3, columns=["Number of citation"])

In [15]:
# Get the link
results4 = []

for entry in soup.find_all('h3', class_ = 'gs_rt'):
    anchor = entry.a
    # A heading without an <a> used to raise TypeError on `entry.a['href']`.
    # Record a missing link instead so every heading still yields one row.
    link = anchor.get('href') if anchor is not None else None
    results4.append({"Link": link})
# Naming the column explicitly keeps "Link" present even when nothing matched.
dataframe4 = pd.DataFrame(results4, columns=["Link"])

In [16]:
# Assemble the final table: the per-field frames are placed side by side
# (matched on their row index) and the result is saved as a tab-separated file.
field_frames = [dataframe1, dataframe2, dataframe3, dataframe4]
dataframe = pd.concat(objs=field_frames, axis="columns")
dataframe.to_csv(path_or_buf='Google Scholar - publications', sep='\t')
dataframe

Unnamed: 0,Title,Author,Year,Journal,Number of citation,Link
0,Computational propaganda worldwide: Executive ...,"SC Woolley, P Howard",2017,ora.ox.ac.uk,230,https://ora.ox.ac.uk/objects/uuid:d6157461-aef...
1,A survey on computational propaganda detection,"GDS Martino, S Cresci, A Barrón-Cedeño, S Yu…",2020,arxiv.org,97,https://arxiv.org/abs/2007.08024
2,Computational propaganda in Germany: A caution...,LMN Neudert,2017,demtech.oii.ox.ac.uk,48,https://demtech.oii.ox.ac.uk/wp-content/upload...
3,"Political communication, computational propaga...","SC Woolley, PN Howard",2016,par.nsf.gov,249,https://par.nsf.gov/biblio/10021331
4,"Computational propaganda: Political parties, p...","SC Woolley, PN Howard",2018,books.google.com,463,https://books.google.com/books?hl=en&lr=&id=qT...
5,Computational propaganda in the United States ...,"SC Woolley, D Guilbeault",2017,ora.ox.ac.uk,129,https://ora.ox.ac.uk/objects/uuid:620ce18f-69e...
6,Computational propaganda in Brazil: Social bot...,D Arnaudo,2017,ora.ox.ac.uk,135,https://ora.ox.ac.uk/objects/uuid:e88de32c-baa...
7,Computational propaganda and political big dat...,"G Bolsover, P Howard",2017,liebertpub.com,97,https://www.liebertpub.com/doi/full/10.1089/bi...
8,Computational propaganda in Russia: The origin...,S Sanovich,2017,ora.ox.ac.uk,116,https://ora.ox.ac.uk/objects/uuid:555c1e20-60d...
9,Computational propaganda: If you make it trend...,R DiResta,2018,muse.jhu.edu,30,https://muse.jhu.edu/pub/1/article/791857/summary


In [17]:
# Extract suggested results
# Every <li> inside a suggestion box becomes one row of the table.
suggestion_boxes = soup.find_all(
    "div", attrs={"class": "gs_qsuggest gs_qsuggest_regular"}
)
results_suggestion = [
    {"Suggestion": list_item.text}
    for box in suggestion_boxes
    for list_item in box.find_all("li")
]
dataframe_suggestion = pd.DataFrame(results_suggestion)
# Saved tab-separated, with the row index as the first column.
dataframe_suggestion.to_csv(path_or_buf='Google Scholar - suggestions', sep='\t')
dataframe_suggestion

Unnamed: 0,Suggestion
0,computational propaganda project
1,chinese computational propaganda
2,oxford computational propaganda
3,computational propaganda political communication
4,computational propaganda autonomous agents
5,computational propaganda worldwide
6,computational propaganda uk eu referendum
7,computational propaganda bots
