In [1]:
import requests
from bs4 import BeautifulSoup

subject_search_words = "Climate Change"
search_word = "+".join(subject_search_words.lower().split(" "))
url = f"https://open.umn.edu/opentextbooks/textbooks?q={search_word}"

# Making a GET request
r = requests.get(url)

print(r)

# Parsing the HTML
soup = BeautifulSoup(r.content, 'html.parser')
entries = soup.find_all('entry')
entries_obj = []
for entry in entries:
    title = entry.find("title").text
    textbook_page_url = entry.find("link")["href"]
    entries_obj.append({"title": title, "textbook_page_url":textbook_page_url})
                       
print(entries_obj)

<Response [200]>
[{'title': 'Bending the Curve: Climate Change Solutions', 'textbook_page_url': 'https://open.umn.edu/opentextbooks/textbooks/bending-the-curve-climate-change-solutions'}, {'title': 'Community Resilience to Climate Change: Theory, Research and Practice', 'textbook_page_url': 'https://open.umn.edu/opentextbooks/textbooks/community-resilience-to-climate-change-theory-research-and-practice-hellman'}, {'title': 'Negotiating Climate Change in Crisis', 'textbook_page_url': 'https://open.umn.edu/opentextbooks/textbooks/negotiating-climate-change-in-crisis'}, {'title': 'Introduction to Climate Science - 1st Edition', 'textbook_page_url': 'https://open.umn.edu/opentextbooks/textbooks/introduction-to-climate-science-1st-edition-schmittner'}, {'title': 'Climate Toolkit: A Resource Manual for Science and Action - Version 2.0', 'textbook_page_url': 'https://open.umn.edu/opentextbooks/textbooks/climate-toolkit-a-resource-manual-for-science-and-action-granshaw'}, {'title': 'Permacultu

  k = self.parse_starttag(i)


In [None]:
soup = BeautifulSoup(response.content, 'html.parser')
print(soup.prettify())

# Scraping Google Scholar

In [3]:
subject = "climate change"
start_page = 0
start_year = 2020

subject_str = "+".join(subject.split(" "))

url = f"https://scholar.google.com/scholar?start={start_page*10}&q={subject_str}&hl=en&as_sdt=0,6&as_ylo={start_year}"
r = requests.get(url)

soup = BeautifulSoup(r.content, 'html.parser')
pdf_links_elements = soup.find_all("div", class_="gs_or_ggsm")
pdf_title_elements = soup.find_all("h3", class_="gs_rt")

# print(soup.prettify())
pdf_links, pdf_titles = [], []

for link_e, title_e in zip(pdf_links_elements, pdf_title_elements):
    link = link_e.find("a")["href"]
    title = title_e.find("a").text
    pdf_links.append(link)
    pdf_titles.append(title)

print(pdf_links, pdf_titles)

['https://link.springer.com/article/10.1007/s11356-022-19718-6', 'https://onlinelibrary.wiley.com/doi/am-pdf/10.1111/gcb.15569', 'https://edisciplinas.usp.br/pluginfile.php/8026068/mod_resource/content/1/REF01.pdf', 'https://www.ipcc.ch/report/ar6/wg1/downloads/report/IPCC_AR6_WGI_FrontMatter.pdf', 'https://www.pnas.org/doi/full/10.1073/pnas.2108146119?utm_campaign=Hot+News&utm_medium=email&_hsmi=221546516&_hsenc=p2ANqtz-87GNCqvTbaZKuJmpWreWTET3Au_1Kf1C3kEQZyOaR4MWXITkEIkggztscB5ZGXCZphpSiCTy0AOpQ_1I0iLebtCt82oQ&utm_content=221546516&utm_source=hs_email', 'https://link.springer.com/article/10.1007/s10311-020-01059-w', 'https://www.thelancet.com/journals/lanplh/article/PIIS2542-5196(21)00278-3/fulltext?ref=f-zin.faktograf.hr', 'https://www.nature.com/articles/s41579-021-00639-z.'] ['A review of the global climate change impacts, adaptation, and sustainable mitigation measures', 'Forest microclimates and climate change: Importance, drivers and future research agenda', 'Renewable energy a

In [4]:
def get_entries_at_page(start_page, subject, start_year):
    
    subject_str = "+".join(subject.lower().split(" "))
    url = f"https://scholar.google.com/scholar?start={start_page*10}&q={subject_str}&hl=en&as_sdt=0,6&as_ylo={start_year}"
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    
    entries_e = soup.find_all("div", class_="gs_r gs_or gs_scl")
    entries = []
    
    for entry_e in entries_e:
        link_e = entry_e.find("div", class_="gs_or_ggsm")
        if not link_e:
            continue
        file_type_str = link_e.find("span").text
        title_e = entry_e.find("h3", class_="gs_rt")
        link = link_e.find("a")["href"]
        title = title_e.find("a").text
        if "HTML" in file_type_str: file_type = "HTML"
        if "PDF" in file_type_str: file_type = "PDF"
        entries.append({"title": title, "file_type":file_type, "link":link})

    return entries

In [5]:
from tqdm import tqdm

def get_entries(max_page, subject, start_year):
    entries = []
    for start_page in tqdm(range(max_page)):
    
        entries_page = get_entries_at_page(start_page, subject, start_year)
        entries.extend(entries_page)
    
    return entries

In [6]:
import pandas as pd

entries = get_entries(10, "climate change", 2020)
entries_df = pd.DataFrame.from_records(entries)
entries_df["file_type"].value_counts()

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:09<00:00,  1.02it/s]


file_type
PDF     49
HTML    44
Name: count, dtype: int64

In [11]:
groups = entries_df.groupby("file_type")

html_files = list(groups)[0][1]
html_files.head()

Unnamed: 0,title,file_type,link
0,"A review of the global climate change impacts,...",HTML,https://link.springer.com/article/10.1007/s113...
4,Climate Endgame: Exploring catastrophic climat...,HTML,https://www.pnas.org/doi/full/10.1073/pnas.210...
5,Strategies for mitigation of climate change: a...,HTML,https://link.springer.com/article/10.1007/s103...
6,Climate anxiety in children and young people a...,HTML,https://www.thelancet.com/journals/lanplh/arti...
7,Infectious disease in an era of global change,HTML,https://www.nature.com/articles/s41579-021-006...


In [21]:
websites = [link.replace("https://", "").replace("http://", "").split("/")[0] for link in html_files["link"]]
websites_unique = list(set(websites))
len(websites), websites_unique

(44,
 ['www.cell.com',
  'journals.sagepub.com',
  'www.frontiersin.org',
  'royalsocietypublishing.org',
  'link.springer.com',
  'www.nejm.org',
  'www.mdpi.com',
  'journals.ametsoc.org',
  'academic.oup.com',
  'www.science.org',
  'www.ncbi.nlm.nih.gov',
  'www.nature.com',
  'www.annualreviews.org',
  'iopscience.iop.org',
  'www.pnas.org',
  'www.sciencedirect.com',
  'www.thelancet.com'])