# GNET Insight Scraper

Code to scrape GNET insight post contents and relevant metadata from posts on GNET's research blog at https://gnet-research.org

In [None]:
import requests
from bs4 import BeautifulSoup
import typing as t
import pickle
from datetime import datetime
import json
from os import walk
import pandas as pd

## Scrape URLs of each insight post from the Insights index pages

In [None]:
def get_insight_urls(page_url: str, timeout: float = 30.0) -> t.List[str]:
    """Fetch one Insights index page and return the post URLs it links to.

    Args:
        page_url: URL of the index page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The ``href`` values of all elements with class ``link-to-post``, in
        page order with duplicates removed.

    Raises:
        requests.exceptions.RequestException: If the request fails or times
            out.
    """
    print(f"Fetching: {page_url}")
    search_class = "link-to-post"
    page = requests.get(page_url, timeout=timeout)
    bs = BeautifulSoup(page.content, "html.parser")
    elements = bs.find_all(class_=search_class)
    # Skip matching elements that carry no href instead of raising KeyError.
    href = [x["href"] for x in elements if x.has_attr("href")]
    href = list(dict.fromkeys(href))        # De-duplicate URLs, keeping order
    print(f"Got {len(href)} insight URLs")
    return href

def scrape_insight_urls(total_pages: int):
    """Collect the unique insight post URLs from index pages 1..total_pages.

    Index pages follow the pattern
    https://gnet-research.org/resources/insights/page/<n>, e.g. .../page/10.

    Returns a list of URLs in first-seen order with duplicates removed.
    """
    base = "https://gnet-research.org/resources/insights/page/"
    # A dict keeps insertion order, so its keys act as an ordered set.
    seen = {}
    for page_number in range(1, total_pages + 1):
        for url in get_insight_urls(base + str(page_number)):
            seen.setdefault(url, None)
    return list(seen)

# Crawl the Insights index pages and keep the de-duplicated post URLs.
# NOTE(review): 18 presumably matched the number of index pages on the site
# when this was written — confirm before re-running.
insight_urls = scrape_insight_urls(total_pages=18)

# Report how many unique post URLs were collected.
print(len(insight_urls))

## Scrape and extract data content from each insight post

Insight data contents will be saved to individual pickle files to facilitate pausing and restarting downloads partway through the scraping process.

In [None]:
# Content to scrape from each page:

# Insight URL
# Title
# Authors
# Author URLs
# Pub date
# Category
# Text
# URLs linked from within the insight text
# Tags

def extract_authors(bs: BeautifulSoup):
    """Collect the name and link of every author listed on the page.

    Authors are the elements matched by the class string "author url fn".
    For each one, the name is its first child node and the URL is its
    ``href`` attribute.

    Returns a dict with two parallel lists, "author_names" and "author_urls".
    """
    pairs = [
        (element.contents[0], element["href"])
        for element in bs.find_all(class_="author url fn")
    ]
    return {
        "author_names": [name for name, _ in pairs],
        "author_urls": [url for _, url in pairs],
    }

def extract_title(bs: BeautifulSoup):
    """Return the post title as ``{"title": <str>}``.

    The title is the first child node of the first element with class
    "entry-title"; non-breaking spaces are replaced by plain spaces and
    surrounding whitespace is stripped.
    """
    heading = bs.find(class_="entry-title")
    cleaned = heading.contents[0].replace('\xa0', ' ').strip()
    return {"title": cleaned}

def extract_pub_date(bs):
    """Return the publication date as ``{"pub_date": <str>}``.

    The value is the raw ``datetime`` attribute of the first ``<time>``
    element on the page; it is not parsed here.
    """
    return {"pub_date": bs.find("time")["datetime"]}

def extract_categories(bs: BeautifulSoup):
    """Return the post's categories as ``{"categories": [...]}``.

    Each category is the first child node of an ``<a>`` found inside the
    first element with class "categories".
    """
    container = bs.find(class_="categories")
    names = [link.contents[0] for link in container.find_all("a")]
    return {"categories": names}

def extract_text_and_urls(bs: BeautifulSoup):
    """Extract the post body text and the links it contains.

    The body is read from the sibling elements that follow the element with
    class "mailmunch-forms-before-post", stopping at (and excluding) the
    first sibling with class "mailmunch-forms-after-post".

    Returns:
        A dict with "insight_text" (one cleaned string per body element) and
        "insight_urls" (the ``href`` of every anchor inside those elements,
        in document order). Both lists are empty when the start marker is
        not present on the page.
    """
    # Local import: the file's import cell only brings in BeautifulSoup.
    from bs4 import Tag

    insight_text = []
    insight_urls = []

    mailmunch_elem = bs.find(class_="mailmunch-forms-before-post")
    if mailmunch_elem is None:
        return {"insight_text": insight_text, "insight_urls": insight_urls}

    for sib in mailmunch_elem.next_siblings:
        # next_siblings also yields bare strings (whitespace between tags,
        # comments); those have no has_attr()/find_all(), so skip them.
        if not isinstance(sib, Tag):
            continue
        if sib.has_attr('class') and 'mailmunch-forms-after-post' in sib["class"]:
            break

        text_string = sib.get_text().replace('\xa0', ' ').strip()
        insight_text.append(text_string)
        # href=True restricts the match to anchors that carry an href.
        insight_urls.extend(a["href"] for a in sib.find_all("a", href=True))

    return {"insight_text": insight_text, "insight_urls": insight_urls}

def extract_tags(bs):
    """Return the post's tags as ``{"tags": [...]}``.

    Tags are the ``.string`` of each ``<a>`` inside the first element with
    class "meta-info-container"; the list is empty when that element is
    absent.
    """
    container = bs.find(class_="meta-info-container")
    if container is None:
        return {"tags": []}
    return {"tags": [anchor.string for anchor in container.find_all("a")]}

def scrape_insight_data(insight_url: str, timeout: float = 30.0):
    """Download one insight post and extract its content and metadata.

    Args:
        insight_url: URL of the insight post to scrape.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` waits indefinitely on a stalled connection.

    Returns:
        A dict holding "insight_url" plus every key produced by the
        extract_* helpers (title, authors, publication date, categories,
        body text and links, tags).

    Raises:
        requests.exceptions.RequestException: If the request fails or times
            out.
    """
    print(f"Scraping URL: {insight_url}")
    page = requests.get(insight_url, timeout=timeout)
    bs = BeautifulSoup(page.content, "html.parser")

    data_map = {"insight_url": insight_url}
    # Order matters only for the key order of the resulting record.
    extractors = (
        extract_title,
        extract_authors,
        extract_pub_date,
        extract_categories,
        extract_text_and_urls,
        extract_tags,
    )
    for extract in extractors:
        data_map.update(extract(bs))
    return data_map

def write_data_file(data, directory="data/"):
    """Persist one scraped insight record to its own pickle file.

    The record is JSON-encoded and the resulting string is pickled, so a
    reader must ``pickle.load`` the file and then ``json.loads`` the result.
    The file is named ``<title>_<unix timestamp>.pickle``, with "/" in the
    title replaced by "_".

    Args:
        data: JSON-serialisable dict containing at least a "title" string.
        directory: Output directory; created if it does not exist.

    Raises:
        TypeError: If ``data`` cannot be JSON-encoded.
    """
    import os

    os.makedirs(directory, exist_ok=True)
    title = data["title"].replace("/", "_")
    timestamp = round(datetime.now().timestamp())
    filename = os.path.join(directory, f"{title}_{timestamp}.pickle")
    # Serialise before opening the file so a failure here cannot leave an
    # empty .pickle file behind.
    json_data = json.dumps(data)
    with open(filename, 'wb') as outfile:
        pickle.dump(json_data, outfile)
    print(f"Wrote data to file: {filename}")

# Scrape every collected insight URL and save each record as soon as it is
# fetched. Start at index 0 so the first URL is not skipped; to resume an
# interrupted run, set start_index to the position of the first unsaved URL.
start_index = 0
for insight_url in insight_urls[start_index:]:
    data = scrape_insight_data(insight_url)
    write_data_file(data)

## Load the saved pickle files into a dataframe, do initial data cleanup, and save to the dataset file

In [None]:
data_dir = "data/"

# Only the top level of data_dir is scanned: next() takes the first entry
# walk() yields, falling back to an empty listing if the directory is missing.
_, _, filenames = next(walk(data_dir), (data_dir, [], []))
# Ignore stray non-pickle files (e.g. OS metadata) and sort so that the row
# order of the output does not depend on directory listing order.
files = sorted(name for name in filenames if name.endswith(".pickle"))

insight_data = []
for filename in files:
    # NOTE(security): pickle.load can execute arbitrary code; only load files
    # that this notebook wrote itself.
    with open(f"{data_dir}{filename}", 'rb') as infile:
        pickle_data = pickle.load(infile)
    # Each pickle holds a JSON string, so decode it back into a dict.
    insight_data.append(json.loads(pickle_data))

df = pd.DataFrame(insight_data)
df["pub_date"] = pd.to_datetime(df["pub_date"])
# insight_text is a list of strings per post; flatten it to a single string.
df["insight_text"] = df["insight_text"].apply(" ".join)

df.to_csv("gnet_insights.csv", index=False)