In [None]:
# Initial idea: scrape BBC Science articles. However, BBC Science uses JavaScript to load its sub-pages, which would require a browser-automation tool such as Selenium.
# Instead, let's use a site that serves static HTML, like the Guardian. The page number appearing in the URL (?page=N) suggests each page is rendered server-side.

In [9]:
# Import scraping libraries
# requests - sends HTTP requests to webpages, fetches content
# BeautifulSoup - parses the content of a webpage
import os
import re
from datetime import datetime
from pathlib import Path
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [27]:
# Resolve the project directories relative to the current working directory:
# the working directory holds the code, and the repository root is its parent
code_dir = Path().resolve()
base_dir = code_dir.parent
output_dir = base_dir.joinpath("output")

In [37]:
# Show the resolved directories so a wrong working directory is spotted early
print(
    f"Repo root: {base_dir} \n"
    f"Code: {code_dir} \n"
    f"Output: {output_dir}"
)

Repo root: /Users/sunny/Documents/GitHub/nlp-analysis 
Code: /Users/sunny/Documents/GitHub/nlp-analysis/code 
Output: /Users/sunny/Documents/GitHub/nlp-analysis/output


***Start scraping***

In [None]:
# Step 1: Define the base URL and headers
base_url = "https://www.theguardian.com/lifeandstyle/relationships"
headers = {"User-Agent": "Mozilla/5.0"}  # Prevent blocking by identifying as a browser

# Step 2: Loop through the listing pages (pages 1 to 99)
all_articles = []
for page in range(1, 100):
    url = f"{base_url}?page={page}"  # Construct the paginated URL
    try:
        # A timeout keeps a single stalled connection from hanging the whole crawl
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to fetch page {page}. Error: {exc}")
        break

    if response.status_code != 200:  # Stop at the first page that cannot be fetched
        print(f"Failed to fetch page {page}. Status code: {response.status_code}")
        break

    soup = BeautifulSoup(response.text, 'html.parser')

    # Step 3: Extract article links and titles
    # NOTE(review): 'dcr-ezvrjj' looks like a generated CSS class name; confirm it
    # still matches the article anchors if this loop starts returning nothing
    for article in soup.find_all('a', class_='dcr-ezvrjj'):
        href = article.get('href')
        if not href:
            continue  # An anchor without an href cannot be followed
        title = article.get('aria-label', 'No title found')
        # Resolve the href against the page URL: site-relative hrefs get the
        # Guardian domain prepended, already-absolute hrefs are kept unchanged
        link = urljoin(url, href)
        all_articles.append((title, link))

In [None]:
# Step 4: Verify that article titles and links are correct
# Only the first nine entries are previewed, so slice instead of walking the whole list
for idx, (title, link) in enumerate(all_articles[:9], start=1):
    print(f"{idx}. {title}\n   {link}")

In [25]:
# Define function to scrape article text including drop capitals
def get_article_text_with_links(soup):
    """Extract the body text of an article page as one string.

    Collects every <p> carrying one of the article-body classes, joins the
    paragraphs with single spaces, re-attaches a drop capital to the word it
    belongs to, and keeps the visible text of hyperlinks and other inline tags.

    Args:
        soup: Parsed page (a BeautifulSoup object or any tag exposing
            ``find_all``).

    Returns:
        str: The article text with whitespace before ``? . ! , ¿`` removed,
        leading/trailing whitespace stripped, and non-breaking spaces replaced
        by regular spaces. Empty string when no matching paragraph is found.
    """
    # Collect fragments and join once instead of growing a string with +=
    parts = []

    # Loop through all paragraphs
    for p in soup.find_all('p', class_=['dcr-15rw6c2', 'dcr-s3ycb2']):
        # Check if there's a <span> with the drop capital class within the paragraph
        drop_capital_span = p.find('span', class_='dcr-15rw6c2')

        if drop_capital_span:
            drop_capital_text = drop_capital_span.get_text(strip=True)

            # Text of every other child of the paragraph. Compare by identity so
            # that only the drop-cap span itself is left out, not a different
            # child that merely has equal markup.
            rest_of_paragraph = ''.join(
                child if isinstance(child, str) else child.get_text()
                for child in p.children
                if child is not drop_capital_span
            )

            if rest_of_paragraph and not rest_of_paragraph[0].isspace():
                # No space between the drop cap and the next text: glue them together
                parts.append(drop_capital_text + rest_of_paragraph + ' ')
            else:
                # Otherwise keep the drop capital as its own token
                parts.append(drop_capital_text + ' ' + rest_of_paragraph + ' ')
        else:
            # Handle paragraphs without a drop cap
            for element in p.children:
                if isinstance(element, str):  # Plain text node
                    parts.append(element.strip() + ' ')
                else:
                    # Any inline tag (<a>, <em>, <strong>, ...): keep its visible
                    # text so emphasised or linked words are not lost
                    inline_text = element.get_text(strip=True)
                    if inline_text:
                        parts.append(inline_text + ' ')

    article_text = ''.join(parts)

    # Clean up spaces before punctuation: remove space before punctuation marks, remove trailing space
    article_text = re.sub(r'\s([?.!,¿])', r'\1', article_text).strip()

    # Convert non-breaking spaces to regular spaces
    article_text = article_text.replace('\xa0', ' ')

    return article_text

In [55]:
# Now use the function to extract text from the article
articles_data = []

for idx, (title, link) in enumerate(all_articles, start=1):
    try:
        # A timeout keeps a single stalled connection from hanging the whole crawl
        article_response = requests.get(link, headers=headers, timeout=30)
    except requests.RequestException as exc:
        # Skip this article but keep everything scraped so far
        print(f"Failed to fetch article {idx}. Error: {exc}")
        continue

    if article_response.status_code != 200:
        # If the request fails, print an error message
        print(f"Failed to fetch article {idx}. Status code: {article_response.status_code}")
        continue

    article_soup = BeautifulSoup(article_response.text, 'html.parser')

    # Extract text
    article_text = get_article_text_with_links(article_soup)

    # Extract date; it appears in two tags/classes
    date_element = article_soup.find('span', class_="dcr-u0h1qy")
    if not date_element:
        date_element = article_soup.find('div', class_='dcr-1pexjb9')

    date_text = None
    parsed_date = None
    if date_element:
        # Drop the last space-separated token before parsing
        # (presumably a timezone abbreviation - confirm against a live page)
        date_text = date_element.get_text(strip=True).rsplit(" ", 1)[0]
        try:
            # NOTE: %a and %b are locale-dependent; this expects English day/month names
            parsed_date = datetime.strptime(date_text, "%a %d %b %Y %H.%M")
        except ValueError:
            # An unexpected date format is recorded as a missing date instead of
            # aborting the whole crawl
            parsed_date = None

    # Add to articles data
    articles_data.append({'title': title, 'link': link, 'date': parsed_date, 'text': article_text})

# Convert to DataFrame; explicit columns keep the schema stable even if nothing was scraped
guardian_relationships = pd.DataFrame(articles_data, columns=['title', 'link', 'date', 'text'])

In [56]:
# Make sure the output folder exists before writing, so a fresh clone does not fail here
output_dir.mkdir(parents=True, exist_ok=True)
guardian_relationships.to_csv(output_dir / "guardian_relationships_articles.csv", index=False)

In [14]:
# Articles for which no date was recorded: show their title and link
guardian_relationships.loc[guardian_relationships['date'].isna(), ['title', 'link']]

Unnamed: 0,title,link
296,Old Lesbians: reclaiming old age and queerness...,https://www.theguardian.com/world/ng-interacti...
781,Lover to lover: photographers’ most intimate i...,https://www.theguardian.com/artanddesign/galle...
1813,Heads together: the light of hopeful faces – i...,https://www.theguardian.com/books/gallery/2021...
1872,From Bob and Blanche to Kath and Kim: Australi...,https://www.theguardian.com/artanddesign/galle...
1919,An uncertain future for Japan’s love hotels – ...,https://www.theguardian.com/artanddesign/galle...
