In [2]:
import json, os, re

from bs4 import BeautifulSoup
import numpy as np
import requests
from requests.models import MissingSchema
import trafilatura
import spacy

In [3]:
# Define web page url
url = 'https://www.theguardian.com/uk-news/2023/sep/08/what-a-year-of-king-charles-has-shown-us-about-how-he-wants-to-reign'

In [4]:
# Define Beautifulsoup fallback function for cases when Trafilatura is unable to extract text
def extract_text_fallback(response_content):
    
    # Create the beautifulsoup object
    soup = BeautifulSoup(response_content, 'html.parser')
    
    # Find the text:
    text = soup.find_all(text=True)
    
    # Remove unwanted tag elements
    cleaned_text = ''
    unwanted_tags = [
        '[document]',
        'noscript',
        'header',
        'html',
        'meta',
        'head', 
        'input',
        'script',
        'style',]

    # Extract text, ignoring unwanted tags
    for item in text:
        if item.parent.name not in unwanted_tags:
            cleaned_text += '{} '.format(item)
            
    # Remove any tab separation and strip the text
    cleaned_text = cleaned_text.replace('\t', '')
    return cleaned_text.strip()
    
# Define extract function, Trafilatura with callback function
def extract_text(url):
    
    downloaded_url = trafilatura.fetch_url(url)
    try:
        extracted_content = trafilatura.extract(downloaded_url, output_format='json', with_metadata=True, include_comments = False,
                            date_extraction_params={'extensive_search': True, 'original_date': True})
    except AttributeError:
        extracted_content = trafilatura.extract(downloaded_url, output_format='json', with_metadata=True,
                            date_extraction_params={'extensive_search': True, 'original_date': True})
    if extracted_content:
        json_output = json.loads(extracted_content)
        return json_output['text']
    else:
        try:
            # Obtain the response
            resp = requests.get(url)
            # Check if the response status is 200 - Status OK, collect HTML Content
            if resp.status_code == 200:
                return extract_text_fallback(resp.content)
            else:
                # If both Trafilature and BeautifulSoup functions fail
                return np.nan
        # Handle any URLs that don't have the correct protocol
        except MissingSchema:
            return np.nan


In [5]:
# Extract parsed text
final_article_text = extract_text(url)

print(final_article_text)

As the king spends the first anniversary of his mother’s death, and of his accession, at his Scottish highland retreat on Friday, he may reflect back on a year of historic transition not seen in 70 years.
Suggestions that Charles, 74, sees himself as a “caretaker king”, keeping the throne warm for the new Prince of Wales, who will be the real reformer, is not a scenario recognised at Buckingham Palace.
There have been no eye-catching reforms in his first year, and he can be fairly described as the “cautious” king. But there are clues to his aspirations in the small changes.
He has turbo-charged royal receptions, harnessing their soft-power to the maximum, sources point out, in the knowledge he can no longer speak out publicly on subjects he remains passionate about.
So, the “convening” king is a label it seems he will accept.
When advised by Liz Truss’s government not to attend the Cop27 climate conference in Egypt, he instead hosted an eve of Cop27 reception for international represen

In [None]:
# Convert text into spacy tokens doc
nlp = spacy.load("en_core_web_sm")
doc = nlp(final_article_text)
print(doc)

In [7]:
# Check the word count per text document
print(f"The estimated word count for this document is: {len(doc)}.\n")
# Check the number of sentences
print(f"The estimated number of sentences in the document is: {len(list(doc.sents))}.")

The estimated word count for this document is: 1795.

The estimated number of sentences in the document is: 68.


In [8]:
# Extract article's title from the first sentence
for sent_i in enumerate(doc.sents):
  sent_i
first_sentence = str(f"{list(doc.sents)[0]}")
first_sentence = first_sentence.replace(' ','_')
first_sentence = re.sub('\n.*','',first_sentence)
article_first_sentence = re.sub('[?!\\/:*|\"\'<>]','',first_sentence).split(',')[0]
print(f"The article's first sentence is: {article_first_sentence}")


As_the_king_spends_the_first_anniversary_of_his_mother’s_death


In [9]:
# Extract article's title from tag

## Obtain the response
resp = requests.get(url)
## Get Content
soup = BeautifulSoup(resp.content, 'html.parser')
## Get title
title = soup.find('title').string.split('|')[0]
article_title = title.replace(' ','_')
print(f"The article's title is: {article_title}")

What_a_year_of_King_Charles_has_shown_us_about_how_he_wants_to_reign_


In [10]:
# Check working directory
os.getcwd()

'c:\\GitHub\\Article_Analyzer_NLP'

In [11]:
# Save scraped article to file

## Name with title
file = open(f"{article_title}.txt",'w', encoding="utf-8")
file.write(final_article_text)
file.close()

# ## Name with the first sentence
# file = open(f"{article_first_sentence}.txt",'w')
# file.write(final_article_text)
# file.close()