In [1]:
# Base packages
import os
import numpy as np
import pandas as pd
import re
import requests

# Web scraping
from bs4 import BeautifulSoup

**Financial Times**

In [2]:
# FT response
ft_response = requests.get("https://www.ft.com/")
ft_soup = BeautifulSoup(ft_response.text, 'html.parser')
# Right data
"""
Tag: a
Class: js-teaser-heading-link
"""

ft_headlines_html = ft_soup.findAll("a", {"class": "js-teaser-heading-link"})
ft_headlines = [item.getText() for item in ft_headlines_html]
ft_headlines[:5]

['UN Security Council to meet over Israeli-Palestinian crisis',
 'The property developers still betting on London offices',
 'US banks could cut 200,000 jobs over next decade, top analyst says',
 'UK withholds backing for Joe Biden’s minimum global business tax',
 'Cairn Energy sues Air India in US over $1.2bn arbitration award']

**Reuters**

In [3]:
# Reuters response
reuters_response = requests.get("https://www.reuters.com/")
reuters_soup = BeautifulSoup(reuters_response.text, 'html.parser')

"""
Tag: span
Class: MediaStoryCard__title___2PHMeX
"""

reuters_headlines_html = reuters_soup.findAll("span", {"class": "MediaStoryCard__title___2PHMeX"})
reuters_headlines = [item.getText() for item in reuters_headlines_html]
reuters_headlines[:5]

['Israel air strikes kill 33 Palestinians, rockets fired from Gaza',
 'Myanmar anti-coup fighters retreat from town as U.S. makes appeal',
 'India’s pace of daily COVID-19 infections slows as more vaccines awaited',
 'Analysis: India’s once-in-a-century budget runs into trouble as virus strikes back',
 "Tesla crash victim lauded 'full self-driving' in videos on Tiktok"]

**Bloomberg**

In [4]:
# Bloomberg
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) Gecko/20100101 Firefox/20.0',
    'From': 'marcus.aurelius@rome.com' 
}

bloomberg_response = requests.get("https://www.bloomberg.com/europe", headers=headers)

bloomberg_soup = BeautifulSoup(bloomberg_response.text, 'html.parser')

bloomberg_headlines_html_1 = bloomberg_soup.findAll("a", {"class": "single-story-module__headline-link"})
bloomberg_headlines_html_2 = bloomberg_soup.findAll("a", {"class": "story-package-module__story__headline-link"})

bloomberg_headlines_1 = [item.getText() for item in bloomberg_headlines_html_1]
bloomberg_headlines_2 = [item.getText() for item in bloomberg_headlines_html_2]

bloomberg_headlines = bloomberg_headlines_1 + bloomberg_headlines_2
bloomberg_headlines = [re.sub(r"(\s\s+)|(\\n)", "", item) for item in bloomberg_headlines]
bloomberg_headlines[:5]

['U.K.’s Sunak Isn’t Sold on Biden’s Global Business Tax Plan',
 'Some ESG Investors Turn to Emerging Markets, Defying Skeptics',
 'EU Covid Passport Due in June, UK Opening at Risk: Virus Update',
 'Bond Vigilantes Swarm European Economies Where Inflation Is Hot',
 'Israeli Strikes Kill 26, Topple Buildings in Gaza City']

## Unifying data sources

In [5]:
news_df = pd.DataFrame(columns=["Source", "Headline"])
bloomberg_df = pd.DataFrame({"Source": np.repeat("Bloomberg", len(bloomberg_headlines)),
                             "Headline": bloomberg_headlines})
ft_df = pd.DataFrame({"Source": np.repeat("Financial Times", len(ft_headlines)),
                      "Headline": ft_headlines})
reuters_df = pd.DataFrame({"Source": np.repeat("Reuters", len(reuters_headlines)),
                           "Headline": reuters_headlines})
for df in [bloomberg_df, ft_df, reuters_df]:
    news_df = news_df.append(df)
    
news_df.reset_index(drop=True, inplace=True)

In [6]:
# Displaying
news_df

Unnamed: 0,Source,Headline
0,Bloomberg,U.K.’s Sunak Isn’t Sold on Biden’s Global Busi...
1,Bloomberg,"Some ESG Investors Turn to Emerging Markets, D..."
2,Bloomberg,"EU Covid Passport Due in June, UK Opening at R..."
3,Bloomberg,Bond Vigilantes Swarm European Economies Where...
4,Bloomberg,"Israeli Strikes Kill 26, Topple Buildings in G..."
...,...,...
128,Reuters,Exxon under pressure as ISS backs Engine No. 1...
129,Reuters,Herd community? Elephants show us how to coexist
130,Reuters,Booming podcast industry comes of age with Amb...
131,Reuters,Black Panther figure joins London Madame Tussa...


In [7]:
lens = []
for item in news_df.Headline:
    lens.append(len(item.split()))
print(max(lens))

16


In [8]:
print(news_df['Headline'][0])

U.K.’s Sunak Isn’t Sold on Biden’s Global Business Tax Plan


## NLP model for sentiment analysis 

In [9]:
import config
import torch
from transformers import AutoModelForSequenceClassification

In [10]:
# Tokenizer and Model
tokenizer = config.TOKENIZER
model_path = config.BASE_MODEL_PATH
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Architecture
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [11]:
max_len = 16
batch_size = 32

input_ids = []
attn_masks = []

for (index, row) in news_df.iterrows():
    encoded_dict = tokenizer.encode_plus(row["Headline"],
                                         max_length=max_len,
                                         pad_to_max_length=True,
                                         truncation=True,
                                         return_tensors='pt')
    input_ids.append(encoded_dict['input_ids'])
    attn_masks.append(encoded_dict['attention_mask'])
    
input_ids = torch.cat(input_ids, dim=0)
attn_masks = torch.cat(attn_masks, dim=0)



In [12]:
from torch.utils.data import TensorDataset, random_split

# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attn_masks)

In [13]:
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

dataloader = DataLoader(
            dataset,  # The training samples.
            sampler = SequentialSampler(dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
            )

In [14]:
def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x, axis=1)[:, None])
    return e_x / np.sum(e_x, axis=1)[:, None]

In [18]:
# Prediction code

# Put model in evaluation mode
model.eval()

# Tracking variables 
predictions = []

print('Evaluating on {:,} test set batches...'.format(len(dataloader)))

# Predict 
for batch in dataloader:
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask = batch
    
    # Telling the model not to compute or store the compute graph, saving memory 
    # and speeding up prediction
    with torch.no_grad():
        
        # Forward pass, calculate logit predictions
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask)

    logits = outputs[0]
    
    # Store predictions and true labels
    predictions.append(logits)

flat_predictions = np.concatenate(predictions, axis=0)    
print('    DONE.')

Evaluating on 5 test set batches...
    DONE.


In [19]:
label_dict = {0: 'positive', 1: 'negative', 2: 'neutral'}

In [20]:
sentiment_indices = np.squeeze(np.argmax(softmax(flat_predictions), axis=1))
pd.Series([label_dict[item] for item in sentiment_indices]).value_counts()

neutral     80
negative    38
positive    15
dtype: int64

**Word Cloud or something similar**

- Maybe not a word cloud, a simple bar chart may be better visually

**Comment:**
Draft code is done, now need to refactor in move into source code files

In [27]:
# Seeing how it would look by exporting to json
out_df = news_df.copy()
out_df['Sentiment'] = pd.Series([label_dict[item] for item in sentiment_indices])
out_df.to_json('newsData.json')
pd.Series([label_dict[item] for item in sentiment_indices]).value_counts().to_json('newsSentiment.json')

In [44]:
dict(pd.read_json('newsSentiment.json', typ='series'))

{'neutral': 80, 'negative': 38, 'positive': 15}