In [1]:
#Please use a virtual environment for this notebook with a Python at least 3.11
%pip install pandas numpy tqdm dash spacy geopy sentence_transformers transformers torch==2.6 scikit-learn plotly pycountry geonamescache

Collecting dash
  Downloading dash-3.0.4-py3-none-any.whl.metadata (10 kB)
Collecting pycountry
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Collecting geonamescache
  Downloading geonamescache-2.0.0-py3-none-any.whl.metadata (3.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.6)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.6)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.6)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.6)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch==2.6)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-ma

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import dash
import spacy
import geopy
from sentence_transformers import SentenceTransformer
import transformers
import torch

import sklearn
import plotly.express as px

In [3]:
#Using Google Collab so we can use the Collab Pro subcription to run faster the calculation
from google.colab import drive
drive.mount('/content/drive')
PATH = "/content/drive/My Drive/Deeplearning/"

Mounted at /content/drive


In [4]:
# Reading and displaying headlines from Reuters, CNBC, and Guardian

df_reuters = pd.read_csv(PATH + "Financial News Headlines Data/reuters_headlines.csv")
df_cnbc = pd.read_csv(PATH + "Financial News Headlines Data/cnbc_headlines.csv")
df_guardian = pd.read_csv(PATH + "Financial News Headlines Data/guardian_headlines.csv")

print("Reuters Headlines:")
print(df_reuters.head())

print("\nCNBC Headlines:")
print(df_cnbc.head())

print("\nGuardian Headlines:")
print(df_guardian.head())

Reuters Headlines:
                                           Headlines         Time  \
0  TikTok considers London and other locations fo...  Jul 18 2020   
1  Disney cuts ad spending on Facebook amid growi...  Jul 18 2020   
2  Trail of missing Wirecard executive leads to B...  Jul 18 2020   
3  Twitter says attackers downloaded data from up...  Jul 18 2020   
4  U.S. Republicans seek liability protections as...  Jul 17 2020   

                                         Description  
0  TikTok has been in discussions with the UK gov...  
1  Walt Disney  has become the latest company to ...  
2  Former Wirecard  chief operating officer Jan M...  
3  Twitter Inc said on Saturday that hackers were...  
4  A battle in the U.S. Congress over a new coron...  

CNBC Headlines:
                                           Headlines  \
0  Jim Cramer: A better way to invest in the Covi...   
1     Cramer's lightning round: I would own Teradyne   
2                                                Na

In [6]:
# Convert date columns
df_reuters['date'] = pd.to_datetime(df_reuters['Time'], format='%b %d %Y', errors='coerce')
df_cnbc['date'] = pd.to_datetime(df_cnbc['Time'].str.extract(r'(\d{1,2} \w+ \d{4})')[0], format='%d %b %Y', errors='coerce')

# Guardian has no description only headlines so it is dropped → no need to parse its dates
# Add source column to each dataframe
df_reuters['source'] = 'Reuters'
df_cnbc['source'] = 'CNBC'

# Concatenate Reuters and CNBC only
df_combined = pd.concat([df_reuters, df_cnbc], ignore_index=True)

# Sort by date (ascending)
df_combined = df_combined.sort_values(by='date', ascending=True).reset_index(drop=True)

# Drop 'Time' column
df_combined.drop(columns=['Time'], inplace=True)

# Reorder columns
df_combined = df_combined[['date', 'Headlines', 'Description', 'source']]

# Save to CSV
df_combined.to_csv(PATH + 'Output/all_headlines.csv', index=False)

# Display the combined dataframe
print(f"\nCombined dataframe shape (Reuters + CNBC only): {df_combined.shape}")
print("\nCombined dataframe sample:")
print(df_combined.head())

# Check for missing values
print("\nMissing values per column:")
print(df_combined.isna().sum())

# Count articles by source
print("\nArticle counts by source:")
print(df_combined['source'].value_counts())

# Total number of articles
print(f"\nTotal number of articles: {len(df_combined)}")



Combined dataframe shape (Reuters + CNBC only): (35850, 4)

Combined dataframe sample:
        date                                          Headlines  \
0 2017-12-22  Cramer: Never buy a stock all at once — you'll...   
1 2017-12-22  Cramer: I helped investors through the 2010 fl...   
2 2017-12-22  Cramer says owning too many stocks and too lit...   
3 2017-12-26                       Markets lack Christmas cheer   
4 2017-12-27  S&P tends to start new year bullish after this...   

                                         Description source  
0  Jim Cramer doubled down on his key investing r...   CNBC  
1  Jim Cramer built on his "nobody ever made a di...   CNBC  
2  Jim Cramer broke down why owning fewer stocks ...   CNBC  
3  According to Kensho, here's how markets have f...   CNBC  
4  The S&P is on track to end the year up 20 perc...   CNBC  

Missing values per column:
date           1455
Headlines       280
Description     280
source            0
dtype: int64

Article counts 