# 2.2 Clean Article Data
This notebook cleans the article data in preparation for modeling. The ultimate goal is to match cleaned articles to the products based on their provided data.

In [3]:
import json
import pandas as pd
import re
import logging
from datetime import datetime

#### Set up logging

In [4]:
# Show INFO-and-above records, each prefixed with a timestamp and its level.
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

#### Load data

In [5]:
def load_raw_data(path='./intermediate_data/Scraped_Article_Raw_Data.json'):
    """Load the raw article records from a UTF-8 encoded JSON file.

    Args:
        path: Location of the JSON file to read. Defaults to the raw
            scraped-article file under ``./intermediate_data``, so existing
            zero-argument calls behave exactly as before.

    Returns:
        The parsed top-level JSON value. The rest of this notebook iterates
        over it as a sequence of article records.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        raw_data = json.load(f)
    logging.info(f"Loaded {len(raw_data)} articles from file")
    return raw_data

#### Remove unwanted information from text

In [6]:
def _is_blank(value):
    """Return True when ``value`` carries nothing worth cleaning.

    Blank means ``None``, a missing-value marker (NaN, ``pd.NA``, ``NaT``),
    any falsy scalar (``""``, ``0``, ``False``) or an empty container.
    """
    if value is None:
        return True
    if isinstance(value, str):
        return not value
    if pd.api.types.is_scalar(value):
        # Test for NA before truthiness: ``not pd.NA`` raises TypeError.
        return bool(pd.isna(value)) or not value
    # pd.isna() on a list-like returns an array whose truth value is
    # ambiguous, so containers are judged by emptiness only.
    try:
        return len(value) == 0
    except TypeError:
        return False


def clean_text(text):
    """Strip web boilerplate from a piece of article text.

    Processing order:
      1. Blank / missing input short-circuits to ``""``.
      2. The value is converted with ``str()`` and runs of whitespace are
         collapsed to single spaces (so the multi-word phrases below match
         even when the source had line breaks between the words).
      3. Email addresses, URLs and common site boilerplate phrases are
         deleted.
      4. Runs of 3+ dots / hyphens are shortened to exactly three.
      5. Whitespace is collapsed again, because every deletion in step 3
         can leave a double space or a stray leading/trailing space behind.

    Args:
        text: The raw value to clean. Non-string values are stringified.

    Returns:
        The cleaned string; ``""`` for blank or missing input.
    """
    if _is_blank(text):
        return ""
    text = str(text)  # Convert to string
    text = re.sub(r'\s+', ' ', text).strip()  # Normalize whitespace up front
    text = re.sub(r'\S+@\S+', '', text)  # Remove email addresses
    # Remove URLs. Note that `$-_` inside the character class is a range
    # (ASCII `$` through `_`), so it also covers digits, upper-case letters
    # and most URL punctuation.
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    # Remove common web artifacts. The \b anchors keep the match to whole
    # words, so "subscribers" or "newsletters" are not cut down to "rs" / "s".
    text = re.sub(r'\b(?:Cookie Policy|Privacy Policy|Terms of Service)\b', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\b(?:Subscribe|Newsletter|Advertisement)\b', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\b(?:Share this article|Follow us on)\b', '', text, flags=re.IGNORECASE)
    # Remove excessive punctuation
    text = re.sub(r'[.]{3,}', '...', text)
    text = re.sub(r'[-]{3,}', '---', text)

    # Close the gaps left behind by the removals above.
    return re.sub(r'\s+', ' ', text).strip()

#### Clean all fields of an article

In [7]:
def clean_article_data(article):
    """Build a cleaned copy of one article record.

    ``title``, ``source`` and ``text`` are passed through ``clean_text``;
    ``date`` and ``link`` are copied over untouched. A key missing from
    ``article`` is treated as an empty string, and keys other than these
    five are not carried over.

    Args:
        article: Mapping holding the raw article fields.

    Returns:
        A new dict with the keys ``title``, ``source``, ``date``, ``link``
        and ``text``, in that order.
    """
    return {
        'title': clean_text(article.get('title', '')),
        'source': clean_text(article.get('source', '')),
        'date': article.get('date', ''),
        'link': article.get('link', ''),
        'text': clean_text(article.get('text', '')),
    }

#### Clean all articles in the dataset

In [8]:
def clean_article_dataset(raw_data):
    """Clean every article in ``raw_data``, skipping the ones that fail.

    Each record is passed to ``clean_article_data``. If that call raises,
    the error is logged together with the article's 1-based position and
    processing moves on to the next record. Progress is logged after every
    50th article.

    Args:
        raw_data: Sequence of raw article records.

    Returns:
        A list with the successfully cleaned articles in their original
        order, or ``None`` (after logging an error) when ``raw_data`` is
        empty or otherwise falsy.
    """
    if not raw_data:
        logging.error("No data provided for cleaning")
        return None

    results = []
    for position, raw_article in enumerate(raw_data, start=1):
        try:
            results.append(clean_article_data(raw_article))
            if position % 50 == 0:
                logging.info(f"Cleaned {position}/{len(raw_data)} articles")
        except Exception as exc:
            # Best effort: one bad record must not abort the whole run.
            logging.error(f"Error cleaning article {position}: {str(exc)}")

    return results

### Clean and save article data

In [9]:
# Run the pipeline end to end: load the raw articles, clean them, save the result.
raw_article_data = load_raw_data()
# NOTE: clean_article_dataset returns None for empty input, which is then
# written out as the JSON literal `null`.
cleaned_article_data = clean_article_dataset(raw_article_data)
filename = './intermediate_data/Cleaned_Article_Data.json'
# Serialize before opening the file: open(..., 'w') truncates the target
# immediately, so an error raised part-way through json.dump would leave a
# partial file in place of any previous output.
serialized_articles = json.dumps(cleaned_article_data, indent=2, ensure_ascii=False)
with open(filename, 'w', encoding='utf-8') as f:
    f.write(serialized_articles)


2025-06-11 15:01:15,619 - INFO - Loaded 30 articles from file
