## 1.2 Collect Articles
Next, we collect recent articles by scraping several sites: EDN Network, EE Times, Electronic Design, and Electronics Weekly. The site links are listed in ./data/Artical_Links.csv

In [20]:
import requests
from bs4 import BeautifulSoup, SoupStrainer
import pandas as pd
from datetime import datetime, timedelta
import time
import re
from urllib.parse import urljoin, urlparse
import os
import hashlib
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [21]:
def init_scraper():
    """Build the shared scraper state and make sure the output folder exists.

    Returns:
        dict with the keys:
            "headers": HTTP headers sent with plain requests.
            "cutoff_date": articles dated on or before this are dropped by
                the RSS scrapers.
            "articles": an empty list.
            "chrome_options": Selenium Options for a headless Chrome.

    Side effects:
        Creates ./intermediate_data if it does not exist yet.
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
    }

    # The browser presents the same user agent as the plain HTTP requests.
    browser_flags = (
        '--headless',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        '--window-size=1920,1080',
        f'--user-agent={request_headers["User-Agent"]}',
    )
    browser_options = Options()
    for flag in browser_flags:
        browser_options.add_argument(flag)

    os.makedirs('./intermediate_data', exist_ok=True)

    return {
        "headers": request_headers,
        "cutoff_date": datetime(2025, 4, 1),
        "articles": [],
        "chrome_options": browser_options,
    }


In [22]:
def get_selenium_driver(state):
    """Start a Chrome WebDriver configured from state["chrome_options"].

    Returns:
        The new driver, or None when it cannot be created (the error is
        printed instead of raised).
    """
    try:
        chrome = webdriver.Chrome(options=state["chrome_options"])
    except Exception as err:
        print(f"Failed to initialize Chrome driver: {err}")
        return None
    return chrome

#### Request pages from sites using Selenium

In [23]:
def get_page_source_selenium(state, url, wait_time=10):
    """Fetch the rendered HTML of ``url`` with a fresh Chrome session.

    Args:
        state: Scraper state dict, passed to get_selenium_driver.
        url: Address of the page to load.
        wait_time: Seconds to wait for the <body> element to be present.

    Returns:
        The page source as a string, or None when the driver cannot be
        started or the page fails to load (the error is printed).
    """
    driver = get_selenium_driver(state)
    if not driver:
        return None

    try:
        driver.get(url)
        WebDriverWait(driver, wait_time).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
        return driver.page_source
    except Exception as e:
        print(f"Selenium failed for {url}: {e}")
        return None
    finally:
        # Single cleanup path for success and failure. A failing quit() must
        # neither discard an already fetched page nor escape to the caller.
        try:
            driver.quit()
        except Exception as e:
            print(f"Failed to close Chrome driver for {url}: {e}")

#### Request pages from sites using `requests` with retries

In [24]:
def get_page_requests(state, url, retries=3):
    """Download ``url`` with plain HTTP, retrying on failure.

    Args:
        state: Scraper state dict; state["headers"] is sent with each request.
        url: Address of the page to download.
        retries: Maximum number of attempts.

    Returns:
        The response body as text, or None when every attempt failed (the
        last error is printed) or ``retries`` is not positive.
    """
    # One session serves all attempts and is closed by the context manager,
    # so no connections are left open behind the function.
    with requests.Session() as session:
        session.headers.update(state["headers"])
        for attempt in range(retries):
            try:
                response = session.get(url, timeout=30)
                response.raise_for_status()
                return response.text
            except requests.RequestException as e:
                if attempt == retries - 1:
                    print(f"Requests failed for {url}: {e}")
                    return None
                # Short pause before the next attempt.
                time.sleep(2)
    return None

#### Extract page content

In [25]:

def get_page_content(state, url):
    """Return the HTML of ``url``, trying Selenium first and plain HTTP second.

    The plain-HTTP download is attempted only when Selenium yields nothing.
    Returns None (or an empty result) when both approaches fail.
    """
    return get_page_source_selenium(state, url) or get_page_requests(state, url)


#### Enhanced date parsing (Non-RSS)

In [26]:
def parse_date(date_str):
    """Parse a free-form date string into a datetime.

    Recognised layouts (matched case-insensitively, anywhere in the text):
    "3rd June 2025", "3 June 2025", "June 3, 2025", "Jun 3, 2025",
    "2025-06-03" and "06/03/2025" (read as MM/DD/YYYY).

    Args:
        date_str: Text that may contain a date; may be None or empty.

    Returns:
        A datetime at midnight of the parsed day, or None when the input is
        empty, no layout matches, or the matched values are not a real date.
    """
    if not date_str:
        return None

    # Drop label words; "By.*" removes everything from a "by" to end of line.
    date_str = re.sub(r'Posted on|Modified on|Published|By.*', '', date_str, flags=re.IGNORECASE)
    date_str = date_str.strip()

    months = {
        'january': 1, 'february': 2, 'march': 3, 'april': 4, 'may': 5, 'june': 6,
        'july': 7, 'august': 8, 'september': 9, 'october': 10, 'november': 11, 'december': 12,
        'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'jun': 6,
        'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
    }

    # Each pattern carries the meaning of its three groups explicitly.
    # Guessing the layout from isdigit() sent all-numeric dates down the
    # "day month-name year" path, where they could never parse.
    patterns = [
        (r'(\d{1,2})(?:st|nd|rd|th)?\s+(\w+)\s+(\d{4})', 'day_name_year'),  # 3rd June 2025
        (r'(\w+)\s+(\d{1,2}),?\s+(\d{4})', 'name_day_year'),  # June 3, 2025
        (r'(\d{4})-(\d{2})-(\d{2})', 'year_month_day'),  # 2025-06-03
        (r'(\d{2})/(\d{2})/(\d{4})', 'month_day_year'),  # 06/03/2025
        (r'(\d{1,2})\s+(\w+)\s+(\d{4})', 'day_name_year'),  # 3 June 2025
        (r'(\w{3})\s+(\d{1,2}),?\s+(\d{4})', 'name_day_year'),  # Jun 3, 2025
    ]

    date_str_lower = date_str.lower()

    for pattern, layout in patterns:
        match = re.search(pattern, date_str_lower)
        if not match:
            continue
        first, second, third = match.groups()
        try:
            if layout == 'day_name_year':
                if second in months:
                    return datetime(int(third), months[second], int(first))
            elif layout == 'name_day_year':
                if first in months:
                    return datetime(int(third), months[first], int(second))
            elif layout == 'year_month_day':
                return datetime(int(first), int(second), int(third))
            else:  # month_day_year
                return datetime(int(third), int(first), int(second))
        except ValueError:
            # Impossible calendar date (e.g. 31 February): try the next pattern.
            continue
    return None

#### Scrape EE Times

In [27]:
def scrape_eetimes(state):
    """Collect recent EE Times article links.

    The RSS feeds are tried in order; items published after
    state["cutoff_date"] are kept. Only when no feed produced any article is
    the home page scraped directly, in which case today's date is recorded
    because the listing carries no parsed date.

    Args:
        state: Scraper state dict; reads "headers" and "cutoff_date".

    Returns:
        Up to 20 dicts with the keys 'title', 'url', 'date' (YYYY-MM-DD)
        and 'source'.
    """
    articles = []
    rss_urls = [
        "https://www.eetimes.com/feed/",
        "https://www.eetimes.com/rss/"
    ]

    for rss_url in rss_urls:
        try:
            response = requests.get(rss_url, headers=state["headers"], timeout=30)
            if response.status_code != 200:
                continue
            soup = BeautifulSoup(response.content, 'xml')
        except Exception as e:
            # Best effort: a broken feed must not stop the remaining sources.
            print(f"RSS fetch failed for {rss_url}: {e}")
            continue

        for item in soup.find_all('item'):
            try:
                title = item.find('title').text.strip()
                link = item.find('link').text.strip()
                pub_date = item.find('pubDate')
                if pub_date is None:
                    continue
                # RSS dates look like "Wed, 03 Jun 2025 10:00:00 GMT". Keep the
                # day/month/year tokens, so single-digit days also parse.
                day_month_year = ' '.join(pub_date.text.split(',')[-1].split()[:3])
                article_date = datetime.strptime(day_month_year, '%d %b %Y')
            except (AttributeError, ValueError):
                # Item without title/link or with an unreadable date: skip it.
                continue

            if article_date > state["cutoff_date"]:
                articles.append({
                    'title': title,
                    'url': link,
                    'date': article_date.strftime('%Y-%m-%d'),
                    'source': 'EE Times'
                })

        if articles:
            # This feed delivered; the alternative feed URL is not needed.
            break

    # Direct scraping runs once, after all feeds were tried, and only when
    # they produced nothing.
    if not articles:
        content = get_page_content(state, "https://www.eetimes.com")
        if content:
            soup = BeautifulSoup(content, 'html.parser')

            # Candidate selectors for article links, most specific first.
            selectors = [
                'article .entry-title a', 'h2.entry-title a',
                '.post-title a', 'h3 a', '.river-block h3 a'
            ]
            for selector in selectors:
                links = soup.select(selector)
                for link in links:
                    href = link.get('href')
                    title = link.get_text(strip=True)

                    if href and title and 'eetimes.com' in href:
                        articles.append({
                            'title': title,
                            'url': href,
                            'date': datetime.now().strftime('%Y-%m-%d'),  # Fallback date
                            'source': 'EE Times'
                        })
                if links:  # First selector that matches anything wins.
                    break
    return articles[:20]  # Limit to the first 20 collected articles

#### Scrape Electronics Weekly

In [28]:
def scrape_electronics_weekly(state):
    """Collect recent Electronics Weekly article links.

    The RSS feed is read first; items published after state["cutoff_date"]
    are kept. When the feed yields no article (request error, non-200
    status, or nothing recent enough) the home page is scraped directly and
    today's date is recorded for each link.

    Args:
        state: Scraper state dict; reads "headers" and "cutoff_date".

    Returns:
        Up to 20 dicts with the keys 'title', 'url', 'date' (YYYY-MM-DD)
        and 'source'.
    """
    articles = []
    feed_url = "https://www.electronicsweekly.com/feed/"

    try:
        response = requests.get(feed_url, headers=state["headers"], timeout=30)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'xml')

            for item in soup.find_all('item'):
                try:
                    title = item.find('title').text.strip()
                    link = item.find('link').text.strip()
                    pub_date = item.find('pubDate')
                    if pub_date is None:
                        continue
                    # "Wed, 03 Jun 2025 10:00:00 GMT" -> "03 Jun 2025"; taking
                    # tokens also handles single-digit days.
                    day_month_year = ' '.join(pub_date.text.split(',')[-1].split()[:3])
                    article_date = datetime.strptime(day_month_year, '%d %b %Y')
                except (AttributeError, ValueError):
                    # Item without title/link or with an unreadable date.
                    continue

                if article_date > state["cutoff_date"]:
                    articles.append({
                        'title': title,
                        'url': link,
                        'date': article_date.strftime('%Y-%m-%d'),
                        'source': 'Electronics Weekly'
                    })
    except Exception as e:
        # Best effort: report and let the direct-scraping fallback take over.
        print(f"RSS fetch failed for {feed_url}: {e}")

    # Fall back whenever the feed produced nothing, not only on exceptions.
    if not articles:
        content = get_page_content(state, "https://www.electronicsweekly.com")
        if content:
            soup = BeautifulSoup(content, 'html.parser')

            for link in soup.select('article h2 a, .entry-title a'):
                href = link.get('href')
                title = link.get_text(strip=True)

                if href and title:
                    articles.append({
                        'title': title,
                        'url': urljoin("https://www.electronicsweekly.com", href),
                        'date': datetime.now().strftime('%Y-%m-%d'),
                        'source': 'Electronics Weekly'
                    })

    return articles[:20]

#### Scrape EDN

In [29]:
def scrape_edn_network(state):
    """Collect recent EDN Network article links.

    The RSS feed is read first; items published after state["cutoff_date"]
    are kept. When the feed yields no article (request error, non-200
    status, or nothing recent enough) the home page is scraped directly and
    today's date is recorded for each link.

    Args:
        state: Scraper state dict; reads "headers" and "cutoff_date".

    Returns:
        Up to 20 dicts with the keys 'title', 'url', 'date' (YYYY-MM-DD)
        and 'source'.
    """
    articles = []
    feed_url = "https://www.edn.com/feed/"

    try:
        response = requests.get(feed_url, headers=state["headers"], timeout=30)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'xml')

            for item in soup.find_all('item'):
                try:
                    title = item.find('title').text.strip()
                    link = item.find('link').text.strip()
                    pub_date = item.find('pubDate')
                    if pub_date is None:
                        continue
                    # "Wed, 03 Jun 2025 10:00:00 GMT" -> "03 Jun 2025"; taking
                    # tokens also handles single-digit days.
                    day_month_year = ' '.join(pub_date.text.split(',')[-1].split()[:3])
                    article_date = datetime.strptime(day_month_year, '%d %b %Y')
                except (AttributeError, ValueError):
                    # Item without title/link or with an unreadable date.
                    continue

                if article_date > state["cutoff_date"]:
                    articles.append({
                        'title': title,
                        'url': link,
                        'date': article_date.strftime('%Y-%m-%d'),
                        'source': 'EDN Network'
                    })
    except Exception as e:
        # Best effort: report and let the direct-scraping fallback take over.
        print(f"RSS fetch failed for {feed_url}: {e}")

    # Fall back whenever the feed produced nothing, not only on exceptions.
    if not articles:
        content = get_page_content(state, "https://www.edn.com")
        if content:
            soup = BeautifulSoup(content, 'html.parser')

            for link in soup.select('.river-block h3 a, article h2 a'):
                href = link.get('href')
                title = link.get_text(strip=True)

                if href and title:
                    articles.append({
                        'title': title,
                        'url': urljoin("https://www.edn.com", href),
                        'date': datetime.now().strftime('%Y-%m-%d'),
                        'source': 'EDN Network'
                    })

    return articles[:20]

#### Scrape Electronic Design

In [30]:

def scrape_electronic_design(state):
    """Collect recent Electronic Design article links.

    The RSS feed is read first; items published after state["cutoff_date"]
    are kept. When the feed yields no article (request error, non-200
    status, or nothing recent enough) the home page is scraped directly and
    today's date is recorded for each link.

    Args:
        state: Scraper state dict; reads "headers" and "cutoff_date".

    Returns:
        Up to 20 dicts with the keys 'title', 'url', 'date' (YYYY-MM-DD)
        and 'source'.
    """
    articles = []
    feed_url = "https://www.electronicdesign.com/rss.xml"

    try:
        response = requests.get(feed_url, headers=state["headers"], timeout=30)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'xml')

            for item in soup.find_all('item'):
                try:
                    title = item.find('title').text.strip()
                    link = item.find('link').text.strip()
                    pub_date = item.find('pubDate')
                    if pub_date is None:
                        continue
                    # "Wed, 03 Jun 2025 10:00:00 GMT" -> "03 Jun 2025"; taking
                    # tokens also handles single-digit days.
                    day_month_year = ' '.join(pub_date.text.split(',')[-1].split()[:3])
                    article_date = datetime.strptime(day_month_year, '%d %b %Y')
                except (AttributeError, ValueError):
                    # Item without title/link or with an unreadable date.
                    continue

                if article_date > state["cutoff_date"]:
                    articles.append({
                        'title': title,
                        'url': link,
                        'date': article_date.strftime('%Y-%m-%d'),
                        'source': 'Electronic Design'
                    })
    except Exception as e:
        # Best effort: report and let the direct-scraping fallback take over.
        print(f"RSS fetch failed for {feed_url}: {e}")

    # Fall back whenever the feed produced nothing, not only on exceptions.
    if not articles:
        content = get_page_content(state, "https://www.electronicdesign.com")
        if content:
            soup = BeautifulSoup(content, 'html.parser')

            for link in soup.select('article h2 a, .post-title a'):
                href = link.get('href')
                title = link.get_text(strip=True)

                if href and title:
                    articles.append({
                        'title': title,
                        'url': urljoin("https://www.electronicdesign.com", href),
                        'date': datetime.now().strftime('%Y-%m-%d'),
                        'source': 'Electronic Design'
                    })

    return articles[:20]


#### Function to scrape all sites

In [31]:
def scrape_all_sites():
    """Run every site scraper and return one de-duplicated, sorted list.

    A failing scraper is reported and skipped. For articles sharing a URL,
    the first one collected is kept.

    Returns:
        List of article dicts ordered by their 'date' value, newest first.
    """
    scraper_state = init_scraper()
    site_scrapers = (
        ("EE Times", scrape_eetimes),
        ("Electronics Weekly", scrape_electronics_weekly),
        ("EDN Network", scrape_edn_network),
        ("Electronic Design", scrape_electronic_design),
    )

    collected = []
    for label, scraper in site_scrapers:
        try:
            collected += scraper(scraper_state)
        except Exception as err:
            print(f"Failed to scrape {label}: {err}")
        time.sleep(2)  # Rate limiting between sites

    # setdefault keeps the first article seen for each URL, in arrival order.
    first_by_url = {}
    for entry in collected:
        first_by_url.setdefault(entry['url'], entry)

    # Dates are 'YYYY-MM-DD' strings, so text order equals date order.
    return sorted(first_by_url.values(), key=lambda entry: entry['date'], reverse=True)

#### Save Relevant Article Links

In [32]:
# Scrape all sites and store the resulting article links as CSV.
articles = scrape_all_sites()
# Name the columns explicitly so the CSV keeps its header row even when
# nothing was scraped; an empty frame would otherwise be written without one.
df = pd.DataFrame(articles, columns=['title', 'url', 'date', 'source'])
df.to_csv('./intermediate_data/Scraped_Article_Links.csv', index=False)

#### Scrape articles based on keywords

In [33]:
def scrape_articles_with_keywords(state, keywords):
    """Return scraped articles whose title or page text mentions a keyword.

    Matching is a case-insensitive substring test. The article page is only
    downloaded when the title alone does not already match.

    Args:
        state: Scraper state dict, passed on to the site scrapers and to
            get_page_content.
        keywords: Iterable of keyword strings.

    Returns:
        List of matching article dicts, in scraping order.
    """
    # Lower-case once so keywords written with capitals still match.
    wanted = [keyword.lower() for keyword in keywords]

    all_articles = []

    scraping_functions = [
        scrape_eetimes,
        scrape_electronics_weekly,
        scrape_edn_network,
        scrape_electronic_design
    ]

    for scraper in scraping_functions:
        try:
            articles = scraper(state)
            for article in articles:
                try:
                    article_text = f"{article['title']}".lower()

                    if any(keyword in article_text for keyword in wanted):
                        # Title already matches: skip the costly page fetch.
                        all_articles.append(article)
                        continue

                    content = get_page_content(state, article['url'])
                    if not content:
                        continue

                    soup = BeautifulSoup(content, 'html.parser')
                    body_text = soup.get_text(separator=' ', strip=True)
                    # Search title and body as one text, joined by a space.
                    article_text += " " + body_text.lower()

                    if any(keyword in article_text for keyword in wanted):
                        all_articles.append(article)

                except Exception as e:
                    print(f"Error processing article: {e}")
                    continue

        except Exception as e:
            print(f"Failed to scrape site: {e}")
            continue

        time.sleep(1)

    return all_articles


#### Trade-relevant articles

In [38]:
# Collect the articles that mention trade/tariff topics and store them as CSV.
state = init_scraper()
tarrif_keywords = ["tariff", "trade war", "export", "import duty"]
tarrif_articles = scrape_articles_with_keywords(state, tarrif_keywords)

# Name the columns explicitly so the CSV keeps its header row even when no
# article matched; an empty frame would otherwise be written without one.
df = pd.DataFrame(tarrif_articles, columns=['title', 'url', 'date', 'source'])
df.to_csv('./intermediate_data/Tarrif_Articles.csv', index=False)

#### Supply-relevant articles

In [39]:
# Collect the articles that mention supply-chain topics and store them as CSV.
# `state` must already exist (created by an earlier init_scraper() call).
supply_keywords = ["shortage", "supply chain", "distribution", "logistics"]
supply_articles = scrape_articles_with_keywords(state, supply_keywords)

# Name the columns explicitly so the CSV keeps its header row even when no
# article matched; an empty frame would otherwise be written without one.
df = pd.DataFrame(supply_articles, columns=['title', 'url', 'date', 'source'])
df.to_csv('./intermediate_data/Supply_Articles.csv', index=False)