## **Importing resources**

In [None]:
# Silence FutureWarning messages so they do not clutter the cell output.
# The filter is registered globally, so it stays active for every cell run after this one.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Install the google-colab-selenium package to use Selenium within Google Colab.
# NOTE(review): the version is not pinned, so a fresh runtime may pull a newer release.
%pip install -q google-colab-selenium

In [None]:
import google_colab_selenium as gs      # for setting up Selenium in Google Colab

import urllib.request                   # for opening and reading URLs
import requests                         # to make HTTP requests for retrieving web content
from bs4 import BeautifulSoup           # for parsing HTML and extracting data
import pandas as pd                     # for data manipulation in DataFrames
import random

from datetime import date, datetime, timedelta               # to work with date objects
import time                             # for time-related functions like pausing execution
import sys                              # to manage system-specific parameters
import re                               # for working with regular expressions

In [None]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

## **Mounting Google Drive**

In [None]:
# Mount Google Drive at /content/gdrive so the dataset folder below is reachable.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Folder (inside the mounted Drive) that the CSV files in this notebook are read from and written to.
# NOTE(review): this name shadows Python's built-in dir(); it is kept because later cells reference it.
dir = '/content/gdrive/MyDrive/CSCI 199/Methodology/Datasets/Raw/'

## **Extract Source Function**

In [None]:
def extract_source(url, timeout=30):
    """Download ``url`` with an HTTP GET request.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. It is passed
            to ``requests.get``; without it a stalled connection would block
            the notebook indefinitely.

    Returns:
        The response object returned by ``requests.get``, or ``None`` when the
        request raised. Failures are printed instead of propagated so that a
        long scraping loop can keep going.
    """
    agent = {"User-Agent": "Chrome/105.0.0.0"}
    try:
        return requests.get(url, headers=agent, timeout=timeout)
    except Exception as e:
        error_type, error_obj, error_info = sys.exc_info()
        print(f'ERROR FOR LINK: {url}')
        print(f'{error_type.__name__} occurred on Line {error_info.tb_lineno}: {e}')
        # Previously the function fell through to `return source` with `source`
        # unbound, which raised UnboundLocalError and hid the real failure.
        return None

# **News Scraping**

## **Inquirer's Bandera**

### **Site Scraper**

In [None]:
# Site label (used in the output file name) and the DataFrame that collects
# one row per listed article: headline, article URL and listing date.
site = "Bandera"
banderaData = pd.DataFrame(columns=['Statement', 'Link', 'Date'])

# Cut-off month and year: the scraping loop skips articles dated after this month.
start_month = 5  # May
start_year = 2023

# Target number of articles; the scraping loop stops once this many rows are collected.
total_articles_to_scrape = 5000
articles_scraped = 0  # Running count of rows collected so far

# Listing page at which the crawl begins; current_page is advanced by the loop.
start_page = 41  # First listing page to request
current_page = start_page

# Create the Selenium Chrome WebDriver (via google-colab-selenium) used to load listing pages.
driver = gs.Chrome()

# Function to safely load pages with retry and delay logic
def safe_get(driver, url, retries=3, delay_range=(20, 40)):
    """Load ``url`` in ``driver``, retrying on failure and pausing after success.

    Args:
        driver: Object exposing ``get(url)`` (the Selenium WebDriver).
        url: Page to load.
        retries: Maximum number of load attempts.
        delay_range: ``(low, high)`` bounds, in seconds, of the random pause
            taken after a successful load to slow down requests.

    Returns:
        ``True`` once a load attempt succeeds, ``False`` when every attempt
        raised (or ``retries`` is 0).
    """
    for attempt in range(retries):
        try:
            print(f"Attempt {attempt + 1}/{retries} for URL: {url}")
            driver.get(url)
        except Exception as e:
            if attempt + 1 >= retries:
                # Last attempt: no retry follows, so do not announce one or sleep.
                print(f"Retry {attempt + 1}/{retries} failed: {e}.")
                break
            wait_time = (2 ** attempt) + random.uniform(5, 15)  # Exponential backoff plus jitter
            print(f"Retry {attempt + 1}/{retries} failed: {e}. Retrying in {wait_time:.2f} sec...")
            time.sleep(wait_time)
        else:
            # Throttle: wait a random interval before the caller issues the next request.
            wait_time = random.uniform(*delay_range)
            print(f"Waiting {wait_time:.2f} seconds before next request...")
            time.sleep(wait_time)
            return True  # Successfully loaded

    print(f"Skipping {url} after {retries} failed attempts.")
    return False  # Indicate failure

# Crawl the listing pages one by one, collecting headline / link / date rows
# until the target count is reached, a page yields no articles, or too many
# pages in a row fail to load.

# Give up after this many listing pages fail back to back; otherwise a crawl
# that runs past the last page would keep timing out on new pages forever.
max_consecutive_failures = 10
consecutive_failures = 0

# Rows accepted from every page; the DataFrame is built once after the crawl
# instead of being re-concatenated on each page.
all_rows = []

try:
    while articles_scraped < total_articles_to_scrape:
        if consecutive_failures >= max_consecutive_failures:
            print(f"{consecutive_failures} consecutive pages failed to load. Stopping.")
            break

        print(f'Processing page: {current_page}')
        url = f'https://bandera.inquirer.net/balita/page/{current_page}'
        print(f'URL: {url}')

        # Use safe_get instead of driver.get
        if not safe_get(driver, url):
            consecutive_failures += 1
            current_page += 1  # Skip to next page if it fails
            continue

        # Wait for articles to load
        try:
            WebDriverWait(driver, 120).until(EC.visibility_of_element_located((By.ID, 'landing-main-default')))
        except Exception as e:
            print(f"Timeout error on page {current_page}. Skipping to next page.")
            consecutive_failures += 1
            current_page += 1
            continue  # Skip this page and move on

        consecutive_failures = 0  # The page loaded, so the failure streak is over

        # Get the page source and parse it
        source = driver.page_source
        soup = BeautifulSoup(source, 'html.parser')

        # Extract all articles; a missing container is treated as "no articles"
        container = soup.find('div', {'id': 'landing-main-default'})
        articles = container.find_all('div', {'id': 'gallery-box'}) if container is not None else []
        print(f'Found {len(articles)} articles on page {current_page}.')

        if not articles:
            print(f"No articles found on Page {current_page}. Stopping.")
            break  # Stop if no more articles are found

        rows = []
        for article in articles:
            if articles_scraped >= total_articles_to_scrape:
                print("Reached target article count. Stopping.")
                break

            try:
                # Extract headline, link, and date
                headline = article.find('div', {'id': 'story-info'}).find('div', {'id': 'headline'})
                Statement = headline.find('h2').text.strip()
                Link = article.find('a')['href'].strip()
                Date_str = headline.find('div', {'id': 'pdate'}).text.strip()

                # Parse the article date
                article_date = datetime.strptime(Date_str, "%B %d, %Y")

                # Only include articles dated in or before the cut-off month
                if article_date.year > start_year or (article_date.year == start_year and article_date.month > start_month):
                    print(f"Skipping future article: {Statement}, Date: {Date_str}")
                    continue

                print(f"Scraping article: {Statement}, Date: {Date_str}")
                rows.append({'Statement': Statement, 'Link': Link, 'Date': Date_str})
                articles_scraped += 1

            except Exception as e:
                # A malformed entry is reported and skipped; the rest of the page is still processed
                error_type, error_obj, error_info = sys.exc_info()
                print(f'ERROR FOR LINK: {url}')
                print(f'{error_type.__name__} occurred on Line {error_info.tb_lineno}: {e}')

        # Keep this page's rows for the final DataFrame
        all_rows.extend(rows)

        # Move to the next page
        current_page += 1

except Exception as e:
    error_type, error_obj, error_info = sys.exc_info()
    print(f'ERROR: {error_type.__name__} occurred on Line {error_info.tb_lineno}: {e}')

finally:
    driver.quit()

# Build the result once from everything collected (also after a failure above,
# so partial progress is still saved) and append it to any rows already present.
scraped_rows = pd.DataFrame(all_rows, columns=['Statement', 'Link', 'Date'])
non_empty_frames = [frame for frame in (banderaData, scraped_rows) if not frame.empty]
if non_empty_frames:
    banderaData = pd.concat(non_empty_frames, ignore_index=True)
else:
    banderaData = scraped_rows

# Drop duplicates and reset index
banderaData = banderaData.drop_duplicates().reset_index(drop=True)

# Save the file
filename = f"{dir}{site}_{datetime.today().strftime('%Y-%m-%d')}_NEWS_LinkList.csv"
banderaData.to_csv(filename, index=False)

# Display the first few rows of the data
print(banderaData.head())



<IPython.core.display.Javascript object>

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Scraping article: Ospital ng Sampaloc, binigyan ng 976 Avigan tablets ng DOH, Date: August 30, 2020
Processing page: 149
URL: https://bandera.inquirer.net/balita/page/149
Attempt 1/3 for URL: https://bandera.inquirer.net/balita/page/149
Waiting 36.02 seconds before next request...
Found 13 articles on page 149.
Scraping article: Pangulong Duterte, may virtual conference sa King of Jordan, Date: August 30, 2020
Scraping article: Repatriated Filipinos, umabot na sa 153,124, Date: August 30, 2020
Scraping article: P680,000 halaga ng shabu, nakumpiska sa Cebu, Date: August 30, 2020
Scraping article: 28 pang Filipino abroad, gumaling sa COVID-19, Date: August 29, 2020
Scraping article: COVID-19 cases sa Pilipinas, 213,131 na, Date: August 29, 2020
Scraping article: Global death toll dahil sa COVID-19, higit 841,000 na, Date: August 29, 2020
Scraping article: Cabangan, Zambales niyanig ng lindol, Date: August 29, 2020
Scraping 

### **Article Scraper**

In [None]:
# Smoke test of the article-page selectors against a single known article.

url = 'https://bandera.inquirer.net/393899/lpa-posibleng-mabuo-sa-bahagi-ng-mindanao-magpapaulan-sa-weekend'

soup = BeautifulSoup(extract_source(url).text, 'html.parser')

Title = soup.find('div',{'id':'landing-headline'}).find('h1').text.strip()
print(Title)

# Byline block: the first <span> is read as the author, the second as the date.
byline_spans = soup.find('div', {'id':'m-pd2'}).find_all('span')

# Drop the last four space-separated tokens that trail the author name.
Author = ' '.join(byline_spans[0].text.split(' ')[:-4]).strip()
print(Author)

# Applying the same "drop the last four tokens" rule to the date also removed
# the year (it printed "August 30,"), so take the full "Month D, YYYY" when it
# is present and only fall back to the token rule otherwise.
date_text = byline_spans[1].text
date_match = re.search(r'[A-Za-z]+\.?\s+\d{1,2},\s*\d{4}', date_text)
if date_match:
    Date = ' '.join(date_match.group(0).split())
else:
    Date = ' '.join(date_text.split(' ')[:-4]).strip()
print(Date)

# Article body: the text of every <p> in the content container, one per line.
p_tags = soup.find('div', {'id':'article-content'}).find_all('p')
body = "".join(p.text + "\n" for p in p_tags)
print(body)

LPA posibleng mabuo sa bahagi ng Mindanao, magpapaulan sa weekend
Pauline del Rosario
August 30,
PHOTO: Facebook/Dost_pagasa

PHOTO: Facebook/Dost_pagasa
KASALUKUYANG binabantayan ang “cloud clusters” o kumpul-kumpol na mga ulap na nasa silangan ng Mindanao.
Ayon kasi sa Philippine Atmospheric, Geophysical and Astronomical Services Administration (PAGASA), posible itong maging Low Pressure Area (LPA) at pumasok ng ating bansa.
“Hindi natin inaalis ang tiyansa na maging Low Pressure Area ito sa loob ng 24 oras at pumasok din ng ating PAR (Philippine Area of Responsibility),” sey ni PAGASA weather specialist Benison Estareja sa update kaninang umaga, August 30.
At dahil din diyan ay pwede itong maging sanhi ng mga ulan ngayong weekend.
“At possible, sa mga unang araw Setyembre, ay magdala ito ng mga pag-ulan sa ating bansa, lalo na sa Southern Luzon, Visayas and Mindanao,” dagdag ni Estareja.
Baka Bet Mo: JM nasa cloud 9 pa rin sa YES ni Donnalyn: I love you baby, palagi!
 
 
A post shar

In [None]:
# Matches a calendar date written as "Month D, YYYY" (e.g. "May 27, 2023").
_BANDERA_DATE_RE = re.compile(r'[A-Za-z]+\.?\s+\d{1,2},\s*\d{4}')


def _bandera_byline_text(soup, position):
    """Return the text of the byline <span> at ``position``, or '' when it is absent."""
    byline = soup.find('div', {'id': 'm-pd2'})
    if byline is None:
        return ''
    spans = byline.find_all('span')
    return spans[position].text if len(spans) > position else ''


def _drop_trailing_tokens(text, count=4):
    """Remove the last ``count`` space-separated tokens from ``text`` and strip the rest."""
    return ' '.join(text.split(' ')[:-count]).strip()


def _report_scrape_error(url, e):
    """Print the URL being scraped plus the type, line and message of the active exception."""
    error_type, error_obj, error_info = sys.exc_info()
    print(f'ERROR FOR LINK: {url}')
    print(f'{error_type.__name__} occurred on Line {error_info.tb_lineno}: {e}')


def banderaArticleScraper(url):
    """Scrape one Bandera article page.

    Args:
        url: Address of the article.

    Returns:
        A four-item list ``[Title, Author, Date, body]``. Author, Date and body
        fall back to ``''`` individually when their part of the page is missing
        or cannot be parsed. If the page cannot be fetched or has no title, the
        list is ``[None, None, None, None]``.
    """
    try:
        response = extract_source(url)
        if response is None:
            # The download failed and was already reported by extract_source.
            return [None, None, None, None]
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract Title (a page without one is treated as a failed scrape)
        Title = soup.find('div', {'id': 'landing-headline'}).find('h1').text.strip()

        # Extract Author: first byline span, minus its last four tokens
        try:
            Author = _drop_trailing_tokens(_bandera_byline_text(soup, 0))
        except Exception as e:
            _report_scrape_error(url, e)
            Author = ""

        # Extract Date: second byline span. Dropping the last four tokens here
        # also removed the year ("May 27,"), so the full "Month D, YYYY" is
        # preferred and the token rule is only the fallback.
        try:
            raw_date = _bandera_byline_text(soup, 1)
            date_match = _BANDERA_DATE_RE.search(raw_date)
            if date_match:
                Date = ' '.join(date_match.group(0).split())
            else:
                Date = _drop_trailing_tokens(raw_date)
        except Exception as e:
            _report_scrape_error(url, e)
            Date = ""

        # Extract Body: text of every <p> in the content container, one per line
        try:
            p_tags = soup.find('div', {'id': 'article-content'}).find_all('p')
            body = "".join(p.text + "\n" for p in p_tags)
        except Exception as e:
            _report_scrape_error(url, e)
            body = ""

        return [Title, Author, Date, body]

    except Exception as e:
        _report_scrape_error(url, e)
        return [None, None, None, None]  # Row of None values in case of a major error

# Scrape every article referenced in the saved link list and store the result as CSV.
site = "Bandera"
filename = f"{dir+site}_{date.today()}NEWS_duplicate.csv"

# Link list produced earlier by the site scraper (file from 2025-01-31).
df = pd.read_csv(f'{dir}{site}_2025-01-31_NEWS_LinkList.csv')

# One [Title, Author, Date, Text] row per link, in link-list order.
rows_list = []
for index, url in df["Link"].items():
    print(index, url)
    row = banderaArticleScraper(url)
    rows_list.append(row)

banderaData2 = pd.DataFrame(rows_list, columns=["Title", "Author", "Date", "Text"])

# Save file
banderaData2.to_csv(filename)
banderaData2.head()

0 https://bandera.inquirer.net/351128/bagyong-betty-nakapasok-na-sa-bansa-signal-no-1-posibleng-itaas-sa-northern-luzon-pagasa
1 https://bandera.inquirer.net/351086/super-typhoon-lalo-pang-lalakas-habang-papalapit-ng-bansa-pagasa
3 https://bandera.inquirer.net/350685/7-sugatan-p300m-halaga-ng-pinsala-sa-nasunog-na-manila-central-post-office-bfp
4 https://bandera.inquirer.net/350576/bsp-nagbabala-sa-modus-na-sangla-atm-mga-mga-cardholder-posibleng-maloko-sa-withdrawal
5 https://bandera.inquirer.net/350539/pagasa-ang-pagpasok-ng-bagyo-ay-posibleng-hudyat-ng-pagsisimula-ng-panahon-ng-tag-ulan
6 https://bandera.inquirer.net/350406/bagyo-posibleng-pumasok-ng-bansa-sa-mga-susunod-na-araw-tag-ulan-malapit-na-pagasa
7 https://bandera.inquirer.net/350198/pagsusuot-ng-face-mask-required-na-ulit-sa-baguio-city
8 https://bandera.inquirer.net/350188/sanggol-patay-matapos-banlian-ng-kumukulong-tubig-suspek-arestado
9 https://bandera.inquirer.net/349936/pagasa-heat-index-sa-13-na-lugar-umabot-na-sa-d

Unnamed: 0,Title,Author,Date,Text
0,"Bagyong Betty nakapasok na sa bansa, ‘Signal n...",Pauline del Rosario,"May 27,",PHOTO: Facebook/Dost_pagasa\n\nPHOTO: Facebook...
1,Super Typhoon lalo pang lalakas habang papalap...,Pauline del Rosario,"May 26,",PHOTO: Facebook/Dost_pagasa\n\nPHOTO: Facebook...
2,"Super Typhoon papasok na sa Biyernes o Sabado,...",Pauline del Rosario,"May 25,",PHOTO: Facebook/Dost_pagasa\n\nPHOTO: Facebook...
3,"7 sugatan, P300M halaga ng pinsala sa nasunog ...",Pauline del Rosario,"May 22,",INQUIRER photo\n\nINQUIRER photo\nPITO ang nai...
4,"BSP nagbabala sa modus na ‘sangla-ATM’, mga mg...",Pauline del Rosario,"May 21,",INQUIRER file photo\n\nINQUIRER file photo\nNA...
