## **Importing resources**

In [None]:
# Suppress FutureWarning messages to avoid cluttering the output
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Install the google-colab-selenium package to use Selenium within Google Colab
%pip install -q google-colab-selenium

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.3/486.3 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import google_colab_selenium as gs      # for setting up Selenium in Google Colab

import urllib.request                   # for opening and reading URLs
import requests                         # to make HTTP requests for retrieving web content
from bs4 import BeautifulSoup           # for parsing HTML and extracting data
import pandas as pd                     # for data manipulation in DataFrames

from datetime import date, datetime, timedelta               # to work with date objects
import time                             # for time-related functions like pausing execution
import sys                              # to manage system-specific parameters
import re                               # for working with regular expressions

import random

In [None]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

## **Mounting to Google Drive**

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Google Drive folder that receives the raw scraped CSV files. Keep the
# trailing '/': output filenames are built by plain string concatenation
# (f"{dir}{site}_...") rather than with a path-joining helper.
# NOTE(review): this name shadows Python's built-in dir(); it is kept because
# later cells reference `dir` directly.
dir = '/content/gdrive/MyDrive/CSCI 199/Methodology/Datasets/Raw/'

## **Extract Source Function**

In [None]:
# def extract_source(url):
#     agent = {"User-Agent":"Chrome/105.0.0.0"}
#     try:
#       source=requests.get(url, headers=agent)
#     except Exception as e:
#         error_type, error_obj, error_info = sys.exc_info()
#         print(f'ERROR FOR LINK: {url}')
#         print(f'{error_type.__name__} occurred on Line {error_info.tb_lineno}: {e}')

#     return source

# Custom function for Balita
def extract_source(url):
    """Fetch ``url`` with a browser-like User-Agent and a 10 second timeout.

    Args:
        url: Address of the page to download.

    Returns:
        The ``requests`` response object when the request succeeds with a
        non-error HTTP status; ``None`` when the server answers with an error
        status or when any other exception is raised while fetching. In both
        failure cases a one-line message is printed instead of raising.
    """
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
    }

    fetched = None  # stays None unless the request fully succeeds
    try:
        page = requests.get(url, headers=browser_headers, timeout=10)
        page.raise_for_status()  # turn 4xx/5xx answers into HTTPError
        fetched = page
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTPError for URL: {url} -> {http_err}")
    except Exception as err:
        # Connection problems, timeouts, redirect loops, etc.
        print(f"Error occurred for URL: {url} -> {err}")
    return fetched


# **News Scraping**

## **Manila Bulletin's Balita**

### **Site Scraper**

In [None]:
# --- Scrape configuration ---------------------------------------------------
site = "Balita"                    # label used as the output filename prefix
total_articles_to_scrape = 5000    # stop once this many headlines are collected
start_page = 1228                  # listing page number (?pgno=) to begin from

# --- Mutable crawl state ----------------------------------------------------
current_page = start_page          # listing page currently being processed
articles_scraped = 0               # running count of collected headlines

# Target table for the collected headlines (one row per article)
balitaData = pd.DataFrame(columns=['Statement', 'Link', 'Date'])

# Browser session used to render the listing pages
driver = gs.Chrome()

# Function to safely load pages with retry and delay logic
def safe_get(driver, url, retries=3, delay_range=(20, 40), backoff_jitter=(5, 15)):
    """Load ``url`` in ``driver``, retrying on failure and pausing between requests.

    Args:
        driver: Object exposing ``get(url)`` (the Selenium driver in this notebook).
        url: Address to load.
        retries: Maximum number of load attempts.
        delay_range: ``(low, high)`` bounds, in seconds, of the random pause taken
            after a successful load to throttle the crawl. Defaults to 20-40 s.
        backoff_jitter: ``(low, high)`` bounds, in seconds, of the random jitter
            added to the ``2 ** attempt`` back-off after a failed load.
            Defaults to 5-15 s.

    Returns:
        True if the page loaded within ``retries`` attempts, otherwise False.
        Exceptions raised by ``driver.get`` are caught and reported, not re-raised.
    """
    for attempt in range(retries):
        try:
            print(f"Attempt {attempt + 1}/{retries} for URL: {url}")
            driver.get(url)

            # Throttle: pause a random amount of time inside delay_range
            # before the caller is allowed to issue the next request.
            wait_time = random.uniform(*delay_range)
            print(f"Waiting {wait_time:.2f} seconds before next request...")
            time.sleep(wait_time)

            return True  # Successfully loaded
        except Exception as e:
            # Exponential back-off (1, 2, 4, ... seconds) plus random jitter.
            # The pause is also taken after the final failed attempt, so the
            # caller's next request is still delayed.
            wait_time = (2 ** attempt) + random.uniform(*backoff_jitter)
            print(f"Retry {attempt + 1}/{retries} failed: {e}. Retrying in {wait_time:.2f} sec...")
            time.sleep(wait_time)

    print(f"Skipping {url} after {retries} failed attempts.")
    return False  # Indicate failure

# Rows gathered across all pages. Kept outside the try block so that whatever
# was scraped before an unexpected error is still saved below.
scraped_rows = []

# Stop after this many listing pages in a row fail to load or render.
# Without a cap, running past the last available page (or a site outage)
# would make every subsequent page time out and the loop would never end.
max_consecutive_page_failures = 10
consecutive_page_failures = 0

try:
    while articles_scraped < total_articles_to_scrape:
        if consecutive_page_failures >= max_consecutive_page_failures:
            print(f"{consecutive_page_failures} consecutive pages failed. Stopping.")
            break

        print(f'Processing page: {current_page}')
        url = f'https://balita.mb.com.ph/morearticles/balita/?pgno={current_page}'
        print(f'URL: {url}')

        # Use safe_get instead of driver.get
        if not safe_get(driver, url):
            consecutive_page_failures += 1
            current_page += 1  # Skip to next page if it fails
            continue

        # Wait for articles to load
        try:
            WebDriverWait(driver, 120).until(EC.visibility_of_element_located((By.CLASS_NAME, 'item')))
        except Exception:
            print(f"Timeout error on page {current_page}. Skipping to next page.")
            consecutive_page_failures += 1
            current_page += 1
            continue  # Skip this page and move on

        consecutive_page_failures = 0  # the page loaded and rendered

        # Get the page source and parse it
        source = driver.page_source
        soup = BeautifulSoup(source, 'html.parser')

        # Extract all articles. A missing container is treated like an empty
        # page instead of raising AttributeError on None.
        container = soup.find('div', {'class': 'balita-load-more-articles'})
        articles = container.find_all('div', {'class': 'item'}) if container is not None else []
        print(f'Found {len(articles)} articles on page {current_page}.')

        if not articles:
            print(f"No articles found on Page {current_page}. Stopping.")
            break  # Stop if no more articles are found

        for article in articles:
            if articles_scraped >= total_articles_to_scrape:
                print("Reached target article count. Stopping.")
                break

            try:
                # Extract headline, link, and date
                Statement = article.find('div', {'class': 'item-content'}).find(
                    'a', {'class': 'item-title ellipsis-2 title-d'}).find('p').text.strip()
                Link = article.find('a', {'class': 'item-image'})['href'].strip()

                # Extract date from the <div class="date-item">
                date_element = article.find('div', {'class': 'date-item'})
                article_date = date_element.text.strip() if date_element else ""

                print(f"Scraping article: {Statement}, Date: {article_date}")
                scraped_rows.append({'Statement': Statement, 'Link': Link, 'Date': article_date})
                articles_scraped += 1

            except Exception as e:
                # A malformed article card is reported and skipped; the rest
                # of the page is still processed.
                error_type, error_obj, error_info = sys.exc_info()
                print(f'ERROR FOR LINK: {url}')
                print(f'{error_type.__name__} occurred on Line {error_info.tb_lineno}: {e}')

        # Move to the next page
        current_page += 1

except Exception as e:
    error_type, error_obj, error_info = sys.exc_info()
    print(f'ERROR: {error_type.__name__} occurred on Line {error_info.tb_lineno}: {e}')

finally:
    # Always release the browser session, even after an error.
    driver.quit()

# Build the table once from the collected rows (instead of concatenating a
# DataFrame per page), then drop duplicates and reset the index.
balitaData = (
    pd.DataFrame(scraped_rows, columns=['Statement', 'Link', 'Date'])
    .drop_duplicates()
    .reset_index(drop=True)
)

# Save the file
filename = f"{dir}{site}_{datetime.today().date()}_NEWS_LinkList.csv"
balitaData.to_csv(filename, index=False)

# Display the first few rows of the data
print(balitaData.head())


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Waiting 35.81 seconds before next request...
Found 10 articles on page 1397.
Scraping article: AiAi sa 9th Anniversary nila ni Gerald Sibayan: 'Totoo pala na may lalaking maayos', Date: April 13,  2023
Scraping article: Nawawalang sundalo sa nasunog na barko sa Basilan, natagpuang patay, Date: April 13,  2023
Scraping article: DOH: Omicron subvariant na XBB.1.9.1, natukoy na rin sa Pinas, Date: April 13,  2023
Scraping article: Pilot run ng NCR single ticketing system, sa Mayo 2 na!, Date: April 13,  2023
Scraping article: Hontiveros, pinuri ang pag-isyu ng arrest warrants vs Bantag, Zulueta, Date: April 13,  2023
Scraping article: Dahil sa insidente: Paglalagay ng platform barriers sa train stations, inirekomenda muli ng DOTr, Date: April 13,  2023
Scraping article: MMDA, namamahagi pa rin ng inuming tubig sa mga apektado ng Mindoro oil spill, Date: April 13,  2023
Scraping article: Power failure sa Baclaran Station: Ope

### **Article Scraper**

In [None]:
# Test article scraper: checks the article-page selectors against one known article.

url = 'https://balita.mb.com.ph/2024/09/05/pbbm-ipinagdiwang-pagbagal-ng-inflation-nitong-agosto-patuloy-ang-trabaho/'

response = extract_source(url)
if response is None:
    # extract_source returns None (after printing the reason) when the fetch
    # fails; bail out here instead of crashing on `None.text`.
    print(f"Could not fetch {url}; skipping the selector check.")
else:
    soup = BeautifulSoup(response.text, 'html.parser')

    Title = soup.find('h1', {'class': 'article-title'}).text.strip()
    print(Title)

    Author = soup.find('div', {'class': 'article-byline'}).find('a', class_='author-name').text.strip()
    print(Author)

    Date = soup.find('div', {'class': 'article-date'}).text.strip()
    print(Date)

    # Body text: one paragraph per line, each followed by a newline
    textList = soup.find('div', {'class': 'item-article-body'}).find_all('p', {'class': 'article-text'})
    body = "".join(t.text + "\n" for t in textList)
    print(body)

In [None]:
def _value_or(extract, fallback):
    """Return ``extract()``, or ``fallback`` if it raises AttributeError."""
    try:
        return extract()
    except AttributeError:
        return fallback


# Function to scrape articles
def balitaArticleScraper(url):
    """Scrape one article page into ``[title, author, date, body]``.

    Args:
        url: Address of the article page.

    Returns:
        A four-item list. Each field that cannot be located falls back to
        "Title Not Found", "Unknown", "Date Not Found" or "Body Not Found"
        respectively. If the page cannot be fetched, or an unexpected error
        occurs, four empty strings are returned (the error is printed).
    """
    row = ["", "", "", ""]  # returned unchanged when fetching or parsing fails
    try:
        response = extract_source(url)
        if response:
            soup = BeautifulSoup(response.text, 'html.parser')

            def read_body():
                container = soup.find('div', {'class': 'item-article-body'})
                # Prefer paragraphs tagged 'article-text'; otherwise take every <p>.
                paragraphs = (
                    container.find_all('p', {'class': 'article-text'})
                    or container.find_all('p')
                )
                return "\n".join(p.text.strip() for p in paragraphs)

            # A missing element shows up as AttributeError on None, which
            # _value_or converts into the per-field fallback text.
            row = [
                _value_or(
                    lambda: soup.find('h1', {'class': 'article-title'}).text.strip(),
                    "Title Not Found",
                ),
                _value_or(
                    lambda: soup.find('div', {'class': 'article-byline'})
                    .find('a', class_='author-name').text.strip(),
                    "Unknown",
                ),
                _value_or(
                    lambda: soup.find('div', {'class': 'article-date'}).text.strip(),
                    "Date Not Found",
                ),
                _value_or(read_body, "Body Not Found"),
            ]

    except Exception as e:
        error_type, error_obj, error_info = sys.exc_info()
        print(f"ERROR FOR LINK: {url}")
        print(f"{error_type.__name__} occurred on Line {error_info.tb_lineno}: {e}")

    return row

# --- Locations ----------------------------------------------------------------
site = "Balita"
dir = '/content/gdrive/MyDrive/CSCI 199/Methodology/Datasets/Raw/'  # Adjust the directory as needed

# Output file for the scraped articles, and the link list (stamped with
# today's date) that supplies the article URLs.
filename = f"{dir}{site}_{date.today()}_NEWS.csv"
linklist_filename = f"{dir}{site}_{date.today()}_NEWS_LinkList.csv"

# --- Scrape every linked article ---------------------------------------------
df = pd.read_csv(linklist_filename)

rows_list = []  # one [Title, Author, Date, Text] entry per link, in link order
for position, url in enumerate(df["Link"], start=1):
    print(f"Scraping article {position}: {url}")
    rows_list.append(balitaArticleScraper(url))

# --- Persist and preview -------------------------------------------------------
balitaData2 = pd.DataFrame(rows_list, columns=["Title", "Author", "Date", "Text"])
balitaData2.to_csv(filename, index=False)
print(f"Scraped data saved to {filename}")

print(balitaData2.head())

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Scraping article 331: https://balita.mb.com.ph/2023/05/22/mukantanga-lang-bianca-manalo-may-sagot-sa-basher-ng-pagsayaw/
Scraping article 332: https://balita.mb.com.ph/2023/05/22/lto-chief-tugade-nagbitiw-sa-puwesto/
Scraping article 333: https://balita.mb.com.ph/2023/05/22/wala-pa-rin-%e2%82%b1149m-jackpot-prize-ng-ultra-lotto-6-58-di-napanalunan/
Error occurred for URL: https://balita.mb.com.ph/2023/05/22/wala-pa-rin-%e2%82%b1149m-jackpot-prize-ng-ultra-lotto-6-58-di-napanalunan/ -> Exceeded 30 redirects.
Scraping article 334: https://balita.mb.com.ph/2023/05/22/14-anyos-na-lalaki-patay-matapos-mabangga-ng-tanker/
Scraping article 335: https://balita.mb.com.ph/2023/05/22/salute-maam-guro-sa-negros-occidental-tuloy-sa-pagtuturo-kahit-karga-ang-anak/
Scraping article 336: https://balita.mb.com.ph/2023/05/22/buboy-papalitan-muna-si-boobay-sa-tbats/
Scraping article 337: https://balita.mb.com.ph/2023/05/22/2-pagyanig-9-rock