In [None]:
!pip install BeautifulSoup4

In [None]:
import os
import json
import time
import random
import zipfile
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Class Explanation: `NewsScraper`

## Overview
The `NewsScraper` class is designed for scraping news articles from four different Urdu news websites: Geo, Jang, Express, and Samaa. The class has one method per site, each catering to that site's unique structure and requirements. Below, we go through the class and its methods, detailing what each function does, the input it takes, and the output it returns.

## Class Definition

```python
class NewsScraper:
    def __init__(self, id_=0):
        self.id = id_
```


## Method 1: `get_express_articles`

### Description
Scrapes news articles from the Express website across categories like saqafat (entertainment), business, sports, science-technology, and world. The method navigates through multiple pages for each category to gather a more extensive dataset.

### Input
- **`max_pages`**: The number of pages to scrape for each category (default is 7).

### Process
- Iterates over each category and page.
- Requests each category page and finds article cards within `<ul class='tedit-shortnews listing-page'>`.
- Extracts the article's headline, link, and content by navigating through `<div class='horiz-news3-caption'>` and `<span class='story-text'>`.

### Output
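- **Returns**: A Pandas DataFrame with one row per scraped article and the columns `id`, `title`, `link`, `content`, and `gold_label`.
  - `content` holds the article text (the non-empty paragraphs joined with single spaces).
  - `gold_label` holds the category name, with `saqafat` mapped to `entertainment` and `science` mapped to `science-technology`.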

### Data Structure
- Article cards are identified by `<li>` tags.
- Content is structured within `<span class='story-text'>` and `<p>` tags.



In [None]:
class NewsScraper:
    """Scrape Urdu news articles from the Express, Geo, Jang and Samaa sites.

    Each public ``get_*_articles`` method returns a pandas DataFrame with the
    columns ``id``, ``title``, ``link``, ``content`` and ``gold_label``.
    ``self.id`` is a running counter that every method advances once per
    stored article, so ids stay unique when one instance scrapes several
    sites in a row.

    Listing pages and articles that cannot be fetched or parsed are reported
    with ``print`` and skipped, so one bad page does not discard the rows
    collected so far.
    """

    # Column order shared by every returned DataFrame.
    COLUMNS = ("id", "title", "link", "content", "gold_label")

    # Upper bound (seconds) for a single HTTP request. ``requests`` waits
    # indefinitely by default, so one stalled connection would hang the run.
    REQUEST_TIMEOUT = 30

    # Express category slug -> gold label ('saqafat' is the entertainment
    # section of the site).
    _EXPRESS_LABELS = {
        'saqafat': 'entertainment',
        'business': 'business',
        'sports': 'sports',
        'science': 'science-technology',
        'world': 'world',
    }

    # Geo category slugs; they already match the gold labels.
    _GEO_CATEGORIES = (
        "entertainment",
        "business",
        "sports",
        "science-technology",
        "world",
    )

    # Jang category -> listing URL. The science entry is served through a
    # separate "load more" endpoint, see _JANG_SCIENCE_FEED.
    _JANG_CATEGORY_URLS = {
        "entertainment": "https://jang.com.pk/category/latest-news/entertainment",
        "business": "https://jang.com.pk/category/latest-news/business",
        "sports": "https://jang.com.pk/category/latest-news/sports",
        "science-and-technology": "https://jang.com.pk/category/magazine/science-and-technology",
        "world": "https://jang.com.pk/category/latest-news/world",
    }
    _JANG_SCIENCE_FEED = "https://jang.com.pk/category/load_more_subcategories?category_id=58&parent_slug=magazine&child_slug=science-and-technology"
    _JANG_CONTENT_SELECTOR = 'body > section > div.container > div.detail-right > div.detail-content > div.description-area > div.detail_view_content'

    # Samaa category slug -> gold label. 'global' maps to 'world' so that the
    # label set matches the other three sites.
    _SAMAA_LABELS = {
        'lifestyle': 'entertainment',
        'money': 'business',
        'sports': 'sports',
        'tech': 'science-technology',
        'global': 'world',
    }

    def __init__(self, id_=0):
        """Create a scraper whose first stored article receives id ``id_``."""
        self.id = id_

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------

    @classmethod
    def _new_table(cls):
        """Return an empty column-name -> list mapping for collecting rows."""
        return {column: [] for column in cls.COLUMNS}

    @staticmethod
    def _clean_text(text):
        """Replace non-breaking spaces with spaces and drop zero-width spaces."""
        return text.replace('\xa0', ' ').replace('\u200b', '')

    @classmethod
    def _join_paragraphs(cls, paras):
        """Join the text of ``paras`` with single spaces.

        Paragraphs whose whitespace-stripped text is empty are skipped; the
        remaining texts are cleaned with ``_clean_text``.
        """
        stripped = (p.get_text(strip=True) for p in paras)
        return " ".join(cls._clean_text(text) for text in stripped if text)

    def _get_soup(self, url, session=None):
        """Fetch ``url`` and return its parsed HTML.

        Uses ``session`` when given, otherwise a plain ``requests.get``.
        Raises whatever ``requests`` raises, including ``HTTPError`` for a
        non-success status code.
        """
        client = requests if session is None else session
        response = client.get(url, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        return BeautifulSoup(response.text, "html.parser")

    def _append_row(self, table, title, link, content, gold_label):
        """Append one article to ``table`` and advance the id counter."""
        table["id"].append(self.id)
        table["title"].append(title)
        table["link"].append(link)
        table["content"].append(content)
        table["gold_label"].append(gold_label)
        self.id += 1

    # ------------------------------------------------------------------
    # Express
    # ------------------------------------------------------------------

    def get_express_articles(self, max_pages=7):
        """Scrape ``max_pages`` archive pages per category from express.pk.

        Args:
            max_pages: Number of archive pages to request for each category.

        Returns:
            DataFrame with the columns listed in ``COLUMNS``.
        """
        express_df = self._new_table()
        base_url = 'https://www.express.pk'

        for category, gold_label in self._EXPRESS_LABELS.items():
            for page in range(1, max_pages + 1):
                print(f"Scraping page {page} of category '{category}'...")
                url = f"{base_url}/{category}/archives?page={page}"
                try:
                    soup = self._get_soup(url)
                except Exception as e:
                    # Skip this page only; rows gathered so far are kept.
                    print(f"\t--> Failed to fetch page {page} of '{category}': {e}")
                    continue

                # Article cards are the <li> children of the listing <ul>;
                # a page without that <ul> simply yields no cards.
                listing = soup.find('ul', class_='tedit-shortnews listing-page')
                cards = listing.find_all('li') if listing is not None else []
                print(f"\t--> Found {len(cards)} articles on page {page} of '{category}'.")

                success_count = 0
                for card in cards:
                    try:
                        anchor = card.find('div', class_='horiz-news3-caption').find('a')
                        headline = anchor.get_text(strip=True).replace('\xa0', ' ')
                        link = anchor['href']

                        # The article body is a run of <p> inside
                        # <span class="story-text">.
                        content_soup = self._get_soup(link)
                        paras = content_soup.find('span', class_='story-text').find_all('p')
                        combined_text = self._join_paragraphs(paras)

                        self._append_row(express_df, headline, link, combined_text, gold_label)
                        success_count += 1
                    except Exception as e:
                        print(f"\t--> Failed to scrape an article on page {page} of '{category}': {e}")

                print(f"\t--> Successfully scraped {success_count} articles from page {page} of '{category}'.")
            print('')

        return pd.DataFrame(express_df)

    # ------------------------------------------------------------------
    # Geo
    # ------------------------------------------------------------------

    def get_geo_articles(self, max_pages=3):
        """Scrape ``max_pages`` listing pages per category from urdu.geo.tv.

        The default of 3 comes from the original author's note that a page
        holds about 60 articles, i.e. roughly 180 articles per category.

        Args:
            max_pages: Number of listing pages to request for each category.

        Returns:
            DataFrame with the columns listed in ``COLUMNS``.
        """
        geo_df = self._new_table()
        base_url = "https://urdu.geo.tv/"

        for category in self._GEO_CATEGORIES:
            for page in range(1, max_pages + 1):
                print(f"Scraping page {page} of category '{category}'...")
                url = f"{base_url}category/{category}/{page}"
                try:
                    soup = self._get_soup(url)
                except Exception as e:
                    # Skip this page only; rows gathered so far are kept.
                    print(f"\t--> Failed to fetch page {page} of '{category}': {e}")
                    continue

                # Each article is an <a class="open-section"> carrying the
                # headline in its title attribute.
                cards = soup.find_all("a", class_="open-section")
                print(f"\t--> Found {len(cards)} articles on page {page} of '{category}'.")

                success_count = 0
                for card in cards:
                    try:
                        headline = card.get("title", "").strip()
                        link = card["href"]

                        # The article body is a run of <p> inside
                        # <div class="content-area">.
                        content_soup = self._get_soup(link)
                        paras = content_soup.find("div", class_="content-area").find_all("p")
                        combined_text = self._join_paragraphs(paras)

                        # Geo slugs already equal the gold labels.
                        self._append_row(geo_df, headline, link, combined_text, category)
                        success_count += 1
                    except Exception as e:
                        print(f"\t--> Failed to scrape an article on page {page} of '{category}': {e}")

                print(f"\t--> Successfully scraped {success_count} articles from page {page} of '{category}'.")
                print("")

        return pd.DataFrame(geo_df)

    # ------------------------------------------------------------------
    # Jang
    # ------------------------------------------------------------------

    def get_jang_articles(self, max_pages=7):
        """Scrape jang.com.pk.

        ``max_pages`` only bounds the science-and-technology feed, which is
        requested with ``offset`` 0 .. ``max_pages - 1``. Every other category
        is scraped from its first listing page only. Articles with an empty
        title, link or body are skipped, and commas are removed from the
        stored content.

        Args:
            max_pages: Number of offsets to request from the science feed.

        Returns:
            DataFrame with the columns listed in ``COLUMNS``.
        """
        jang_df = self._new_table()

        for category, cat_url in self._JANG_CATEGORY_URLS.items():
            print(f"Scraping category '{category}'...")

            if category == "science-and-technology":
                success_count = self._scrape_jang_science(jang_df, max_pages)
                print("Completed scraping for category 'science-and-technology'.")
                print(f"\t--> Successfully scraped {success_count} articles of '{category}'.")
                continue

            try:
                soup = self._get_soup(cat_url)
            except Exception as e:
                print(f"\t--> Failed to fetch category '{category}': {e}")
                continue

            articles = soup.select('ul.scrollPaginationNew__ > li')
            print(f"\t--> Found {len(articles)} articles for '{category}'.")

            success_count = self._store_jang_items(
                jang_df,
                articles,
                title_css='.main-heading h2',
                link_css='.main-heading a',
                gold_label=category,
                where=f"of '{category}'",
            )
            print(f"\t--> Successfully scraped {success_count} articles of '{category}'.")
            print('')

        jang_df['content'] = [content.replace(',', '') for content in jang_df['content']]
        return pd.DataFrame(jang_df)

    def _scrape_jang_science(self, table, max_pages):
        """Walk the science feed offsets and return the number of rows stored.

        Stops early when an offset cannot be fetched or returns no <li>.
        """
        stored = 0
        for offset in range(max_pages):
            print(f"Scraping offset: {offset} for 'science-and-technology'...")
            try:
                soup = self._get_soup(f"{self._JANG_SCIENCE_FEED}&offset={offset}")
            except Exception as e:
                print(f"\t--> Failed to fetch articles at offset {offset}: {e}")
                break

            items = soup.select("li")
            if not items:
                print(f"\t--> No articles found at offset {offset}. Stopping.")
                break

            stored += self._store_jang_items(
                table,
                items,
                title_css='.main-heading h3',
                link_css='a',  # first <a> inside the item
                gold_label="science-technology",
                where=f"at offset {offset}",
            )
        return stored

    def _store_jang_items(self, table, items, title_css, link_css, gold_label, where):
        """Fetch and store each listing item; return how many were stored.

        ``where`` is only used to word the failure message.
        """
        stored = 0
        for item in items:
            try:
                title_tag = item.select_one(title_css)
                link_tag = item.select_one(link_css)
                if title_tag is None or link_tag is None:
                    continue

                title = title_tag.text.strip()
                link = link_tag["href"]
                if not title or not link:
                    # Would be discarded anyway; skip before the request.
                    continue

                content_soup = self._get_soup(link)
                paras = content_soup.select(f"{self._JANG_CONTENT_SELECTOR} p")
                combined_text = self._join_paragraphs(paras)
                if not combined_text:
                    continue

                self._append_row(table, title, link, combined_text, gold_label)
                stored += 1
            except Exception as e:
                print(f"\t--> Failed to scrape an article {where}: {e}")
        return stored

    # ------------------------------------------------------------------
    # Samaa
    # ------------------------------------------------------------------

    def get_samaa_articles(self, max_pages=7):
        """Scrape ``max_pages`` listing pages per category from urdu.samaa.tv.

        Requests go through one session with browser-like headers, and the
        method sleeps 0.5-1.5 s after each stored article.

        Args:
            max_pages: Number of listing pages to request for each category.

        Returns:
            DataFrame with the columns listed in ``COLUMNS``.
        """
        samaa_df = self._new_table()
        base_url = 'https://urdu.samaa.tv'

        # Headers that mimic a real browser.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/115.0.0.0 Safari/537.36',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Referer': base_url
        }

        # The context manager closes the session's connections on exit.
        with requests.Session() as session:
            session.headers.update(headers)

            for category, gold_label in self._SAMAA_LABELS.items():
                for page in range(1, max_pages + 1):
                    print(f"Scraping page {page} of category '{category}'...")
                    # The first page has no ?page= query string.
                    if page == 1:
                        url = f"{base_url}/{category}"
                    else:
                        url = f"{base_url}/{category}?page={page}"

                    try:
                        soup = self._get_soup(url, session=session)
                    except requests.exceptions.HTTPError as http_err:
                        print(f"\t--> HTTP error occurred: {http_err} for URL: {url}")
                        continue
                    except Exception as err:
                        print(f"\t--> Other error occurred: {err} for URL: {url}")
                        continue

                    articles = soup.find_all('article', class_='story-article')
                    print(f"\t--> Found {len(articles)} articles on page {page} of '{category}' at {url}.")

                    success_count = 0
                    for article in articles:
                        try:
                            a_tag = article.find('a')
                            if a_tag is None:
                                continue

                            # The headline is taken from the alt text of the
                            # image inside the link.
                            img_tag = a_tag.find('img', alt=True)
                            if img_tag is None:
                                print("\t--> <img> tag with 'alt' attribute not found in <a> tag; skipping.")
                                continue
                            title = img_tag['alt'].strip().replace('\xa0', ' ')

                            link = a_tag['href']
                            if not link.startswith('http'):
                                link = base_url + link

                            article_soup = self._get_soup(link, session=session)
                            content_div = article_soup.find('div', class_='article-content')
                            if content_div is None:
                                raise ValueError("Content div not found")
                            combined_text = self._join_paragraphs(content_div.find_all('p'))

                            self._append_row(samaa_df, title, link, combined_text, gold_label)
                            success_count += 1

                            # Be polite to the server between article requests.
                            time.sleep(random.uniform(0.5, 1.5))
                        except Exception as e:
                            print(f"\t--> Failed to scrape an article on page {page} of '{category}', Error: {e}")

                    print(f"\t--> Successfully scraped {success_count} articles from page {page} of '{category}'.")
                print('')  # blank line between categories

        return pd.DataFrame(samaa_df)
    

In [None]:
# One shared scraper for all sites, so article ids continue across them.
scraper = NewsScraper(id_=0)

### Create DFs

In [None]:
# A single page per category keeps this cell a quick test run. For the real
# scrape we used between 5 and 10 pages, depending on how many articles each
# publisher shows on one page.
PAGES_PER_CATEGORY = 1

print("Now scraping Express: ")
express_df = scraper.get_express_articles(max_pages=PAGES_PER_CATEGORY)

print("Now scraping Geo: ")
geo_df = scraper.get_geo_articles(max_pages=PAGES_PER_CATEGORY)

print("Now scraping Jang: ")
jang_df = scraper.get_jang_articles(max_pages=PAGES_PER_CATEGORY)

print("Now scraping Samaa: ")
samaa_df = scraper.get_samaa_articles(max_pages=PAGES_PER_CATEGORY)

print("Scraping complete. ")

# Output
- Save a combined CSV of all 4 sites.

In [None]:
# Stack the per-site frames into one table. ignore_index=True gives the
# combined frame a fresh row index; the scraper-assigned `id` column is kept.
df_list = [express_df, geo_df, jang_df, samaa_df]
combined_df = pd.concat(df_list, ignore_index=True)

# The file name and the message below both say CSV, so write it with to_csv
# (to_excel writes Excel workbooks, not CSV).
combined_df.to_csv('all_data.csv', index=False)
print("All data has been saved as all_data.csv")