In [2]:
# %pip install beautifulsoup4   # the PyPI package that provides `bs4` ("BeautifulSoup" is the legacy v3 package)

In [1]:
import os
import json
import time
import random
import zipfile
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Class Explanation: `NewsScraper`

## Overview
The `NewsScraper` class is designed for scraping news articles from three different Urdu news websites: Geo, Jang, and Express. The class has methods that cater to each site's unique structure and requirements. Below, we will go through the class and its methods, detailing what each function does, the input it takes, and the output it returns.

## Class Definition

```python
class NewsScraper:
    def __init__(self, id_=0):
        self.id = id_
```


## Method 1: `get_express_articles`

### Description
Scrapes news articles from the Express website across the categories saqafat (stored under the label `entertainment`), business, sports, science (stored under the label `science-technology`), and world. The method navigates through multiple pages for each category to gather a more extensive dataset.

### Input
- **`max_pages`**: The number of pages to scrape for each category (default is 7).

### Process
- Iterates over each category and page.
- Requests each category page and finds article cards within `<ul class='tedit-shortnews listing-page'>`.
- Extracts the article's headline, link, and content by navigating through `<div class='horiz-news3-caption'>` and `<span class='story-text'>`.

### Output
- **Returns**: A Pandas DataFrame with one row per successfully scraped article and the columns:
  - `id`, `title`, and `link`,
  - `content` (the article text, joined from its paragraphs) and `gold_label` (the article's category).

### Data Structure
- Article cards are identified by `<li>` tags.
- Content is structured within `<span class='story-text'>` and `<p>` tags.



In [4]:
class NewsScraper:
    """Scraper for Urdu news articles from the Express, Geo and Jang websites.

    Every ``get_*`` method returns a DataFrame whose columns are listed in
    ``COLUMNS``. Article ids are taken from ``self.id``, which is incremented
    after each stored article, so ids stay unique across sites as long as the
    same instance is reused.

    Failure handling:
      * A listing page that cannot be downloaded, or that lacks the expected
        article container, is logged and skipped so that articles collected so
        far are not lost.
      * An individual article that cannot be downloaded or parsed is logged
        and skipped (best effort).
    """

    # Seconds to wait for a single HTTP response. Without a timeout a stalled
    # connection would block the scrape indefinitely.
    REQUEST_TIMEOUT = 30

    # Column order of every DataFrame returned by the scraping methods.
    COLUMNS = ("id", "title", "link", "content", "gold_label")

    # Express uses its own category slugs; map them onto the labels used for
    # the other two sites. Slugs that are not listed are used as-is.
    EXPRESS_LABELS = {"saqafat": "entertainment", "science": "science-technology"}

    def __init__(self, id_=0):
        """Create a scraper whose first stored article gets the id ``id_``."""
        self.id = id_

    # ------------------------------------------------------------------ #
    # Shared helpers
    # ------------------------------------------------------------------ #

    def _fetch_soup(self, url):
        """Download ``url`` and return the page parsed with ``html.parser``.

        Raises whatever ``requests`` raises on connection errors, timeouts or
        non-success status codes (via ``raise_for_status``).
        """
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        return BeautifulSoup(response.text, "html.parser")

    def _fetch_listing(self, url):
        """Fetch a listing page; log and return ``None`` if the request fails."""
        try:
            return self._fetch_soup(url)
        except requests.RequestException as e:
            print(f"\t--> Failed to load listing page {url}: {e}")
            return None

    @staticmethod
    def _join_paragraphs(paras):
        """Join the non-empty paragraph texts with single spaces.

        Non-breaking spaces become regular spaces and zero-width spaces are
        removed.
        """
        texts = (p.get_text(strip=True) for p in paras)
        return " ".join(
            text.replace("\xa0", " ").replace("\u200b", "")
            for text in texts
            if text
        )

    def _add_article(self, records, title, link, content, label):
        """Append one article to ``records`` and advance the id counter.

        Called only after every fallible step has succeeded, so all columns
        always end up with the same length.
        """
        records["id"].append(self.id)
        records["title"].append(title)
        records["link"].append(link)
        records["content"].append(content)
        records["gold_label"].append(label)
        self.id += 1

    # ------------------------------------------------------------------ #
    # Site-specific scrapers
    # ------------------------------------------------------------------ #

    def get_express_articles(self, max_pages=7):
        """Scrape the Express archive pages of every category.

        Args:
            max_pages: Number of archive pages to visit per category.

        Returns:
            DataFrame with the columns in ``COLUMNS``; ``gold_label`` holds the
            category translated through ``EXPRESS_LABELS``.
        """
        records = {column: [] for column in self.COLUMNS}
        base_url = 'https://www.express.pk'
        categories = ['saqafat', 'business', 'sports', 'science', 'world']   # saqafat is entertainment category

        for category in categories:
            label = self.EXPRESS_LABELS.get(category, category)

            for page in range(1, max_pages + 1):
                print(f"Scraping page {page} of category '{category}'...")
                soup = self._fetch_listing(f"{base_url}/{category}/archives?page={page}")
                if soup is None:
                    continue

                # Article cards are the <li> items of the listing <ul>.
                listing = soup.find('ul', class_='tedit-shortnews listing-page')
                if listing is None:
                    print(f"\t--> No article list found on page {page} of '{category}'; skipping.")
                    continue
                cards = listing.find_all('li')
                print(f"\t--> Found {len(cards)} articles on page {page} of '{category}'.")

                success_count = 0
                for card in cards:
                    try:
                        # Headline text and link live on the same <a> tag.
                        anchor = card.find('div', class_='horiz-news3-caption').find('a')
                        headline = anchor.get_text(strip=True).replace('\xa0', ' ')
                        link = anchor['href']

                        # Article body: <p> tags inside <span class='story-text'>.
                        content_soup = self._fetch_soup(link)
                        paras = content_soup.find('span', class_='story-text').find_all('p')

                        self._add_article(
                            records, headline, link, self._join_paragraphs(paras), label
                        )
                        success_count += 1

                    except Exception as e:
                        # Best effort: one broken article must not stop the page.
                        print(f"\t--> Failed to scrape an article on page {page} of '{category}': {e}")

                print(f"\t--> Successfully scraped {success_count} articles from page {page} of '{category}'.")
            print('')

        return pd.DataFrame(records)

    def get_geo_article(self):
        """Scrape the first listing page of every Geo Urdu category.

        Returns:
            DataFrame with the columns in ``COLUMNS``; ``gold_label`` is the
            category name.
        """
        records = {column: [] for column in self.COLUMNS}
        # No trailing slash here: the URL template below supplies it, which
        # avoids requesting "https://urdu.geo.tv//category/...".
        base_url = "https://urdu.geo.tv"
        categories = [
            "entertainment",
            "business",
            "sports",
            "science-technology",
            "world",
        ]

        for category in categories:
            print(f"Scraping category '{category}'...")
            soup = self._fetch_listing(f"{base_url}/category/{category}/")
            if soup is None:
                continue

            # Each article card is a <div> carrying this exact class list.
            cards = soup.find_all(
                "div", class_="col-xs-6 col-sm-6 col-lg-6 col-md-6 singleBlock"
            )
            print(f"\t--> Found {len(cards)} articles of '{category}'.")
            success_count = 0

            for card in cards:
                try:
                    # Title and link are attributes of the nested <a> tag.
                    link_tag = card.find("a", class_="open-section")
                    headline = link_tag["title"].strip()
                    link = link_tag["href"].strip()

                    # Article body: <p> tags inside <div class='content-area'>.
                    content_soup = self._fetch_soup(link)
                    paras = content_soup.find("div", class_="content-area").find_all("p")

                    self._add_article(
                        records, headline, link, self._join_paragraphs(paras), category
                    )
                    success_count += 1

                except Exception as e:
                    # Best effort: one broken article must not stop the category.
                    print(f"\t--> Failed to scrape an article of '{category}': {e}")

            print(
                f"\t--> Successfully scraped {success_count} articles from'{category}'."
            )
        return pd.DataFrame(records)

    def get_jang_article(self):
        """Scrape the first listing page of every Jang category.

        Articles without a headline or body are kept with the placeholders
        ``"No Title"`` / ``"No Content"``.

        Returns:
            DataFrame with the columns in ``COLUMNS``; ``gold_label`` is the
            category name.
        """
        records = {column: [] for column in self.COLUMNS}
        base_url = "https://jang.com.pk/category/latest-news/"
        categories = [
            "sports",
            "business",
            "entertainment",
            "world",
            "science-technology",
        ]

        for category in categories:
            print(f"Scraping category '{category}'...")
            soup = self._fetch_listing(f"{base_url}{category}/")
            if soup is None:
                continue

            # Articles are the <li> items of the paginated <ul>.
            listing = soup.find("ul", class_="scrollPaginationNew__")
            if listing is None:
                print(f"\t--> No article list found for '{category}'; skipping.")
                continue
            articles = listing.find_all("li")
            print(f"\t--> Found {len(articles)} articles in '{category}' category.")
            success_count = 0

            for article in articles:
                try:
                    # List items without a link are not articles.
                    link_tag = article.find("a")
                    if not link_tag:
                        continue
                    link = link_tag["href"].strip()

                    # Headline: <h2> inside <div class='main-heading'> of the card.
                    main_heading = article.find("div", class_="main-heading")
                    title_tag = main_heading.find("h2") if main_heading else None
                    headline = title_tag.get_text(strip=True) if title_tag else "No Title"

                    # Article body: <p> tags inside <div class='detail_view_content'>.
                    content_soup = self._fetch_soup(link)
                    content_div = content_soup.find("div", class_="detail_view_content")
                    if content_div:
                        combined_text = self._join_paragraphs(content_div.find_all("p"))
                    else:
                        combined_text = "No Content"

                    self._add_article(records, headline, link, combined_text, category)
                    success_count += 1

                except Exception as e:
                    # Best effort: one broken article must not stop the category.
                    print(f"\t--> Failed to scrape an article of '{category}': {e}")

            print(f"\t--> Successfully scraped {success_count} articles from '{category}'.")
        return pd.DataFrame(records)

In [5]:
# One scraper instance is shared by all three sites: its `id` counter keeps
# increasing across the calls, so article ids do not repeat between sites.
scraper = NewsScraper()

In [6]:
# Scrape the three sites in a fixed order (Express, Geo, Jang); the shared
# scraper hands out consecutive ids in that order.
express_df, geo_df, jang_df = (
    scraper.get_express_articles(),
    scraper.get_geo_article(),
    scraper.get_jang_article(),
)

Scraping page 1 of category 'saqafat'...
	--> Found 10 articles on page 1 of 'saqafat'.
	--> Failed to scrape an article on page 1 of 'saqafat': 404 Client Error: Not Found for url: https://www.express.pk/story/2735209/akshaykumar-ki-film-housefull5-ki-shooting-iktitami-marahil-main-dakhil-2735209
	--> Failed to scrape an article on page 1 of 'saqafat': 404 Client Error: Not Found for url: https://www.express.pk/story/2735208/amirkhan-ke-mashware-par-lahore1947-main-emotional-seens-shamil-2735208
	--> Failed to scrape an article on page 1 of 'saqafat': 404 Client Error: Not Found for url: https://www.express.pk/story/2735207/rashmika-mandana-ka-film-pushpa2-ke-set-ko-emotional-alwida-2735207
	--> Failed to scrape an article on page 1 of 'saqafat': 404 Client Error: Not Found for url: https://www.express.pk/story/2735196/napa-ka-bara-elan-radiodrama-ki-riwayat-ko-zinda-kia-jayega-2735196
	--> Successfully scraped 6 articles from page 1 of 'saqafat'.
Scraping page 2 of category 'saqafat'

# Output
- Save a combined csv of all 3 sites.

In [7]:

# Stack the per-site frames into one dataset with a fresh 0..n-1 index.
combined_df = pd.concat([express_df, geo_df, jang_df], axis=0, ignore_index=True)

# Write one CSV per site plus the combined file, in this order.
csv_outputs = {
    'jang_articles.csv': jang_df,
    'geo_articles.csv': geo_df,
    'express_articles.csv': express_df,
    'urdu_articles.csv': combined_df,
}
for csv_name, frame in csv_outputs.items():
    frame.to_csv(csv_name, index=False)
    print(f'Data saved to {csv_name}')

combined_df.head()


Data saved to jang_articles.csv
Data saved to geo_articles.csv
Data saved to express_articles.csv
Data saved to urdu_articles.csv


Unnamed: 0,id,title,link,content,gold_label
0,0,'باہوبلی 2' کے اداکار 47 سال کی عمر میں شادی ک...,https://www.express.pk/story/2735199/bahubali2...,بالی ووڈ اور ساؤتھ انڈین فلم انڈسٹری کے معروف ...,entertainment
1,1,نیٹ فلکس سیریز تنازع : دھنش نے نینتھارا کے خلا...,https://www.express.pk/story/2735191/netflixse...,تامل فلم انڈسٹری کے دو مشہور ستاروں، دھنش اور ...,entertainment
2,2,گلیڈی ایٹر 2' نے باکس آفس پر تہلکہ مچایا، 106 ...,https://www.express.pk/story/2735188/gladiator...,"ہالی وڈ کی دو بڑی فلموں، میوزیکل ایڈاپٹیشن ""وِ...",entertainment
3,3,والد چنکی پانڈے طویل عرصے تک بےروزگار رہے، انن...,https://www.express.pk/story/2735173/chinkypan...,ممبئی - بالی وڈ اداکارہ اننیا پانڈے نے انکشاف ...,entertainment
4,4,رشی کپور نے فلم ’برفی‘ پر بیٹے رنبیر کو کیا مش...,https://www.express.pk/story/2735167/rishikapo...,ممبئی - آنجہانی اداکار رشی کپور نے اپنی صاف گو...,entertainment
