<a href="https://colab.research.google.com/github/theabdulbasitt/scrapping_poetry_from_rekhta/blob/main/scraping_rekhta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import requests
from bs4 import BeautifulSoup
import time

# HTTP headers sent with every request made by fetch_page. The User-Agent
# is a desktop Chrome string — presumably so the site serves the same
# markup it would give a regular browser (TODO confirm this is required).
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
}

# Site root and the path of the listing page; main() concatenates the two
# to build the URL that is fetched.
BASE_URL = "https://www.rekhta.org"
PAGE_URL = "/shayari"

def fetch_page(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Absolute URL to request.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The response body as a string, or ``None`` if the request failed
        (connection error, timeout, or an HTTP error status).
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Turn 4xx/5xx responses into exceptions so they are reported below.
        response.raise_for_status()
    except requests.RequestException as e:
        # Only network/HTTP failures are treated as "page unavailable";
        # programming errors are left to propagate.
        print(f"Error fetching page: {e}")
        return None
    return response.text

def parse_poems(html):
    """Extract poem texts from a listing page's HTML.

    Each ``<div class="contentListItems">`` is treated as one poem. Within
    it, the text of every ``<span class="rt">`` (per the original author's
    note, the Roman transcription) is stripped and the pieces are joined
    with newlines.

    Args:
        html: HTML markup of the page.

    Returns:
        A list of poem strings, in document order. Containers that yield
        no text are omitted.
    """
    document = BeautifulSoup(html, 'html.parser')
    containers = document.find_all('div', class_='contentListItems')

    # Lazily build one newline-joined string per container.
    joined_texts = (
        '\n'.join(
            span.get_text(strip=True)
            for span in container.find_all('span', class_='rt')
        )
        for container in containers
    )

    # Drop containers that produced an empty string.
    return [text for text in joined_texts if text]

def main():
    """Fetch the listing page, parse its poems, and print them.

    Prints a failure message and returns early when the page could not
    be retrieved (or came back empty).
    """
    page_html = fetch_page(BASE_URL + PAGE_URL)

    # Guard clause: nothing to parse without page content.
    if not page_html:
        print("Failed to retrieve content from the page.")
        return

    found = parse_poems(page_html)
    print(f"Found {len(found)} poems on the page.")

    separator = "-" * 50
    for number, text in enumerate(found, 1):
        print(f"\n--- Poem {number} ---")
        print(text)
        print(separator)

# Run the scraper only when this file is executed directly (or run as a
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Found 0 poems on the page.
