<a href="https://colab.research.google.com/github/theabdulbasitt/scrapping_poetry_from_rekhta/blob/main/scraping_rekhta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
from bs4 import BeautifulSoup
import time

# Site root and the listing path that is appended to it when building the
# request URL. PAGE_URL is an example path; change it to target another listing.
BASE_URL = "https://www.rekhta.org"
PAGE_URL = "/shayari"  # Adjust this path as necessary

# HTTP headers sent with every request; the User-Agent value is that of a
# desktop Chrome browser so the request resembles ordinary browser traffic.
headers = {}
headers["User-Agent"] = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/115.0.0.0 Safari/537.36"
)

def fetch_page(url, timeout=10):
    """
    Fetch the HTML content of the given URL.

    Args:
        url: Absolute URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests can block indefinitely on a stalled connection.

    Returns:
        The response body as text when the server answers with HTTP 200;
        None on any other status code or when the request itself fails.
        A diagnostic message is printed in both failure cases.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        # Covers connection errors, timeouts, invalid URLs, etc. Unrelated
        # programming errors are intentionally not swallowed here.
        print("Error fetching the page:", e)
        return None

    if response.status_code != 200:
        print("Error: Received status code", response.status_code)
        return None
    return response.text

def parse_poems(html, tag="div", class_name="poetry-text"):
    """
    Parse the HTML to extract poems.

    Every element matching ``tag`` with CSS class ``class_name`` is treated as
    one poem. The default selector is only an example and has not been
    verified against the live site; inspect the page and pass the real tag and
    class if no poems are found.

    Args:
        html: HTML document as a string.
        tag: Name of the element that wraps a single poem.
        class_name: CSS class carried by that element.

    Returns:
        A list of strings, one per matching element, in document order. Text
        from nested tags is joined with newlines and stripped of surrounding
        whitespace. The list is empty when nothing matches.
    """
    soup = BeautifulSoup(html, 'html.parser')
    return [
        node.get_text(separator="\n", strip=True)
        for node in soup.find_all(tag, class_=class_name)
    ]

def main():
    """
    Download the listing page, extract its poems, and print them.

    Prints a failure message and stops if the page could not be fetched;
    otherwise prints the number of poems found followed by each poem under a
    numbered heading.
    """
    page_html = fetch_page(BASE_URL + PAGE_URL)
    if not page_html:
        print("Failed to retrieve content from the page.")
        return

    found = parse_poems(page_html)
    print("Found {} poems on the page.".format(len(found)))

    # Save or process the poems as needed; for now they are only printed.
    rule = "-" * 50
    for idx, text in enumerate(found, 1):
        print(f"\n--- Poem {idx} ---")
        print(text)
        print(rule)

# Run the scraper only when this file is executed directly (or as a notebook
# cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


Found 0 poems on the page.
