<a href="https://colab.research.google.com/github/utkarshg1/DL-2pm-4pm-Weekend/blob/main/WikiScraper_Example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from pydantic import BaseModel, HttpUrl, AfterValidator, ValidationError
import requests
from bs4 import BeautifulSoup
from typing_extensions import Annotated

In [2]:
def validate_url(v: HttpUrl) -> str:
  if not str(v).startswith("https://en.wikipedia.org/wiki/"):
    raise ValueError("Not a Wikipedia URL")
  return v

In [3]:
WikiUrl = Annotated[HttpUrl, AfterValidator(validate_url)]

In [4]:
class WikiScraper(BaseModel):
  url: WikiUrl

  def fetch_page(self):
    r = requests.get(self.url)
    r.raise_for_status()
    return r.content

  def get_soup(self):
    content = self.fetch_page()
    soup = BeautifulSoup(content, 'html.parser')
    return soup

  def get_title(self):
    soup = self.get_soup()
    return soup.title.string

  def get_image_urls(self) -> list:
    soup = self.get_soup()
    image_urls = []
    a = soup.find_all('a', class_="mw-file-description")
    for i in a:
      img = i.get('href')
      image_urls.append("https://en.wikipedia.org/"+img)
    return image_urls

In [5]:
try:
  w1 = WikiScraper(url="https://en.wikipedia.org/wiki/World_population")
  print(w1.get_title())
except Exception as e:
  print(f"Exception occured : {e}")

World population - Wikipedia


In [8]:
w1.get_image_urls()

['https://en.wikipedia.org//wiki/File:World_Population_Prospects.svg',
 'https://en.wikipedia.org//wiki/File:Illustration_of_contemporary_and_past_human_populations_Our_World_in_Data.png',
 'https://en.wikipedia.org//wiki/File:2006megacities.svg',
 'https://en.wikipedia.org//wiki/File:Expectancy_of_life.svg',
 'https://en.wikipedia.org//wiki/File:Population_pyramid_of_the_world_in_continental_groupings_2023.svg',
 'https://en.wikipedia.org//wiki/File:Global_population_cartogram.png',
 'https://en.wikipedia.org//wiki/File:People%27s_-Km%C2%B2_for_all_countries_(and_us_states,_uk_kingdoms).png',
 'https://en.wikipedia.org//wiki/File:Top_5_Country_Population_Graph_1901_to_2021.svg',
 'https://en.wikipedia.org//wiki/File:Population_Density,_v4.11,_2020_(48009093621).jpg',
 'https://en.wikipedia.org//wiki/File:World_population_(UN).svg',
 'https://en.wikipedia.org//wiki/File:Total_Fertility_Rate_Map_by_Country.svg',
 'https://en.wikipedia.org//wiki/File:World_population_counter,_Eureka,_Hal