In [2]:
import requests
from bs4 import BeautifulSoup
from pydantic import BaseModel, ValidationError
from typing import List, Optional

In [3]:
# Pydantic models for college details

class College(BaseModel):
    """Validated record describing a single college entry.

    Pydantic checks the field types when an instance is constructed and
    raises ``ValidationError`` if a value does not match its declared type.
    """

    # Display name of the college (required).
    name: str
    # Affiliating body; optional because an entry may not list one.
    affiliation: Optional[str] = None # optional since they may not always exist
    # Address text; optional for the same reason as ``affiliation``.
    address: Optional[str] = None 
    # Detail strings collected for the entry (required, but may be an empty list).
    details: List[str]

In [10]:
def scrape_college(url='https://edusanjal.com/college', timeout=10):
    """Download a college listing page and return validated ``College`` records.

    Args:
        url: Address of the listing page to fetch. Defaults to the edusanjal
            college index, so existing zero-argument calls behave as before.
        timeout: Seconds passed to ``requests.get`` as its timeout. Without
            one, ``requests`` waits on an unresponsive server indefinitely.

    Returns:
        dict: Maps each college name to its validated ``College`` instance.
        When the same name is seen more than once, the last occurrence wins.
        Entries that fail validation are reported via ``print`` and skipped.

    Raises:
        Exception: If the response status is not 200, if no
            ``<div class="container">`` is present, or if that container has
            fewer than two ``<div>`` elements beneath it.
        requests.exceptions.RequestException: Propagated from ``requests.get``
            on connection failures or when ``timeout`` elapses.
    """
    response = requests.get(url, timeout=timeout)

    # Anything other than a plain 200 is treated as a failed load.
    if response.status_code != 200:
        raise Exception("Failed to load the page!")

    soup = BeautifulSoup(response.content, "html.parser")

    container = soup.find('div', class_='container')
    if container is None:
        raise Exception("No container found")

    # The listing is read from the second <div> under the container; check the
    # length first so a changed page layout yields a clear error instead of a
    # bare IndexError.
    sections = container.find_all('div')
    if len(sections) < 2:
        raise Exception("No college listing found")

    # NOTE(review): find_all() searches recursively, so nested <div>s inside a
    # college card are visited as well; later matches with the same name
    # overwrite earlier ones in the result dict. Confirm against the live
    # markup whether a narrower selector is wanted.
    candidates = sections[1].find_all('div')

    college_info = {}

    for block in candidates:
        link = block.find('a')  # first <a> in the block carries the college name
        if link is None:
            continue

        college_name = link.text.strip()
        if not college_name:
            continue

        detail_texts = [item.text.strip() for item in block.find_all('li')]

        try:
            # Constructing the model validates the field types.
            college_info[college_name] = College(
                name=college_name,
                # First <li> is used as the affiliation, second as the address.
                affiliation=detail_texts[0] if detail_texts else None,
                address=detail_texts[1] if len(detail_texts) > 1 else None,
                details=detail_texts,
            )
        except ValidationError as e:
            # Best effort: report the bad entry and keep processing the rest.
            print(f"Validation error for {college_name}: {e}")

    return college_info

In [13]:
college_info = scrape_college()

# Emit each college as a three-line record (name, affiliation, address)
# followed by one blank line separating it from the next record.
for college_id, info in college_info.items():
    print(
        f"Name: {info.name}",
        f"Affiliation: {info.affiliation}",
        f"Address: {info.address}",
        sep="\n",
        end="\n\n",
    )

Name: Thames International College
Affiliation: Tribhuvan University
Address: Surya Bikram Gyawali Marg, Old Baneshwor, Kathmandu

Name: Texas College of Management and IT
Affiliation: Lincoln University College
Address: Siphal, Kathmandu

Name: Ace Institute of Management
Affiliation: Pokhara University
Address: New Baneshwor, Kathmandu

Name: NCTTM - IST College
Affiliation: Tribhuvan University
Address: Gyaneshwor, Opposite to German Embassy, Kathmandu

Name: Kathford International College of Engineering and Management
Affiliation: Tribhuvan University
Address: Balkumari, Lalitpur

Name: Nepal College of Information Technology (NCIT)
Affiliation: Pokhara University
Address: Balkumari, Lalitpur

Name: Padmashree College
Affiliation: Tribhuvan University, Nilai University, Malaysia
Address: Tinkune, Kathmandu

Name: Liberty College
Affiliation: Pokhara University
Address: Pragati Marg-2, Anamnagar, Kathmandu

Name: Certified College of Accountancy (CCA)
Affiliation: ACCA
Address: Thap