In [None]:
import concurrent.futures
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import html
import re

# Base URL with pagination pattern; "{}" is filled with the 1-based page number
base_url = "https://engineering.careers360.com/colleges/list-of-engineering-colleges-in-india?page={}&sort_by=4"

# Total number of pages to scrape
total_pages = 185  # Adjust this value as needed

# List to store extracted data (one dict per college/course row)
data = []

# Headers to mimic a real browser (to prevent blocking)
headers = {"User-Agent": "Mozilla/5.0"}

# Regex pattern to identify valid fee structure (₹ 2.80 L, 2.80 L, 2.80 K, ₹ 2.80 K).
# The unit is a character class of just "L" (lakh) and "K" (thousand); inside
# [...] a "|" is a literal character, not alternation, so it must not appear.
fee_pattern = re.compile(r'₹?\s?\d+(\.\d{1,2})?\s?[LK]', re.IGNORECASE)

# Regex patterns to detect duration and seat related data
duration_keywords = re.compile(r'(years?|duration)', re.IGNORECASE)
seats_pattern = re.compile(r'\d+')  # For numeric seats data

# Helpers and entry point for scraping a single listing page


def _fetch(url, timeout):
    """Send a GET request using the shared browser-like ``headers``.

    Returns the ``requests.Response``, or ``None`` when the request itself
    fails (timeout, connection error, malformed/relative URL), so that one
    bad link does not abort the whole page.
    """
    try:
        return requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Request failed for {url}: {exc}")
        return None


def _scrape_facilities(card, timeout):
    """Return the facility names from the facilities page linked on *card*.

    Returns an empty list when the card has no facilities link or the
    facilities page cannot be fetched.
    """
    facilities_section = card.find("div", class_="d-none d-md-block")
    if facilities_section is None:
        return []

    facilities_url = None
    for link in facilities_section.find_all("a"):
        href = link.get("href")
        if href and "facilities" in href:
            # Relative links are resolved against the main site
            if not href.startswith("http"):
                href = "https://www.careers360.com" + href
            facilities_url = href
            break
    if facilities_url is None:
        return []

    response = _fetch(facilities_url, timeout)
    if response is None or response.status_code != 200:
        return []
    facility_page_soup = BeautifulSoup(response.content, "html.parser")
    return [
        tag.text.strip()
        for tag in facility_page_soup.find_all("span", class_="facilities_name")
    ]


def _parse_course(course_div):
    """Return ``(course_name, duration, fee_structure, seats)`` for one course block.

    Every field falls back to "N/A" when it cannot be found, so each course
    always yields exactly one complete tuple.
    """
    h4_tag = course_div.find("h4")
    if h4_tag is not None and h4_tag.a is not None:
        course_name = h4_tag.a.text.strip()
    else:
        course_name = "N/A"

    fee_structure, duration, seats_count = "N/A", "N/A", "N/A"
    fee_found = False
    course_detail_tag = course_div.find("div", class_="course_detail")
    if course_detail_tag is not None:
        for span in course_detail_tag.find_all("span"):
            span_text = span.text.strip()
            fee_text = html.unescape(span_text)
            if not fee_found and fee_pattern.match(fee_text):
                # The first fee-looking span is the fee. It is consumed here
                # so a value such as "2.80 L" is not also read as a seat count.
                fee_structure = "₹ " + fee_text.replace("₹", "").strip()
                fee_found = True
            elif duration_keywords.search(span_text):
                duration = span_text
            elif seats_pattern.match(span_text):
                seats_count = span_text

    return course_name, duration, fee_structure, seats_count


def _scrape_courses(card, timeout):
    """Follow every course link on *card* and return the parsed course tuples."""
    course_list_tag = card.find("div", class_="combined_block d-md-none")
    if course_list_tag is None:
        return []

    courses = []
    for course_link in course_list_tag.find_all("a"):
        course_url = course_link.get("href")
        if not course_url:
            continue
        response = _fetch(course_url, timeout)
        if response is None or response.status_code != 200:
            continue
        course_page_soup = BeautifulSoup(response.content, "html.parser")
        courses.extend(
            _parse_course(course_div)
            for course_div in course_page_soup.find_all("div", class_="detail")
        )
    return courses


def scrape_page(page, timeout=30):
    """Scrape one listing page and return one row per (college, course).

    Args:
        page: Page number substituted into ``base_url``.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A list of dicts with the keys "College Name", "Location", "Courses",
        "Duration", "Fee Structure", "College Type", "Seats", "Facilities"
        and "Rating". The list is empty when the listing page cannot be
        fetched. Colleges without any parsed course produce no rows.
    """
    url = base_url.format(page)
    print(f"Scraping page {page} of {total_pages} -- {url}")
    response = _fetch(url, timeout)
    if response is None:
        return []
    if response.status_code != 200:
        print(f"Failed to fetch page {page}. Status Code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    page_data = []

    # Each "card_block" div is one college listing
    for card in soup.find_all("div", class_="card_block"):
        # Extract college name
        college_name_tag = card.find("h3")
        if college_name_tag is not None and college_name_tag.a is not None:
            college_name = college_name_tag.a.text.strip()
        else:
            college_name = "N/A"

        # Extract the college location (guard against a block without a span)
        location_tag = card.find("div", class_="content_block d-block d-md-none")
        location_span = location_tag.find("span") if location_tag is not None else None
        location = location_span.text.strip() if location_span is not None else "N/A"

        # Extract the college rating
        rating_tag = card.find("span", class_="star_text")
        rating = " ".join(rating_tag.stripped_strings) if rating_tag is not None else "N/A"

        # Extract whether the college is private or government; the value is
        # read from the second span of the block, when there is one
        college_type_tag = card.find(
            "div",
            class_="content_block d-none d-md-block d-md-flex flex-row justify-content-between",
        )
        type_spans = college_type_tag.find_all("span") if college_type_tag is not None else []
        college_type = type_spans[1].text.strip() if len(type_spans) > 1 else "N/A"

        # Rows are emitted per course, so a card without courses contributes
        # nothing and its facilities page does not need to be requested.
        courses = _scrape_courses(card, timeout)
        if not courses:
            continue

        facilities = ", ".join(_scrape_facilities(card, timeout))

        for course_name, duration, fee_structure, seat in courses:
            page_data.append({
                "College Name": college_name,
                "Location": location,
                "Courses": course_name,
                "Duration": duration,
                "Fee Structure": fee_structure,
                "College Type": college_type,
                "Seats": seat,
                "Facilities": facilities,
                "Rating": rating
            })
    return page_data

# Execute scraping concurrently with ThreadPoolExecutor
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    # Map each future back to its page number so failures can be reported
    futures = {
        executor.submit(scrape_page, page): page
        for page in range(1, total_pages + 1)
    }
    for future in concurrent.futures.as_completed(futures):
        try:
            data.extend(future.result())
        except Exception as exc:
            # One failing page must not discard the rows already collected
            print(f"Page {futures[future]} failed and was skipped: {exc}")

# Convert the data to a pandas DataFrame with the required column order.
# Passing the columns to the constructor keeps this working when no rows were
# scraped (an empty frame has no columns to select from afterwards).
columns = ["College Name", "Location", "Courses", "Duration", "Fee Structure", "College Type", "Seats", "Facilities", "Rating"]
df = pd.DataFrame(data, columns=columns)

# Save the data to a CSV file for further analysis with UTF-8-SIG encoding
df.to_csv("Engineering_Data.csv", index=False, encoding="utf-8-sig")

print("Scraping completed! Data saved to Engineering_Data.csv")
print(df.head())


Scraping page 1 of 5 -- https://engineering.careers360.com/colleges/list-of-engineering-colleges-in-india?page=1&sort_by=4
Scraping page 2 of 5 -- https://engineering.careers360.com/colleges/list-of-engineering-colleges-in-india?page=2&sort_by=4
Scraping page 3 of 5 -- https://engineering.careers360.com/colleges/list-of-engineering-colleges-in-india?page=3&sort_by=4
Scraping page 4 of 5 -- https://engineering.careers360.com/colleges/list-of-engineering-colleges-in-india?page=4&sort_by=4
Scraping page 5 of 5 -- https://engineering.careers360.com/colleges/list-of-engineering-colleges-in-india?page=5&sort_by=4
Scraping completed! Data saved to demo.csv
                                        College Name  \
0  AAA College of Engineering and Technology, Siv...   
1  AAA College of Engineering and Technology, Siv...   
2  AAA College of Engineering and Technology, Siv...   
3  AAA College of Engineering and Technology, Siv...   
4  AAA College of Engineering and Technology, Siv...   

     