In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from urllib.parse import quote_plus

In [2]:
# URL slugs of the states / union territories to query, in scrape order.
states = [
    "andhra-pradesh",
    "arunachal-pradesh",
    "assam",
    "bihar",
    "chhattisgarh",
    "goa",
    "gujarat",
    "haryana",
    "himachal-pradesh",
    "jharkhand",
    "karnataka",
    "kerala",
    "madhya-pradesh",
    "maharashtra",
    "manipur",
    "meghalaya",
    "mizoram",
    "nagaland",
    "odisha",
    "punjab",
    "rajasthan",
    "sikkim",
    "tamil-nadu",
    "telangana",
    "tripura",
    "uttar-pradesh",
    "uttarakhand",
    "west-bengal",
    "andaman-and-nicobar-islands",
    "delhi-ncr",
    "puducherry",
]

In [3]:
# Listing-page URL prefix; a state slug (URL-quoted) is appended as the `state` query value.
base_url = "https://www.getmyuni.com/all-colleges?state="

In [4]:
# Request headers sent with every page fetch; only a browser-style User-Agent is set.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/96.0.4664.110 Safari/537.36"
    ),
}

In [5]:
# One accumulator list per output column; each name is bound to its own empty list.
(
    college_names,
    locations,
    college_types,
    ratings,
    courses,
    tuition_start_range,
    tuition_end_range,
) = ([] for _ in range(7))

In [6]:
def _text_or_na(tag):
    """Return the stripped text of a parsed tag, or "N/A" when the tag was not found."""
    return tag.get_text(strip=True) if tag else "N/A"


def _parse_fee_range(text):
    """Split a fee label into (start, end) strings; "" marks a part that is absent.

    The rupee sign and thousands separators are removed before splitting on '-'.
    """
    amounts = text.replace('₹', '').replace(',', '').split('-')
    if len(amounts) == 2:
        return amounts[0].strip(), amounts[1].strip()
    if len(amounts) == 1:
        return amounts[0].strip(), ""
    return "", ""


def _parse_course_count(tag):
    """Return the digits in the tag's text as an int, or 0 when there is no tag or no digit."""
    if not tag:
        return 0
    digits = ''.join(filter(str.isdigit, tag.get_text(strip=True)))
    try:
        return int(digits) if digits else 0
    except ValueError:
        # str.isdigit accepts some characters (e.g. superscripts) that int() rejects.
        return 0


# Column accumulators, in the order the per-card row tuples are built below.
_scraped_columns = (
    college_names, locations, college_types, ratings,
    courses, tuition_start_range, tuition_end_range,
)

# Start from empty columns so re-running this cell does not duplicate rows.
for _column in _scraped_columns:
    _column.clear()

for state in states:
    state_url = base_url + quote_plus(state)

    try:
        # A timeout keeps one unresponsive page from hanging the whole run.
        response = requests.get(state_url, headers=headers, timeout=30)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        college_cards = soup.find_all("div", class_="college__card__new")

        # Rows are staged per page and committed only after the page parsed
        # completely, so a mid-page failure cannot leave the columns ragged.
        page_rows = []
        for card in college_cards:
            highlights = card.find_all("span", class_="highlight__value")

            # Fee is read from inside the card so it stays attached to its own
            # college. A page-wide fee list shifts every later row when one
            # card has no fee.
            # NOTE(review): assumes the '₹' span is nested in the card — confirm on the live page.
            fee_text = next(
                (text for text in (h.get_text(strip=True) for h in highlights) if '₹' in text),
                None,
            )
            fee_start, fee_end = _parse_fee_range(fee_text) if fee_text is not None else ("", "")

            page_rows.append((
                _text_or_na(card.find("h2", class_="college__name")),
                _text_or_na(card.find("span", class_="list__style college__location")),
                _text_or_na(card.find("span", class_="list__style college__affiliation")),
                _text_or_na(card.find("span", class_="list__style college__rating")),
                # Courses offered: digits of the first highlight span in the card.
                _parse_course_count(highlights[0] if highlights else None),
                fee_start,
                fee_end,
            ))

    except Exception as e:
        # Best-effort scrape: report the failing state and move on to the next one.
        print(f"Error processing state {state}: {e}")
        continue

    for row in page_rows:
        for _column, value in zip(_scraped_columns, row):
            _column.append(value)

In [7]:
# Column mapping for the DataFrame. The card-derived lists are cut to the
# number of fee entries; the two fee lists are passed through as they are.
row_count = len(tuition_start_range)
x = {
    label: values[:row_count]
    for label, values in (
        ("College Name", college_names),
        ("Location", locations),
        ("Type", college_types),
        ("Rating", ratings),
        ("Courses offered", courses),
    )
}
x["Fee Start Range"] = tuition_start_range
x["Fee End Range"] = tuition_end_range

In [8]:
# Build the raw table from the column mapping and display it.
df = pd.DataFrame(data=x)
df

Unnamed: 0,College Name,Location,Type,Rating,Courses offered,Fee Start Range,Fee End Range
0,Andhra University,"Visakhapatnam, Andhra Pradesh",Public,3.6,34,16 K,14.44 L
1,KL University,"Guntur, Andhra Pradesh",Private,3.5,23,1.40 L,15.60 L
2,Sri Venkateswara University (SVU),"Tirupati, Andhra Pradesh",Public,3.5,18,8 K,2.71 L
3,Aditya Engineering College,"East Godavari, Andhra Pradesh",Private,3.7,5,54 K,2.01 L
4,GPREC,"Kurnool, Andhra Pradesh",Private,3.9,3,1.14 L,3.20 L
...,...,...,...,...,...,...,...
427,"Aizawl Theological College, Aizawl","Aizawl, Mizoram",Private,,3,45 K,5 L
428,Pachhunga University College,"Aizawl, Mizoram",Public,3.3,3,28 K,75 K
429,Mizoram University,"Aizawl, Mizoram",Public,3.5,7,10 K,25 K
430,NIT Mizoram,"Aizawl, Mizoram",Public,,3,30 K,1.25 L


In [9]:
# Persist the raw scrape; the confirmation message reports the file actually written.
raw_csv_path = 'all_colege_list.csv'
df.to_csv(raw_csv_path, index=False)

print(f"Data saved to {raw_csv_path}")


Data saved to output_file.csv


In [10]:
#  data cleaning
import pandas as pd
import numpy as np

In [11]:
# Reload the raw scrape from the working directory, where the export step
# writes 'all_colege_list.csv', instead of a machine-specific absolute path,
# so the notebook runs on any machine.
df = pd.read_csv('all_colege_list.csv')
df

Unnamed: 0,College Name,Location,Type,Rating,Courses offered,Fee Start Range,Fee End Range
0,Andhra University,"Visakhapatnam, Andhra Pradesh",Public,3.6,34,16 K,14.44 L
1,KL University,"Guntur, Andhra Pradesh",Private,3.5,23,1.40 L,15.60 L
2,Sri Venkateswara University (SVU),"Tirupati, Andhra Pradesh",Public,3.5,18,8 K,2.71 L
3,Aditya Engineering College,"East Godavari, Andhra Pradesh",Private,3.7,5,54 K,2.01 L
4,GPREC,"Kurnool, Andhra Pradesh",Private,3.9,3,1.14 L,3.20 L
...,...,...,...,...,...,...,...
427,"Aizawl Theological College, Aizawl","Aizawl, Mizoram",Private,,3,45 K,5 L
428,Pachhunga University College,"Aizawl, Mizoram",Public,3.3,3,28 K,75 K
429,Mizoram University,"Aizawl, Mizoram",Public,3.5,7,10 K,25 K
430,NIT Mizoram,"Aizawl, Mizoram",Public,,3,30 K,1.25 L


In [12]:
# Peek at the first five rows.
df.head(n=5)

Unnamed: 0,College Name,Location,Type,Rating,Courses offered,Fee Start Range,Fee End Range
0,Andhra University,"Visakhapatnam, Andhra Pradesh",Public,3.6,34,16 K,14.44 L
1,KL University,"Guntur, Andhra Pradesh",Private,3.5,23,1.40 L,15.60 L
2,Sri Venkateswara University (SVU),"Tirupati, Andhra Pradesh",Public,3.5,18,8 K,2.71 L
3,Aditya Engineering College,"East Godavari, Andhra Pradesh",Private,3.7,5,54 K,2.01 L
4,GPREC,"Kurnool, Andhra Pradesh",Private,3.9,3,1.14 L,3.20 L


In [13]:
# Peek at the last five rows.
df.tail(n=5)

Unnamed: 0,College Name,Location,Type,Rating,Courses offered,Fee Start Range,Fee End Range
427,"Aizawl Theological College, Aizawl","Aizawl, Mizoram",Private,,3,45 K,5 L
428,Pachhunga University College,"Aizawl, Mizoram",Public,3.3,3,28 K,75 K
429,Mizoram University,"Aizawl, Mizoram",Public,3.5,7,10 K,25 K
430,NIT Mizoram,"Aizawl, Mizoram",Public,,3,30 K,1.25 L
431,Nagaland University Kohima,"Kohima, Nagaland",Public,3.3,8,3 K,88 K


In [14]:
# Split "City, State" on the LAST ", " only, so a location with an extra comma
# still yields exactly two parts instead of breaking the two-column assignment.
# reindex pins the result to two columns even when no row has a separator.
location_parts = (
    df['Location']
    .str.rsplit(', ', n=1, expand=True)
    .reindex(columns=[0, 1])
)
df = df.assign(City=location_parts[0], State=location_parts[1]).drop(columns='Location')
df

Unnamed: 0,College Name,Type,Rating,Courses offered,Fee Start Range,Fee End Range,City,State
0,Andhra University,Public,3.6,34,16 K,14.44 L,Visakhapatnam,Andhra Pradesh
1,KL University,Private,3.5,23,1.40 L,15.60 L,Guntur,Andhra Pradesh
2,Sri Venkateswara University (SVU),Public,3.5,18,8 K,2.71 L,Tirupati,Andhra Pradesh
3,Aditya Engineering College,Private,3.7,5,54 K,2.01 L,East Godavari,Andhra Pradesh
4,GPREC,Private,3.9,3,1.14 L,3.20 L,Kurnool,Andhra Pradesh
...,...,...,...,...,...,...,...,...
427,"Aizawl Theological College, Aizawl",Private,,3,45 K,5 L,Aizawl,Mizoram
428,Pachhunga University College,Public,3.3,3,28 K,75 K,Aizawl,Mizoram
429,Mizoram University,Public,3.5,7,10 K,25 K,Aizawl,Mizoram
430,NIT Mizoram,Public,,3,30 K,1.25 L,Aizawl,Mizoram


In [15]:
# Boolean mask of missing cells.
df.isnull()

Unnamed: 0,College Name,Type,Rating,Courses offered,Fee Start Range,Fee End Range,City,State
0,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...
427,False,False,True,False,False,False,False,False
428,False,False,False,False,False,False,False,False
429,False,False,False,False,False,False,False,False
430,False,False,True,False,False,False,False,False


In [16]:
# Missing-value count per column.
df.isnull().sum()

College Name         0
Type                 4
Rating             106
Courses offered      0
Fee Start Range      0
Fee End Range       55
City                 0
State                0
dtype: int64

In [17]:
# Fill the categorical 'Type' column with a text label rather than the
# integer 0, which would leave it holding mixed str/int values.
# All remaining gaps are filled with 0 as before.
df = df.fillna({'Type': 'Unknown'}).fillna(0)
df

Unnamed: 0,College Name,Type,Rating,Courses offered,Fee Start Range,Fee End Range,City,State
0,Andhra University,Public,3.6,34,16 K,14.44 L,Visakhapatnam,Andhra Pradesh
1,KL University,Private,3.5,23,1.40 L,15.60 L,Guntur,Andhra Pradesh
2,Sri Venkateswara University (SVU),Public,3.5,18,8 K,2.71 L,Tirupati,Andhra Pradesh
3,Aditya Engineering College,Private,3.7,5,54 K,2.01 L,East Godavari,Andhra Pradesh
4,GPREC,Private,3.9,3,1.14 L,3.20 L,Kurnool,Andhra Pradesh
...,...,...,...,...,...,...,...,...
427,"Aizawl Theological College, Aizawl",Private,0.0,3,45 K,5 L,Aizawl,Mizoram
428,Pachhunga University College,Public,3.3,3,28 K,75 K,Aizawl,Mizoram
429,Mizoram University,Public,3.5,7,10 K,25 K,Aizawl,Mizoram
430,NIT Mizoram,Public,0.0,3,30 K,1.25 L,Aizawl,Mizoram


In [18]:
# Inspect column dtypes; the fee columns are still text (object) at this point.
df.dtypes

College Name        object
Type                object
Rating             float64
Courses offered      int64
Fee Start Range     object
Fee End Range       object
City                object
State               object
dtype: object

In [19]:
def convert_fee_to_float(fee):
    """Convert a fee label such as "16 K" or "14.44 L" to a number of rupees.

    Recognised suffixes (case-insensitive, with or without a space before them):
    K = thousand (1e3), L = lakh (1e5), Cr = crore (1e7).

    Parameters
    ----------
    fee : object
        A fee label string, or any non-string value.

    Returns
    -------
    A float for parseable strings, NaN for strings that cannot be parsed,
    and the input unchanged when it is not a string.
    """
    if not isinstance(fee, str):
        return fee

    cleaned = fee.strip()
    lowered = cleaned.lower()
    for suffix, exponent in (("cr", 7), ("k", 3), ("l", 5)):
        if lowered.endswith(suffix):
            # Rewrite as scientific notation ("14.44e5") and let the parser
            # scale it, which avoids float multiplication rounding.
            cleaned = f"{cleaned[:-len(suffix)].strip()}e{exponent}"
            break
    return pd.to_numeric(cleaned, errors='coerce')
# Convert both fee columns from text labels to numbers, then show the result.
for fee_column in ('Fee Start Range', 'Fee End Range'):
    df[fee_column] = df[fee_column].apply(convert_fee_to_float)
df

Unnamed: 0,College Name,Type,Rating,Courses offered,Fee Start Range,Fee End Range,City,State
0,Andhra University,Public,3.6,34,16000.0,1444000.0,Visakhapatnam,Andhra Pradesh
1,KL University,Private,3.5,23,140000.0,1560000.0,Guntur,Andhra Pradesh
2,Sri Venkateswara University (SVU),Public,3.5,18,8000.0,271000.0,Tirupati,Andhra Pradesh
3,Aditya Engineering College,Private,3.7,5,54000.0,201000.0,East Godavari,Andhra Pradesh
4,GPREC,Private,3.9,3,114000.0,320000.0,Kurnool,Andhra Pradesh
...,...,...,...,...,...,...,...,...
427,"Aizawl Theological College, Aizawl",Private,0.0,3,45000.0,500000.0,Aizawl,Mizoram
428,Pachhunga University College,Public,3.3,3,28000.0,75000.0,Aizawl,Mizoram
429,Mizoram University,Public,3.5,7,10000.0,25000.0,Aizawl,Mizoram
430,NIT Mizoram,Public,0.0,3,30000.0,125000.0,Aizawl,Mizoram


In [20]:
# Export the cleaned table as CSV without the index column.
df.to_csv(path_or_buf='cleaned_colleges_data.csv', index=False)

In [22]:
# Export the cleaned table as an Excel workbook without the index column.
df.to_excel(excel_writer='cleaned_colleges_data.xlsx', index=False)