### Get the placement data PDF files for each college

In [1]:
from bs4 import BeautifulSoup
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

In [82]:
def get_pdf_link_from_td_tag(td_tag):
    """Return the href of the last PDF link found inside a table cell.

    Every ``<a>`` element below ``td_tag`` is inspected; an href counts as a
    PDF link when it contains the substring ``'.pdf'``.  An empty string is
    returned when the cell holds no such link.
    """
    hrefs = [anchor.get_attribute('href')
             for anchor in td_tag.find_elements(By.TAG_NAME, 'a')]
    pdf_hrefs = [href for href in hrefs if href and '.pdf' in href]
    # Several anchors may match; the last one wins.
    return pdf_hrefs[-1] if pdf_hrefs else ''

def get_valid_name(name):
    """Turn a college name into text that is safe to embed in a file name.

    Every run of characters other than ASCII letters and digits is replaced
    by a single space.  Leading and trailing spaces are removed as well, so a
    name that ends in punctuation (e.g. ``'College (A)'``) does not leave a
    dangling space in the result.
    """
    return re.sub(r'[^A-Za-z0-9]+', ' ', name).strip()

In [83]:
def get_college_name_and_pdf(url, folder_path):
    """Download the PDF linked from every row of the ranking table at ``url``.

    The page is opened in a Chrome WebDriver session and the first table
    inside ``div.dt-layout-cell.dt-layout-full`` is located.  For each row
    after the first, the first cell is read as the college ID and the second
    as the college name (the text before ``'More'``).  When the second cell
    links to a PDF, the file is saved as
    ``<folder_path>/<sanitised name>[<college id>].pdf``.

    Errors are reported with ``print`` instead of being raised: a failed
    download only skips that college, while any other error (including a
    failure to load the page) ends the scrape.  The browser is always closed.

    Args:
        url: Address of the ranking page.  A falsy value returns immediately.
        folder_path: Directory that receives the PDFs; created when the first
            PDF link is found.

    Returns:
        ``('', '')`` when ``url`` is falsy, otherwise ``None``.
    """
    if not url:
        return '', ''

    driver = webdriver.Chrome()
    try:
        # Navigation happens inside the ``try`` so the ``finally`` clause
        # closes the browser even when loading the page fails.
        driver.get(url)
        time.sleep(3)  # fixed pause before querying the DOM; presumably lets the table render

        div_element = driver.find_element(By.CSS_SELECTOR, 'div.dt-layout-cell.dt-layout-full')
        table_tag = div_element.find_element(By.TAG_NAME, 'table')
        rows = table_tag.find_elements(By.TAG_NAME, 'tr')

        for row in rows[1:]:  # the first <tr> is skipped (presumably the header row)
            tds = row.find_elements(By.TAG_NAME, 'td')
            if len(tds) < 2:
                continue

            college_id = tds[0].text.strip()
            college_name = tds[1].text.strip().split('More')[0].strip()
            pdf_link = get_pdf_link_from_td_tag(tds[1])
            if not pdf_link:
                continue

            os.makedirs(folder_path, exist_ok=True)
            valid_name = get_valid_name(college_name)
            file_name = f"{valid_name}[{college_id}].pdf"
            file_path = os.path.join(folder_path, file_name)

            try:
                # Without a timeout a single unresponsive server would block
                # the whole run indefinitely.
                response = requests.get(pdf_link, timeout=30)
                response.raise_for_status()
                with open(file_path, 'wb') as file:
                    file.write(response.content)
            except Exception as e:
                # Best effort: report the failure and move on to the next row.
                print(f"Failed to download PDF for {college_name}: {e}")

    except Exception as e:
        print(f"Error occurred: {e}")
    finally:
        driver.quit()


### Combine the placement data of every college into one DataFrame for a given ranking year

In [2]:
import pdfplumber
import pandas as pd
import os
import requests

In [111]:
def get_pdf_table_data(file_path, verbose=False):
    """Extract the tables of a PDF together with the text found just above each.

    Tables with fewer than two rows are ignored.  For every other table the
    words whose bottom edge lies less than 20 units above the table's top edge
    (and whose left edge falls between ``x0 - 10`` and ``x1`` of the table)
    are joined, left to right, into a title.

    A table is treated as the continuation of the previous one when it starts
    within 15 units of the top of the page, has no title, and an earlier table
    exists; all of its rows are then appended to that earlier table using the
    earlier table's column names.  Otherwise the first row is used as the
    header and rows whose length differs from the header are dropped.

    Tables that cannot be processed are skipped rather than aborting the file.

    Args:
        file_path: Path of the PDF to read.
        verbose: When true, print a line for every table that is merged as a
            continuation or skipped.  The default keeps the function silent.

    Returns:
        A list of dicts with the keys ``'page_num'`` (0-based page index),
        ``'title'`` (str) and ``'table'`` (``pandas.DataFrame``).
    """
    pdf_table_data = []
    last_table = None  # DataFrame of the most recent entry in pdf_table_data

    with pdfplumber.open(file_path) as pdf:
        for page_num, page in enumerate(pdf.pages):
            words = page.extract_words()

            for table in page.find_tables():
                if not table.rows or len(table.rows) < 2:
                    continue

                x0, top, x1, _bottom = table.bbox

                # Words sitting in the narrow band directly above the table.
                title_words = [
                    w for w in words
                    if (top - 20 < w['bottom'] < top) and (x0 - 10 < w['x0'] < x1)]
                title_words.sort(key=lambda w: w['x0'])
                final_title = ' '.join(w['text'] for w in title_words).strip()

                table_data = table.extract()

                try:
                    # If it's a continuation (no title and low top position)
                    if top < 15 and not final_title and last_table is not None:
                        df = pd.DataFrame(table_data, columns=last_table.columns)
                        last_table = pd.concat([last_table, df], ignore_index=True)
                        pdf_table_data[-1]['table'] = last_table
                        if verbose:
                            print(f"Page {page_num}: Appended continuation")
                    else:
                        columns = table_data[0]
                        data = table_data[1:]

                        clean_data = [row for row in data if len(row) == len(columns)]
                        if not clean_data:
                            # Nothing usable in this table; skip it directly
                            # instead of raising an error only to swallow it.
                            if verbose:
                                print(f"Page {page_num}: Table has no valid rows "
                                      f"matching {len(columns)} columns")
                            continue

                        df = pd.DataFrame(clean_data, columns=columns)

                        pdf_table_data.append({
                            'page_num': page_num,
                            'title': final_title,
                            'table': df
                        })
                        last_table = df
                except Exception as e:
                    # Best effort: one malformed table must not discard the
                    # rest of the document, but the reason is available on request.
                    if verbose:
                        print(f"Page {page_num}: Could not process table: {e}")

    return pdf_table_data

In [112]:
import re

def get_college_placement_data(table_data, college_name, college_id):
    """Collect the placement tables of one college into a single DataFrame.

    Only entries whose title contains ``'placement & higher studies for
    previous 3 years'`` (case-insensitive) are used.  From each such table the
    duplicated column names are dropped (the last occurrence is kept), the
    last five columns are selected, and the columns ``'College'``,
    ``'CollegeID'`` and ``'Batch'`` (the title text before the first ``':'``)
    are added.  Newlines inside column names are replaced by spaces.

    Args:
        table_data: List of dicts with the keys ``'page_num'``, ``'title'``
            and ``'table'``, as produced by ``get_pdf_table_data``.
        college_name: Value written to the ``'College'`` column.
        college_id: Value written to the ``'CollegeID'`` column.

    Returns:
        The matching tables stacked with a fresh index, or an empty
        ``DataFrame`` when no title matches.  The input tables are not modified.
    """
    frames = []
    for entry in table_data:
        title, table = entry['title'], entry['table']
        if 'placement & higher studies for previous 3 years' not in title.lower():
            continue

        table = table.loc[:, ~table.columns.duplicated(keep='last')]
        # .copy() yields an independent frame, so the column assignments below
        # never act on a slice of the caller's table.
        table = table.iloc[:, -5:].copy()
        table['College'] = college_name
        table['CollegeID'] = college_id
        table['Batch'] = title.split(':')[0].strip()
        # Header cells that are not strings (e.g. an empty header cell) are
        # left untouched instead of raising.
        table.columns = [col.replace('\n', ' ') if isinstance(col, str) else col
                         for col in table.columns]
        frames.append(table)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [113]:
def get_placement_data_of_all_colleges(folder_path):
    """Build one placement DataFrame from every PDF in ``folder_path``.

    File names are expected to look like ``<college name>[<college id>].pdf``;
    the name and ID are recovered from the part before and after the ``'['``.
    Each PDF is parsed with ``get_pdf_table_data`` and reduced with
    ``get_college_placement_data``.

    Problems are reported with ``print`` instead of being raised.  A folder
    that cannot be listed yields an empty DataFrame; a PDF that cannot be
    processed is skipped and the remaining files are still read.

    Args:
        folder_path: Directory containing the downloaded PDFs.

    Returns:
        The placement rows of all colleges with a fresh index, or an empty
        ``DataFrame`` when nothing could be read.
    """
    try:
        files = [file for file in os.listdir(folder_path) if file.lower().endswith('.pdf')]
    except Exception as E:
        # Previously this path reached the final ``return`` with the result
        # variable still unassigned.
        print(f'Follwing Exception Occurred: {E}')
        return pd.DataFrame()

    frames = []
    for pdf in files:
        try:
            file_path = os.path.join(folder_path, pdf)
            data_of_table = get_pdf_table_data(file_path)
            stem = os.path.splitext(pdf)[0]
            college_name = stem.split('[')[0]
            college_id = stem.split('[')[1][:-1]  # drop the closing ']'
            df = get_college_placement_data(data_of_table, college_name, college_id)
        except Exception as E:
            # One unreadable or oddly named file should not discard the rest.
            print(f'Skipping {pdf}: {E}')
            continue

        if not df.empty:
            frames.append(df)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

### Get the Placement Data for 2020-2023

In [114]:
url_2024 = 'https://www.nirfindia.org/Rankings/2024/EngineeringRanking.html'
folder_path = 'Scraped_Data/nirf_placement_2024'
get_college_name_and_pdf(url_2024, folder_path)

In [115]:
df24 = get_placement_data_of_all_colleges(folder_path)
df24.drop_duplicates(subset = ['Academic Year', 'Batch', 'CollegeID'], inplace = True)
df24.head()

Unnamed: 0,Academic Year,No. of students graduating in minimum stipulated time,No. of students placed,Median salary of placed graduates per annum(Amount in Rs.),No. of students selected for Higher Studies,College,CollegeID,Batch
0,2020-21,771,618,1000000(Ten lakhs),74,National Institute of Technology Warangal,IR-E-U-0025,UG [4 Years Program(s)]
1,2021-22,869,639,1592000(Fifteen lakhs\nninety two thousand),62,National Institute of Technology Warangal,IR-E-U-0025,UG [4 Years Program(s)]
2,2022-23,1064,872,1500000(Fifteen\nLakhs),79,National Institute of Technology Warangal,IR-E-U-0025,UG [4 Years Program(s)]
3,2020-21,526,178,800000(Eight lakhs),51,National Institute of Technology Warangal,IR-E-U-0025,PG [2 Years Program(s)]
4,2021-22,542,336,1175000(Eleven lakhs\nseventy five thousand),45,National Institute of Technology Warangal,IR-E-U-0025,PG [2 Years Program(s)]


### Get the Placement Data for 2017-2020

In [116]:
url_2021 = 'https://www.nirfindia.org/Rankings/2021/EngineeringRanking.html'
folder_path = 'Scraped_Data/nirf_placement_2021'
get_college_name_and_pdf(url_2021, folder_path)

In [117]:
df21 = get_placement_data_of_all_colleges(folder_path)
df21.drop_duplicates(subset = ['Academic Year', 'Batch', 'CollegeID'], inplace = True)
df21.head()

Unnamed: 0,Academic Year,No. of students graduating in minimum stipulated time,No. of students placed,Median salary of placed graduates(Amount in Rs.),No. of students selected for Higher Studies,College,CollegeID,Batch
0,2017-18,248,158,741000(Seven Lakh\nForty One Thousand ),11,Pandit Dwarka Prasad Mishra Indian Institute o...,IR-E-U-0286,UG [4 Years Program(s)]
1,2018-19,291,173,750000(Seven Lakh\nFifty Thousand ),22,Pandit Dwarka Prasad Mishra Indian Institute o...,IR-E-U-0286,UG [4 Years Program(s)]
2,2019-20,262,146,943000(Nine Lakh\nForty Three Thousand\n),78,Pandit Dwarka Prasad Mishra Indian Institute o...,IR-E-U-0286,UG [4 Years Program(s)]
3,2017-18,84,29,725000(Seven Lakh\nTwenty Five Thousand\n),8,Pandit Dwarka Prasad Mishra Indian Institute o...,IR-E-U-0286,PG [2 Years Program(s)]
4,2018-19,96,35,710000(Seven Lakh\nTen Thousand ),20,Pandit Dwarka Prasad Mishra Indian Institute o...,IR-E-U-0286,PG [2 Years Program(s)]


### Combine both the data to get placement data for 2017-2023

In [120]:
# Relabel df21 with df24's column names so both frames line up when concatenated.
# NOTE(review): the assignment is positional — it assumes both frames have the
# same number of columns in the same order; confirm before reusing.
df21.columns = df24.columns

In [121]:
all_colleges_data = pd.concat([df24, df21], ignore_index=True)

In [125]:
all_colleges_data.head()

Unnamed: 0,Academic Year,No. of students graduating in minimum stipulated time,No. of students placed,Median salary of placed graduates per annum(Amount in Rs.),No. of students selected for Higher Studies,College,CollegeID,Batch
0,2020-21,771,618,1000000(Ten lakhs),74,National Institute of Technology Warangal,IR-E-U-0025,UG [4 Years Program(s)]
1,2021-22,869,639,1592000(Fifteen lakhs\nninety two thousand),62,National Institute of Technology Warangal,IR-E-U-0025,UG [4 Years Program(s)]
2,2022-23,1064,872,1500000(Fifteen\nLakhs),79,National Institute of Technology Warangal,IR-E-U-0025,UG [4 Years Program(s)]
3,2020-21,526,178,800000(Eight lakhs),51,National Institute of Technology Warangal,IR-E-U-0025,PG [2 Years Program(s)]
4,2021-22,542,336,1175000(Eleven lakhs\nseventy five thousand),45,National Institute of Technology Warangal,IR-E-U-0025,PG [2 Years Program(s)]


In [126]:
all_colleges_data.to_csv('Placement_data_from_2017_to_2023.csv', index = False)

### Get NIRF Ranking Data For each college

In [160]:
def get_nirf_data(url, file_path):
    """Save the text of the ranking table at ``url`` as a CSV file.

    The page is opened in a Chrome WebDriver session and the first table
    inside ``div.dt-layout-cell.dt-layout-full`` is located.  For each row
    after the first that contains ``<td>`` cells, the text of every cell is
    recorded.  The rows are written to ``file_path`` without the index; no
    column names are supplied, so pandas writes its default integer header.

    Errors (including a failure to load the page) are printed instead of
    raised, and the browser is always closed.

    Args:
        url: Address of the ranking page.  A falsy value returns immediately.
        file_path: Destination of the CSV file.

    Returns:
        ``('', '')`` when ``url`` is falsy, otherwise ``None``.
    """
    if not url:
        return '', ''

    driver = webdriver.Chrome()
    try:
        # Navigation happens inside the ``try`` so the ``finally`` clause
        # closes the browser even when loading the page fails.
        driver.get(url)
        time.sleep(3)  # fixed pause before querying the DOM; presumably lets the table render

        div_element = driver.find_element(By.CSS_SELECTOR, 'div.dt-layout-cell.dt-layout-full')
        table_tag = div_element.find_element(By.TAG_NAME, 'table')
        rows = table_tag.find_elements(By.TAG_NAME, 'tr')

        full_table_data = []
        for row in rows[1:]:  # the first <tr> is skipped (presumably the header row)
            tds = row.find_elements(By.TAG_NAME, 'td')
            if not tds:
                continue
            full_table_data.append([cell.text for cell in tds])

        df = pd.DataFrame(full_table_data)
        df.to_csv(file_path, index=False)

    except Exception as e:
        print(f"Error occurred: {e}")
    finally:
        driver.quit()

#### For 2024

In [192]:
get_nirf_data(url_2024, 'Scraped_Data/nirf_data_2024.csv')

In [193]:
df = pd.read_csv('Scraped_Data/nirf_data_2024.csv')
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,IR-E-U-0456,Indian Institute of Technology Madras\nMore De...,,,,,,Chennai,Tamil Nadu,89.46,1.0
1,,,,,,,,,,,
2,IR-E-I-1074,Indian Institute of Technology Delhi\nMore Det...,,,,,,New Delhi,Delhi,86.66,2.0
3,,,,,,,,,,,
4,IR-E-U-0306,Indian Institute of Technology Bombay\nMore De...,,,,,,Mumbai,Maharashtra,83.09,3.0


In [194]:
# Columns '2'-'6' hold no values in the rows previewed above, so they are discarded.
df.drop(['2', '3', '4', '5', '6'], axis = 1, inplace = True)
# dropna() removes every row that still contains a missing value, which
# includes the blank rows interleaved with the real table rows.
df.dropna(inplace = True)

In [195]:
df.columns = ['CollegeID', 'College_name', 'City','State','Score', 'Rank']
df['College_name'] = df['College_name'].apply(lambda x: x.split('\n')[0].strip())
df.to_csv('Scraped_Data/nirf_data_2024.csv', index = False)
df.head()

Unnamed: 0,CollegeID,College_name,City,State,Score,Rank
0,IR-E-U-0456,Indian Institute of Technology Madras,Chennai,Tamil Nadu,89.46,1.0
2,IR-E-I-1074,Indian Institute of Technology Delhi,New Delhi,Delhi,86.66,2.0
4,IR-E-U-0306,Indian Institute of Technology Bombay,Mumbai,Maharashtra,83.09,3.0
6,IR-E-I-1075,Indian Institute of Technology Kanpur,Kanpur,Uttar Pradesh,82.79,4.0
8,IR-E-U-0573,Indian Institute of Technology Kharagpur,Kharagpur,West Bengal,76.88,5.0


#### For 2021

In [196]:
get_nirf_data(url_2021, 'Scraped_Data/nirf_data_2021.csv')

In [197]:
df1 = pd.read_csv('Scraped_Data/nirf_data_2021.csv')
df1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,IR-E-U-0456,Indian Institute of Technology Madras\nMore De...,,,,,,Chennai,Tamil Nadu,90.19,1.0
1,,,,,,,,,,,
2,IR-E-I-1074,Indian Institute of Technology Delhi\nMore Det...,,,,,,New Delhi,Delhi,88.96,2.0
3,,,,,,,,,,,
4,IR-E-U-0306,Indian Institute of Technology Bombay\nMore De...,,,,,,Mumbai,Maharashtra,85.16,3.0


In [198]:
# Columns '2'-'6' hold no values in the rows previewed above, so they are discarded.
df1.drop(['2', '3', '4', '5', '6'], axis = 1, inplace = True)
# dropna() removes every row that still contains a missing value, which
# includes the blank rows interleaved with the real table rows.
df1.dropna(inplace = True)

In [199]:
df1.columns = ['CollegeID', 'College_name', 'City','State','Score', 'Rank']
df1['College_name'] = df1['College_name'].apply(lambda x: x.split('\n')[0].strip())
df1.to_csv('Scraped_Data/nirf_data_2021.csv', index = False)
df1.head()

Unnamed: 0,CollegeID,College_name,City,State,Score,Rank
0,IR-E-U-0456,Indian Institute of Technology Madras,Chennai,Tamil Nadu,90.19,1.0
2,IR-E-I-1074,Indian Institute of Technology Delhi,New Delhi,Delhi,88.96,2.0
4,IR-E-U-0306,Indian Institute of Technology Bombay,Mumbai,Maharashtra,85.16,3.0
6,IR-E-I-1075,Indian Institute of Technology Kanpur,Kanpur,Uttar Pradesh,83.22,4.0
8,IR-E-U-0573,Indian Institute of Technology Kharagpur,Kharagpur,West Bengal,82.03,5.0


In [201]:
# Stack the 2024 table (df) on top of the 2021 table (df1).
combined_df = pd.concat([df, df1], ignore_index=True)
# keep='first' retains the first row seen for each CollegeID; because df comes
# first, a college present in both years keeps its 2024 row.
unique_df = combined_df.drop_duplicates(subset=['CollegeID'], keep='first')

In [203]:
unique_df

Unnamed: 0,CollegeID,College_name,City,State,Score,Rank
0,IR-E-U-0456,Indian Institute of Technology Madras,Chennai,Tamil Nadu,89.46,1.0
1,IR-E-I-1074,Indian Institute of Technology Delhi,New Delhi,Delhi,86.66,2.0
2,IR-E-U-0306,Indian Institute of Technology Bombay,Mumbai,Maharashtra,83.09,3.0
3,IR-E-I-1075,Indian Institute of Technology Kanpur,Kanpur,Uttar Pradesh,82.79,4.0
4,IR-E-U-0573,Indian Institute of Technology Kharagpur,Kharagpur,West Bengal,76.88,5.0
...,...,...,...,...,...,...
194,IR-E-C-43708,College of Engineering Trivandrum,Thiruvananthapuram,Kerala,39.40,95.0
195,IR-E-C-35417,Bharati Vidyapeeth Deemed University College o...,Pune,Maharashtra,39.39,96.0
196,IR-E-U-0163,The Northcap University,Gurugram,Haryana,39.23,97.0
197,IR-E-C-1262,B.M.S. College of Engineering,Bengaluru,Karnataka,39.13,98.0


### Cleaning placement data 

In [3]:
placement_df = pd.read_csv('Placement_data_from_2017_to_2023.csv')

In [4]:
placement_df.head(2)

Unnamed: 0,Academic Year,No. of students graduating in minimum stipulated time,No. of students placed,Median salary of placed graduates per annum(Amount in Rs.),No. of students selected for Higher Studies,College,CollegeID,Batch
0,2020-21,771,618,1000000(Ten lakhs),74,National Institute of Technology Warangal,IR-E-U-0025,UG [4 Years Program(s)]
1,2021-22,869,639,1592000(Fifteen lakhs\nninety two thousand),62,National Institute of Technology Warangal,IR-E-U-0025,UG [4 Years Program(s)]


In [5]:
placement_df.columns = ['Year Placed', 'Total Graduating students', 'Students placed', 'Median salary per annum(Rs.)', 'Higher Studies Student', 'College', 'CollegeID', 'Batch']

In [6]:
placement_df.head(2)

Unnamed: 0,Year Placed,Total Graduating students,Students placed,Median salary per annum(Rs.),Higher Studies Student,College,CollegeID,Batch
0,2020-21,771,618,1000000(Ten lakhs),74,National Institute of Technology Warangal,IR-E-U-0025,UG [4 Years Program(s)]
1,2021-22,869,639,1592000(Fifteen lakhs\nninety two thousand),62,National Institute of Technology Warangal,IR-E-U-0025,UG [4 Years Program(s)]


In [7]:
placement_df['Year Placed'].unique()

array(['2020-21', '2021-22', '2022-23', 'Academic Year', '2017-18',
       '2018-19', '2019-20'], dtype=object)

In [8]:
# Drop the header rows that were scraped as data ('Academic Year' in the year
# column).  .copy() makes the result an independent frame, so the column
# assignments in the following cells do not operate on a slice of the original
# (which triggers pandas' SettingWithCopyWarning).
placement_df = placement_df[placement_df['Year Placed'] != 'Academic Year'].copy()

In [9]:
# Convert an academic-year label to the calendar year it ends in,
# e.g. '2020-21' -> 2021.  NOTE: the century prefix '20' is hard-coded.
placement_df['Year Placed'] = placement_df['Year Placed'].apply(lambda x: int('20'+x.split('-')[-1]))

In [10]:
placement_df.head(1)

Unnamed: 0,Year Placed,Total Graduating students,Students placed,Median salary per annum(Rs.),Higher Studies Student,College,CollegeID,Batch
0,2021,771,618,1000000(Ten lakhs),74,National Institute of Technology Warangal,IR-E-U-0025,UG [4 Years Program(s)]


In [11]:
# Keep only the digits before the first '(' and discard the amount in words,
# e.g. '1000000(Ten lakhs)' -> '1000000'.
placement_df['Median salary per annum(Rs.)'] = placement_df['Median salary per annum(Rs.)'].apply(lambda x: x.split('(')[0])

In [12]:
placement_df['Median salary per annum(Rs.)'] = placement_df['Median salary per annum(Rs.)'].astype(int)

In [13]:
placement_df.head(1)

Unnamed: 0,Year Placed,Total Graduating students,Students placed,Median salary per annum(Rs.),Higher Studies Student,College,CollegeID,Batch
0,2021,771,618,1000000,74,National Institute of Technology Warangal,IR-E-U-0025,UG [4 Years Program(s)]


In [14]:
'UG [4 Years Program(s)]'.split('[')[0] + 'UG [4 Years Program(s)]'.split('[')[1][:1]

'UG 4'

In [15]:
def parse_batch_name(name):
    """Shorten a batch label to ``'<program> <first character after "[">'``.

    For example ``'UG [4 Years Program(s)]'`` becomes ``'UG 4'``.
    """
    pieces = name.split('[')
    program = pieces[0].strip()
    # Only the first character after the bracket is kept.
    duration = pieces[1][:1].strip()
    return program + ' ' + duration

In [16]:
placement_df['Batch'] = placement_df['Batch'].apply(parse_batch_name)

In [17]:
placement_df.head(2)

Unnamed: 0,Year Placed,Total Graduating students,Students placed,Median salary per annum(Rs.),Higher Studies Student,College,CollegeID,Batch
0,2021,771,618,1000000,74,National Institute of Technology Warangal,IR-E-U-0025,UG 4
1,2022,869,639,1592000,62,National Institute of Technology Warangal,IR-E-U-0025,UG 4


In [18]:
placement_df['Program'] = placement_df['Batch'].apply(lambda x: x.split(' ')[0])

In [19]:
placement_df['Duration(Years)'] = placement_df['Batch'].apply(lambda x: x.split(' ')[1])

In [20]:
placement_df['Duration(Years)'] = placement_df['Duration(Years)'].astype(int)

In [21]:
placement_df.drop('Batch', axis = 1, inplace = True)

In [26]:
placement_df['Year Placed'] = placement_df['Year Placed'].astype(int)

In [27]:
placement_df['Total Graduating students'] = placement_df['Total Graduating students'].astype(int)

In [28]:
placement_df['Higher Studies Student'] = placement_df['Higher Studies Student'].astype(int)

In [30]:
placement_df['Students placed'] = placement_df['Students placed'].astype(int)

In [32]:
placement_df.head()

Unnamed: 0,Year Placed,Total Graduating students,Students placed,Median salary per annum(Rs.),Higher Studies Student,College,CollegeID,Program,Duration(Years)
0,2021,771,618,1000000,74,National Institute of Technology Warangal,IR-E-U-0025,UG,4
1,2022,869,639,1592000,62,National Institute of Technology Warangal,IR-E-U-0025,UG,4
2,2023,1064,872,1500000,79,National Institute of Technology Warangal,IR-E-U-0025,UG,4
3,2021,526,178,800000,51,National Institute of Technology Warangal,IR-E-U-0025,PG,2
4,2022,542,336,1175000,45,National Institute of Technology Warangal,IR-E-U-0025,PG,2


In [33]:
placement_df.to_csv('Cleaned_Placement_data_from_2017_to_2023.csv', index = False)

### Getting NIRF Data of all Colleges

In [71]:
df1 = pd.read_csv('Scraped_Data/nirf_data_2024.csv')

In [72]:
df2 = pd.read_csv('Scraped_Data/nirf_data_2021.csv')

In [73]:
df1.head(2)

Unnamed: 0,CollegeID,College_name,City,State,Score,Rank
0,IR-E-U-0456,Indian Institute of Technology Madras,Chennai,Tamil Nadu,89.46,1.0
1,IR-E-I-1074,Indian Institute of Technology Delhi,New Delhi,Delhi,86.66,2.0


In [74]:
df2.head(2)

Unnamed: 0,CollegeID,College_name,City,State,Score,Rank
0,IR-E-U-0456,Indian Institute of Technology Madras,Chennai,Tamil Nadu,90.19,1.0
1,IR-E-I-1074,Indian Institute of Technology Delhi,New Delhi,Delhi,88.96,2.0


In [75]:
all_college_nirf_data = pd.concat([df1, df2], ignore_index= True)

In [76]:
all_college_nirf_data.drop_duplicates(subset = ['CollegeID'], inplace = True)

In [77]:
college_info = all_college_nirf_data.drop(['College_name', 'Score', 'Rank'], axis = 1)

In [78]:
# Attach City and State to each placement row via CollegeID.  how='inner'
# keeps only the placement rows whose CollegeID also appears in college_info.
final_placement_data = pd.merge(placement_df, college_info, on='CollegeID', how = 'inner')

In [79]:
final_placement_data

Unnamed: 0,Year Placed,Total Graduating students,Students placed,Median salary per annum(Rs.),Higher Studies Student,College,CollegeID,Program,Duration(Years),City,State
0,2021,771,618,1000000,74,National Institute of Technology Warangal,IR-E-U-0025,UG,4,Warangal,Telangana
1,2022,869,639,1592000,62,National Institute of Technology Warangal,IR-E-U-0025,UG,4,Warangal,Telangana
2,2023,1064,872,1500000,79,National Institute of Technology Warangal,IR-E-U-0025,UG,4,Warangal,Telangana
3,2021,526,178,800000,51,National Institute of Technology Warangal,IR-E-U-0025,PG,2,Warangal,Telangana
4,2022,542,336,1175000,45,National Institute of Technology Warangal,IR-E-U-0025,PG,2,Warangal,Telangana
...,...,...,...,...,...,...,...,...,...,...,...
1286,2019,165,29,587000,16,National Institute of Technology Hamirpur,IR-E-U-0189,PG,2,Hamirpur,Himachal Pradesh
1287,2020,173,17,580000,28,National Institute of Technology Hamirpur,IR-E-U-0189,PG,2,Hamirpur,Himachal Pradesh
1288,2018,0,0,0,0,National Institute of Technology Hamirpur,IR-E-U-0189,PG-Integrated,5,Hamirpur,Himachal Pradesh
1289,2019,58,45,681000,4,National Institute of Technology Hamirpur,IR-E-U-0189,PG-Integrated,5,Hamirpur,Himachal Pradesh


In [83]:
final_placement_data = final_placement_data[['CollegeID', 'College', 'City', 'State', 'Program', 'Duration(Years)', 'Total Graduating students', 'Year Placed', 'Students placed','Median salary per annum(Rs.)', 'Higher Studies Student']]

In [85]:
final_placement_data.head()

Unnamed: 0,CollegeID,College,City,State,Program,Duration(Years),Total Graduating students,Year Placed,Students placed,Median salary per annum(Rs.),Higher Studies Student
0,IR-E-U-0025,National Institute of Technology Warangal,Warangal,Telangana,UG,4,771,2021,618,1000000,74
1,IR-E-U-0025,National Institute of Technology Warangal,Warangal,Telangana,UG,4,869,2022,639,1592000,62
2,IR-E-U-0025,National Institute of Technology Warangal,Warangal,Telangana,UG,4,1064,2023,872,1500000,79
3,IR-E-U-0025,National Institute of Technology Warangal,Warangal,Telangana,PG,2,526,2021,178,800000,51
4,IR-E-U-0025,National Institute of Technology Warangal,Warangal,Telangana,PG,2,542,2022,336,1175000,45


In [86]:
final_placement_data.to_csv('final_placement_dataset.csv', index = False)

In [141]:
df = pd.read_csv('final_placement_dataset.csv')

### Handling colleges that appear under two names

Some colleges appear under more than one name because their names changed over time.

In [142]:
# Map CollegeID -> the name to keep, for every college that appears under
# more than one name.
ids = {}
for college_id, group in df.groupby('CollegeID'):
    colleges = group['College'].unique()
    if len(colleges) > 1:
        # Keep the longest spelling.  Every variant is considered, not only the
        # first two; scanning in reverse order makes a tie resolve to the later
        # variant, exactly as the two-name comparison did.
        college_name = max(colleges[::-1], key=len)
        if college_name:
            ids[college_id] = college_name

In [143]:
double_names = pd.Series(ids)

In [144]:
double_names

IR-E-C-24004                         AU College of Engineering A 
IR-E-C-41593                        COEP Technological University
IR-E-I-1480     Thapar Institute of Engineering and Technology...
IR-E-U-0105     Indraprastha Institute of Information Technolo...
IR-E-U-0205     Indian Institute of Technology Indian School o...
IR-E-U-0237     National Institute of Technology Karnataka Sur...
IR-E-U-0334     Visvesvaraya National Institute of Technology ...
IR-E-U-0374     Dr B R Ambedkar National Institute of Technolo...
IR-E-U-0391      Birla Institute of Technology and Science Pilani
IR-E-U-0476     Shanmugha Arts Science Technology and Research...
IR-E-U-0564            University of Petroleum and Energy Studies
IR-E-U-0584     Indian Institute of Engineering Science and Te...
dtype: object

In [145]:
# Overwrite the name of every college listed in double_names with the chosen
# spelling.  A single masked assignment replaces the row-by-row loop, which
# rebuilt set(double_names.index) for every row.
renamed = df['CollegeID'].isin(double_names.index)
df.loc[renamed, 'College'] = df.loc[renamed, 'CollegeID'].map(double_names)

In [147]:
df.head()

Unnamed: 0,CollegeID,College,City,State,Program,Duration(Years),Total Graduating students,Year Placed,Students placed,Median salary per annum(Rs.),Higher Studies Student
0,IR-E-U-0025,National Institute of Technology Warangal,Warangal,Telangana,UG,4,771,2021,618,1000000,74
1,IR-E-U-0025,National Institute of Technology Warangal,Warangal,Telangana,UG,4,869,2022,639,1592000,62
2,IR-E-U-0025,National Institute of Technology Warangal,Warangal,Telangana,UG,4,1064,2023,872,1500000,79
3,IR-E-U-0025,National Institute of Technology Warangal,Warangal,Telangana,PG,2,526,2021,178,800000,51
4,IR-E-U-0025,National Institute of Technology Warangal,Warangal,Telangana,PG,2,542,2022,336,1175000,45


In [148]:
df.to_csv('final_placement_dataset.csv', index = False)