In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

url = "https://itc.gymkhana.iitb.ac.in/wncc/soc/"


In [3]:
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

project_urls = soup.find_all("div", class_="col-lg-4 col-6 mb-4 shuffle-item")
project_urls = ["https://itc.gymkhana.iitb.ac.in" +
                project_detail.find("a").get("href") for project_detail in project_urls]

project_names = soup.find_all(
    "p", class_="lead text-center font-weight-bold text-dark")
project_names = [project_name.text for project_name in project_names]

print(project_names)
print(project_urls)


['TEXT SUMMARIZATION WEB APP', 'Competitive Programming', 'Write yourself a Git!', 'File Compression System', 'FAST-G', 'Developing Trading Strategy with Pine Script', 'Real time Driver Drowsiness detection System', 'The Image Cartoonifier', 'Speech to Speech Translation', 'Competitive Programming - Newbie to Master', 'Path-Planning of Swarm Robotics in 2/3D space', 'Image Super Resolution using Deep Neural Networks', 'Enhance Low Resolution Image using GANs', 'MyBox', 'Deep Carlsen', 'InstiExchange - A web marketplace for IITB', 'Homomorphic Encryption for k-NN on the Cloud', 'TRayCer', 'Social media website with MERN', 'Dive into Digital Image Processing', 'Neural Quest', 'To the Quantum Future', 'Street Fighter II - Reinforcement Learning', 'Combinatorial Computing', 'Navigating the Waters of AI', 'Autonomous Driving Vehicle', 'Author Identification through Stylometric Analysis', 'Breakout Genius - Using RL to Build an AI Game Master', 'Image Captioning', 'Cricbuzz', 'Competitive Pr

In [7]:
project_info = {}

print(len(project_names))

for i in range(len(project_names)):
    project_info[project_names[i]] = {}
    project_info[project_names[i]]["project_url"] = project_urls[i]
    response = requests.get(project_urls[i])
    soup = BeautifulSoup(response.content, "html.parser")
    project_info[project_names[i]]["mentors"] = [mentor.text for mentor in soup.find(
        "h4", class_="display3").find_next_sibling("ul").find_all("li")]
    
    timeline = soup.find("table", class_="table-striped")
    weeks = []
    work = []

    for row in timeline.find_all("tr"):
        if row.find_all("td") == []:
            continue
        weeks.append(row.find_all("td")[0].text)
        work.append(row.find_all("td")[1].text)

    df = pd.DataFrame({"Week": weeks, "Work": work})

    project_info[project_names[i]]["timeline"] = df.to_dict("records")
    project_info[project_names[i]]["prerequisites"] = re.search(r"Prerequisites:\s*(.*?)<", str(soup), re.I)
    if project_info[project_names[i]]["prerequisites"] != None:
        project_info[project_names[i]]["prerequisites"] = project_info[project_names[i]]["prerequisites"].group(1)
    else:
        project_info[project_names[i]]["prerequisites"] = "N/A"
    

project_info_df = pd.DataFrame(project_info)

print(project_info_df)


71
                                      TEXT SUMMARIZATION WEB APP  \
project_url    https://itc.gymkhana.iitb.ac.in/wncc/soc/proje...   
mentors              [kundeshwar vijay pundalik, ADRESH ALAGADE]   
timeline       [{'Week': 'Week 1-3', 'Work': 'EDA and Some ot...   
prerequisites  BASIC PYTHON , (HTML OR STREAMLIT) , BASIC KNO...   

                                         Competitive Programming  \
project_url    https://itc.gymkhana.iitb.ac.in/wncc/soc/proje...   
mentors                                         [Virendra Kabra]   
timeline       [{'Week': 'Week 1', 'Work': 'C++ STL Basics'},...   
prerequisites                                                N/A   

                                           Write yourself a Git!  \
project_url    https://itc.gymkhana.iitb.ac.in/wncc/soc/proje...   
mentors                                  [Abhijeet Prasad Bodas]   
timeline       [{'Week': 'Week 1', 'Work': 'Install a GNU/Lin...   
prerequisites                              

In [8]:
# Validate and clean data
for name, info in project_info.items():
    if "timeline" in info:
        for i, week_info in enumerate(info["timeline"]):
            if "week" not in week_info:
                print(f"Missing week information for project {name}, timeline entry {i}. Setting week to 0.")
                week_info["week"] = 0
            if "description" not in week_info:
                print(f"Missing description for project {name}, timeline entry {i}. Setting description to ''.")
                week_info["description"] = ""
            elif not isinstance(week_info["week"], int):
                print(f"Invalid week value for project {name}, timeline entry {i}. Converting to int.")
                week_info["week"] = int(week_info["week"])
            week_info["description"] = week_info["description"].strip()
    else:
        print(f"Missing timeline information for project {name}. Adding empty timeline.")
        project_info[name]["timeline"] = []

project_info_df = pd.DataFrame(project_info)

Missing week information for project TEXT SUMMARIZATION WEB APP, timeline entry 0. Setting week to 0.
Missing description for project TEXT SUMMARIZATION WEB APP, timeline entry 0. Setting description to ''.
Missing week information for project TEXT SUMMARIZATION WEB APP, timeline entry 1. Setting week to 0.
Missing description for project TEXT SUMMARIZATION WEB APP, timeline entry 1. Setting description to ''.
Missing week information for project TEXT SUMMARIZATION WEB APP, timeline entry 2. Setting week to 0.
Missing description for project TEXT SUMMARIZATION WEB APP, timeline entry 2. Setting description to ''.
Missing week information for project TEXT SUMMARIZATION WEB APP, timeline entry 3. Setting week to 0.
Missing description for project TEXT SUMMARIZATION WEB APP, timeline entry 3. Setting description to ''.
Missing week information for project Competitive Programming, timeline entry 0. Setting week to 0.
Missing description for project Competitive Programming, timeline entry 0

In [9]:
project_info_df.to_csv("project_info.csv", index=False)

In [11]:
project_info_df

Unnamed: 0,TEXT SUMMARIZATION WEB APP,Competitive Programming,Write yourself a Git!,File Compression System,FAST-G,Developing Trading Strategy with Pine Script,Real time Driver Drowsiness detection System,The Image Cartoonifier,Speech to Speech Translation,Competitive Programming - Newbie to Master,...,Institute OnChain Voting System with ZKPs,Using Deep RL and NLP to allocate stocks in portfolio,Blockchain Development- It's not that difficult!,Light field imaging and Dual Attention Networks,Stable Diffusion,FlappeRL,Hands-on Computational Physics,Image Colorization,PaperPal,JobFinderX
project_url,https://itc.gymkhana.iitb.ac.in/wncc/soc/proje...,https://itc.gymkhana.iitb.ac.in/wncc/soc/proje...,https://itc.gymkhana.iitb.ac.in/wncc/soc/proje...,https://itc.gymkhana.iitb.ac.in/wncc/soc/proje...,https://itc.gymkhana.iitb.ac.in/wncc/soc/proje...,https://itc.gymkhana.iitb.ac.in/wncc/soc/proje...,https://itc.gymkhana.iitb.ac.in/wncc/soc/proje...,https://itc.gymkhana.iitb.ac.in/wncc/soc/proje...,https://itc.gymkhana.iitb.ac.in/wncc/soc/proje...,https://itc.gymkhana.iitb.ac.in/wncc/soc/proje...,...,https://itc.gymkhana.iitb.ac.in/wncc/soc/proje...,https://itc.gymkhana.iitb.ac.in/wncc/soc/proje...,https://itc.gymkhana.iitb.ac.in/wncc/soc/proje...,https://itc.gymkhana.iitb.ac.in/wncc/soc/proje...,https://itc.gymkhana.iitb.ac.in/wncc/soc/proje...,https://itc.gymkhana.iitb.ac.in/wncc/soc/proje...,https://itc.gymkhana.iitb.ac.in/wncc/soc/proje...,https://itc.gymkhana.iitb.ac.in/wncc/soc/proje...,https://itc.gymkhana.iitb.ac.in/wncc/soc/proje...,https://itc.gymkhana.iitb.ac.in/wncc/soc/proje...
mentors,"[kundeshwar vijay pundalik, ADRESH ALAGADE]",[Virendra Kabra],[Abhijeet Prasad Bodas],[Amritaansh Narain],"[Ameya Vikram Singh, Prerak Contractor, Anish ...","[Vivek Kumar, Rajik Kumar]",[Vedang Bale],"[Jaideep Chandra, Sridhar, Bhavani Sankar]",[Swapnoneel Kayal],"[Parth Dwivedi, Kunal Kundwani]",...,"[Nikhil Tiwari, Ananya Khandelwal, Ishit Garg]","[Nikhil Tiwari, Ishan Jain (200100073)]","[Pragyesh Gupta, Dhruvkumar Patil (200100056),...","[Abeer Mishra, Shreyas Patil (200260052)]","[Shubham Hazra, Om Godage (21d100006), Kartik ...","[Kartik Gokhale, Hastyn Doshi]","[Kaustav Prasada, Varad Mahashabde (200260057)]",[Valay Bundele],"[Vishruth N, Abhinav Raghuvanshi (200040008)]","[Baggam Rakshan Tej (210070019), Pamba Ravindr..."
timeline,"[{'Week': 'Week 1-3', 'Work': 'EDA and Some ot...","[{'Week': 'Week 1', 'Work': 'C++ STL Basics', ...","[{'Week': 'Week 1', 'Work': 'Install a GNU/Lin...","[{'Week': 'Week 1&2', 'Work': 'Basic overview ...","[{'Week': 'Week 1', 'Work': 'Basics of graphs ...","[{'Week': 'Week 1-3', 'Work': 'Introduction to...","[{'Week': 'Week 1', 'Work': 'Python Programmin...","[{'Week': 'Week 1', 'Work': 'Introduction to P...","[{'Week': 'Week 1', 'Work': 'Brush up on / Pyt...","[{'Week': 'Week 1', 'Work': 'Start out with ba...",...,"[{'Week': 'Week 1', 'Work': 'Gain basic knowle...","[{'Week': 'Week 1', 'Work': 'Read basics of de...","[{'Week': 'Week 1', 'Work': ' Introduction to ...","[{'Week': 'Week 1-2', 'Work': ' Basics of Mach...","[{'Week': 'Week 1', 'Work': 'Basics of Regress...","[{'Week': 'Week 1', 'Work': 'Catching up on Py...","[{'Week': 'Week 1-2', 'Work': '- Introduction ...","[{'Week': 'Week 1-2', 'Work': 'Read about the ...","[{'Week': 'Week 1', 'Work': 'Set up project re...","[{'Week': 'Week 1', 'Work': 'HTML, CSS', 'week..."
prerequisites,"BASIC PYTHON , (HTML OR STREAMLIT) , BASIC KNO...",,,Assume you have basic understanding of C++ and...,,,,,,,...,,,,,,,,,,
