In [1]:
import requests
from bs4 import BeautifulSoup
import urllib.parse

import time
import json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException

In [39]:
# Browser configuration: Chrome runs headless (no visible window). Delete the
# "--headless" argument below to watch the browser while debugging.
chrome_options = Options()
chrome_options.add_argument("--headless")

# Path to the local chromedriver binary; adjust for your machine.
service = Service('/usr/local/bin/chromedriver')
driver = webdriver.Chrome(options=chrome_options, service=service)

# Topic names used as the `t=` filter value of the search URL, one query per
# entry. Order is preserved in the scraped output, so keep additions at the end.
topics = [
    "Engineering",
    "Science",
    "Mathematics",
    "Computer Science",
    "Physics",
    "Mechanical Engineering",
    "Systems Engineering",
    "Electrical Engineering",
    "Differential Equations",
    "Probability and Statistics",
    "Linear Algebra",
    "Biology",
    "Calculus",
    "Earth Science",
    "Materials Science and Engineering",
    "Applied Mathematics",
    "Chemistry",
    "Algorithms and Data Structures",
    "Thermodynamics",
    "Cognitive Science",
    "Quantum Mechanics",
    "Fluid Mechanics",
    "Electromagnetism",
    "Software Design and Engineering",
    "Discrete Mathematics",
    "Biological Engineering",
    "Chemical Engineering",
    "Computational Modeling and Simulation",
    "Classical Mechanics",
    "Artificial Intelligence",
    "Mechanical Design",
    "Electronics",
    "Theory of Computation",
    "Digital Systems",
    "Energy",
    "Ocean Engineering",
    "Signal Processing",
    "Computer Design and Engineering",
    "Robotics and Control Systems",
    "Programming Languages",
    "Solid Mechanics",
    "Civil Engineering",
    "Computation",
    "Digital Media",
    "Systems Optimization",
    "Geophysics",
    "Algebra and Number Theory",
    "Atomic, Molecular, Optical Physics",
    "Mathematical Analysis",
    "Physical Chemistry",
    "Data Mining",
    "Hydrodynamics",
    "Environmental Engineering",
    "Information Technology",
    "Systems Design",
    "Topology and Geometry",
    "Neuroscience",
    "Nanotechnology",
    "Geology",
    "Propulsion Systems",
    "Telecommunications",
    "Materials Selection",
    "Oceanography",
    "Molecular Biology",
    "Genetics",
    "Transport Processes",
    "Nuclear Engineering",
    "Atmospheric Science",
    "Nuclear Physics",
    "Computer Networks",
    "Dynamics and Control",
    "Biochemistry",
    "Biomaterials",
    "Environmental Science",
    "Cell Biology",
    "Graphics and Visualization",
    "Particle Physics",
    "Computational Science and Engineering",
    "Theoretical Physics",
    "Computation and Systems Biology",
    "Organic Chemistry",
    "Neurobiology",
    "Structural Engineering",
    "Game Design",
    "Computational Biology",
    "Game Theory",
    "Structural Mechanics",
    "Anatomy and Physiology",
    "Biomedicine",
    "Planetary Science",
    "Electronic Materials",
    "Technology",
    "Electricity",
    "Nuclear",
    "Sensory-Neural Systems",
    "Econometrics",
    "Human-Computer Interfaces",
    "Microtechnology",
    "Hydrology and Water Resource Systems",
    "Cell and Tissue Engineering",
    "Analytical Chemistry",
    "Numerical Simulation",
    "Biophysics",
    "Geography",
    "Aquatic Sciences and Water Quality Control",
    "Cryptography",
    "Aerodynamics",
    "Condensed Matter Physics",
    "Medical Imaging",
    "Spectroscopy",
    "Climate",
    "Ecology",
    "Astrophysics",
    "Metallurgical Engineering",
    "Hydrodynamics and Coastal Engineering",
    "Fossil Fuels",
    "Ocean Structures",
    "Electric Power",
    "Transportation Engineering",
    "Functional Genomics",
    "Biomedical Signal and Image Processing",
    "Biotechnology",
    "Synthetic Biology",
    "Architectural Engineering",
    "Microbiology",
    "Polymeric Materials",
    "Inorganic Chemistry",
    "Graphic Design",
    "Biomechanics",
    "Astrodynamics",
    "Pharmacology and Toxicology",
    "Polymers",
    "High Energy Physics",
    "Nuclear Materials",
    "Pathology and Pathophysiology",
    "Ocean Exploration",
    "Mathematical Logic",
    "Hydrogen and Alternatives",
    "Biostatistics",
    "Geobiology",
    "Virology",
    "Structural Biology",
    "Geotechnical Engineering",
    "Molecular Engineering",
    "Bioastronautics",
    "Epidemiology",
    "Radiological Engineering",
    "Cellular and Molecular Medicine",
    "Immunology",
    "Health and Exercise Science",
    "Composite Materials",
    "Stem Cells",
]

# Scrape the OCW "Lecture Videos" search results for every topic and write the
# collected entries to lecture_videos_data.json.
#
# Result shape:
#     all_data = {topic: [{"resource_course_title": str, "title": str}, ...]}
#
# Relies on `driver` (a live Selenium WebDriver) and `topics` (list of str)
# already being defined.
all_data = {}

try:
    for topic in topics:
        print(f"Processing topic: {topic}")

        # Percent-encode the whole topic rather than only its spaces, so that
        # reserved characters (e.g. the commas in "Atomic, Molecular, Optical
        # Physics") cannot be misread as query-string syntax.
        encoded_topic = urllib.parse.quote(topic, safe="")
        url = f"https://ocw.mit.edu/search/?r=Lecture%20Videos&t={encoded_topic}&type=resourcefile&u=compact"
        driver.get(url)
        time.sleep(2)  # Initial wait for the page to load

        # Page height before the first scroll; an unchanged height after a
        # scroll pass is treated as "no more results were loaded".
        last_height = driver.execute_script("return document.body.scrollHeight")
        topic_data = []
        seen_courses = set()  # (resource_course_title, title) pairs already recorded

        # Infinite-scroll loop: scroll, harvest, stop once the page stops growing.
        while True:
            # Jump to the bottom so the page requests the next batch of results.
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(3)  # Wait for the new content to load

            # Nudge back up slightly to trigger lazy loading.
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight - 500);")
            time.sleep(1)

            # Every pass re-reads all cards currently in the DOM; cards seen on
            # an earlier pass are filtered out through `seen_courses`.
            course_blocks = driver.find_elements(By.CLASS_NAME, "card-contents")

            for course_block in course_blocks:
                # Only cards carrying a 'cover-image-video' element are kept.
                if not course_block.find_elements(By.CLASS_NAME, "cover-image-video"):
                    continue

                try:
                    # Course the resource belongs to (link text inside the
                    # 'resource-course-title' element).
                    resource_course_title_element = course_block.find_element(
                        By.CLASS_NAME, "resource-course-title"
                    ).find_element(By.TAG_NAME, "a")
                    resource_course_title = resource_course_title_element.text
                except NoSuchElementException:
                    resource_course_title = None

                try:
                    # Lecture title (link text inside the 'resource-title' element).
                    title_element = course_block.find_element(
                        By.CLASS_NAME, "resource-title"
                    ).find_element(By.TAG_NAME, "a")
                    title = title_element.text
                except NoSuchElementException:
                    title = None

                # Record the entry only when both fields are present (non-empty)
                # and the pair has not been stored for this topic yet.
                if resource_course_title and title and (resource_course_title, title) not in seen_courses:
                    seen_courses.add((resource_course_title, title))
                    topic_data.append({
                        "resource_course_title": resource_course_title,
                        "title": title
                    })

            # Stop when the last scroll pass did not make the page any taller.
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                print(f"Reached end of results for {topic}.")
                break
            last_height = new_height  # Baseline for the next scroll pass

        # Store topic data in the main dictionary
        all_data[topic] = topic_data
finally:
    # Always shut the browser session down, including when a topic fails
    # part-way through this long run, so the browser is not left open.
    driver.quit()

# Save the collected data to a JSON file (only reached if every topic succeeded).
with open("lecture_videos_data.json", "w", encoding="utf-8") as file:
    json.dump(all_data, file, ensure_ascii=False, indent=4)

print("Data saved to lecture_videos_data.json")

Processing topic: Engineering
Reached end of results for Engineering.
Processing topic: Science
Reached end of results for Science.
Processing topic: Mathematics
Reached end of results for Mathematics.
Processing topic: Computer Science
Reached end of results for Computer Science.
Processing topic: Physics
Reached end of results for Physics.
Processing topic: Mechanical Engineering
Reached end of results for Mechanical Engineering.
Processing topic: Systems Engineering
Reached end of results for Systems Engineering.
Processing topic: Electrical Engineering
Reached end of results for Electrical Engineering.
Processing topic: Differential Equations
Reached end of results for Differential Equations.
Processing topic: Probability and Statistics
Reached end of results for Probability and Statistics.
Processing topic: Linear Algebra
Reached end of results for Linear Algebra.
Processing topic: Biology
Reached end of results for Biology.
Processing topic: Calculus
Reached end of results for Ca