# Scraping data from OERCOMMONS

In [3]:
# Importing necessary libraries for scraping

import requests
from bs4 import BeautifulSoup
from typing import Optional, List

# Helper Functions for Scraping

def get_element_text(element, index: int = 0, default: str = 'N/A') -> str:

    try:
        value = element.find_all('dd')[index].text.strip()
        return value if value else default
    except (AttributeError, IndexError):
        return default

def get_metadata_content(soup, meta_name: str, default: str = 'N/A') -> str:

    try:
        return soup.find('meta', {'itemprop': meta_name})['content'].strip()
    except (AttributeError, TypeError):
        return default

# Function to Fetch Course Data

def scrape_course_data(course, base_url: str) -> Optional[dict]:

    try:
        # Fetch course-specific information
        title = course.find('div', class_='item-title').text.strip()
        link = course.find('a', class_='item-link')['href']
        author = get_element_text(course, index=0)
        subject = get_element_text(course, index=1)
        material_type = get_element_text(course, index=2)
        description = course.find('div', class_='abstract').text.strip() if course.find('div', 'abstract') else 'N/A'

        # Fetch the lesson URL and parse the details
        lesson_url = base_url + link
        response = requests.get(lesson_url)
        response.raise_for_status()
        lesson_soup = BeautifulSoup(response.text, 'html.parser')

        keywords = get_metadata_content(lesson_soup, 'keywords')
        date_created = get_metadata_content(lesson_soup, 'dateCreated')
        resource_url = lesson_soup.find('a', {'id': 'goto'})['href'] if lesson_soup.find('a', {'id': 'goto'}) else 'N/A'

        return {
            "title": title,
            "author": author,
            "subject": subject,
            "material_type": material_type,
            "description": description,
            "keywords": keywords,
            "date_created": date_created,
            "resource_url": resource_url
        }
    except Exception as e:
        print(f"Error processing course data: {e}")
        return None

# Pydantic Validation and Data Handling

In [4]:
# Likewise, importing necesarry libraries for Pydantic work

from pydantic import BaseModel, HttpUrl, TypeAdapter
import json

# Define Pydantic Model for Lessons

class Lesson(BaseModel):

    title: str
    author: Optional[str]
    subject: str
    material_type: str
    description: Optional[str]
    keywords: Optional[str]
    date_created: Optional[str]
    resource_url: HttpUrl

# Function to Scrape All Lessons

def scrape_lessons(base_url: str, search_url: str) -> List[dict]:

    try:
        response = requests.get(search_url)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching search URL: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    courses = soup.find_all('article', class_='js-index-item')

    lesson_data = []

    for course in courses:
        lesson = scrape_course_data(course, base_url)
        if lesson:
            lesson_data.append(lesson)

    return lesson_data

# Function to Save Data to JSON

def save_data_to_json(data, file_name='lessons_sync.json'):

    with open(file_name, 'w') as f:

        # Convert HttpUrl fields to strings for serialization

        json_compatible_data = [lesson.dict() for lesson in data]
        for lesson in json_compatible_data:
            lesson["resource_url"] = str(lesson["resource_url"])
        json.dump(json_compatible_data, f, indent=4)
    print(f"Data saved to {file_name}")

# Creating json file loading function

In [10]:
# Load JSON Data from File

def load_json(file_name = 'lessons_sync.json'):

    """Load data from a JSON file."""

    try:
        with open(file_name, 'r') as f:
            data = json.load(f)
        return data
    except FileNotFoundError:
        print(f"File '{file_name}' not found.")
        return None
    except json.JSONDecodeError:
        print(f"Error decoding JSON from file '{file_name}'.")
        return None


# The function that brings it all together; the main ETL function

In [9]:
# Main ETL Function

def run_etl():

    base_url = 'https://oercommons.org'
    search_url = 'https://oercommons.org/search?batch_size=20&sort_by=visits&view_mode=summary&f.general_subject=english-language-arts&f.sublevel=lower-primary'

    # Extracting lessons

    print("Scraping lessons...")
    raw_lessons = scrape_lessons(base_url, search_url)

    # Validating data using Pydantic

    print("Validating scraped data with Pydantic...")
    try:
        type_adapter = TypeAdapter(List[Lesson])
        lessons = type_adapter.validate_python(raw_lessons)
    except Exception as e:
        print(f"Validation error: {e}")
        lessons = []

    # Saving data to JSON

    print("Saving data to JSON...")
    save_data_to_json(lessons)

# Execute the ETL process

run_etl()

Scraping lessons...
Validating scraped data with Pydantic...
Saving data to JSON...
Data saved to lessons_sync.json


In [11]:
# Loading the data from file after validation

lessons_data = load_json()

if lessons_data:

    print("Loaded lessons data:")
    print(lessons_data)


Loaded lessons data:
[{'title': '1st Grade "OR", "OAR", "ORE" Spelling Patterns', 'author': 'English Language Arts', 'subject': 'Lesson Plan', 'material_type': 'Molly Simpson', 'description': 'This Lesson Plan was created by Molly Simpson.\xa0The attached Lesson Plan is …\n                  \n\n\nThis Lesson Plan was created by Molly Simpson.\xa0The attached Lesson Plan is designed for Grade 1\xa0English Language Arts students. Students will be able to understand and spell the words using their knowledge of the various spellings of the same sound: "oar, "or", and "ore". This Lesson Plan can be used with small group instruction or with\xa0a whole class. This lesson plan adresses the following NDE Standards: .\xa0It is expected that this Lesson Plan will take students 20 to 80\xa0minutes to complete depending on the use of small group or large group instruction.\n\n\nMore\nLess', 'keywords': 'NE ELA', 'date_created': '2020-07-30T17:24:29.680357', 'resource_url': 'https://oercommons.org/c