## 1. Installing required libraries



In [1]:
%pip install beautifulsoup4 requests pydantic


Defaulting to user installation because normal site-packages is not writeable


## 2. Importing libraries


In [2]:
import json
from typing import Optional
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from pydantic import BaseModel, HttpUrl



## 3. Using pydantic BaseModel

In [3]:

class Lesson(BaseModel):
    """Validated record describing one scraped lesson.

    ``title``, ``subject``, ``material_type`` and ``resource_url`` are
    required. The remaining fields are optional and default to ``None``; the
    explicit default matters because pydantic v2 treats an ``Optional[...]``
    annotation without a default as a *required* field that merely accepts
    ``None``.
    """

    title: str
    author: Optional[str] = None
    subject: str
    material_type: str
    description: Optional[str] = None
    license_info: Optional[str] = None
    keywords: Optional[str] = None
    date_created: Optional[str] = None
    resource_url: HttpUrl  # validated by pydantic; must be an absolute http(s) URL

    def to_serializable(self):
        """Return the lesson as a plain ``dict`` suitable for ``json.dump``.

        Returns:
            dict: every model field keyed by name, with ``resource_url``
            converted to ``str``.
        """
        # pydantic v2 renamed ``dict()`` to ``model_dump()`` and deprecated the
        # old name; fall back to ``dict()`` so the model still works on v1.
        if hasattr(self, 'model_dump'):
            data = self.model_dump()
        else:
            data = self.dict()
        data['resource_url'] = str(self.resource_url)  # Ensure the URL is converted to string
        return data


## 4. Using Beautiful Soup to scrape data from OER Commons

In [4]:
# Root of the site; hrefs found on the search page are resolved against it.
base_url = 'https://oercommons.org'

# Search-results page that gets scraped, assembled from its query parameters.
search_url = (
    f'{base_url}/search'
    '?batch_size=20'
    '&sort_by=visits'
    '&view_mode=summary'
    '&f.general_subject=english-language-arts'
    '&f.sublevel=lower-primary'
)

def _text_or_na(node):
    """Return the stripped text of a BeautifulSoup node, or 'N/A' if the node is missing."""
    return node.text.strip() if node is not None else 'N/A'


def _meta_content(soup, itemprop):
    """Return the ``content`` of the ``<meta itemprop=...>`` tag, or 'N/A' if unavailable."""
    tag = soup.find('meta', {'itemprop': itemprop})
    content = tag.get('content') if tag is not None else None
    return content if content is not None else 'N/A'


def scrape_lessons(url=None, timeout=10):
    """Scrape lesson metadata from a search-results page and each lesson's detail page.

    For every ``<article class="js-index-item">`` on the results page, the
    summary fields are read from the listing, the lesson's own page is fetched
    for keywords / creation date / resource link, and the combined record is
    validated through ``Lesson``.

    Args:
        url: Search-results page to scrape. Defaults to the module-level
            ``search_url``.
        timeout: Seconds to wait on each HTTP request before giving up, so a
            stalled connection cannot hang the notebook.

    Returns:
        list[dict]: One JSON-serializable dict per lesson that was scraped and
        validated successfully. Lessons that fail are reported with ``print``
        and skipped.

    Raises:
        requests.RequestException: If the search-results page itself cannot be
            fetched or answers with an HTTP error status.
    """
    response = requests.get(url or search_url, timeout=timeout)
    # Fail loudly instead of parsing an error page and returning an empty list.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    lesson_data = []

    for course in soup.find_all('article', class_='js-index-item'):
        try:
            # Title and link are mandatory; if either is missing the lookup
            # raises and the lesson is skipped by the handler below.
            title = course.find('div', class_='item-title').text.strip()
            link = course.find('a', class_='item-link')['href']

            # The first three <dd> entries are read as author, subject and
            # material type; pad with 'N/A' so a short list cannot raise
            # IndexError and discard the whole lesson.
            details = course.find('dl')
            dd_texts = [dd.text.strip() for dd in details.find_all('dd')] if details is not None else []
            author, subject, material_type = (dd_texts + ['N/A'] * 3)[:3]

            description = _text_or_na(course.find('div', class_='abstract'))
            license_info = _text_or_na(course.find('div', class_='cou-bucket'))

            # urljoin handles absolute, root-relative and page-relative hrefs,
            # unlike plain string concatenation with the site root.
            lesson_url = urljoin(response.url, link)
            lesson_response = requests.get(lesson_url, timeout=timeout)
            lesson_response.raise_for_status()
            lesson_soup = BeautifulSoup(lesson_response.content, 'html.parser')

            keywords = _meta_content(lesson_soup, 'keywords')
            date_created = _meta_content(lesson_soup, 'dateCreated')

            # resource_url is validated as HttpUrl, so it must be an absolute
            # URL: resolve a relative "goto" href against the lesson page, and
            # fall back to the lesson page itself when there is no "goto" link
            # (a placeholder such as 'N/A' would always fail validation).
            goto_link = lesson_soup.find('a', {'id': 'goto'})
            goto_href = goto_link.get('href') if goto_link is not None else None
            resource_url = urljoin(lesson_response.url, goto_href) if goto_href else lesson_url

            # Create lesson object using Pydantic validation
            lesson = Lesson(
                title=title,
                author=author,
                subject=subject,
                material_type=material_type,
                description=description,
                license_info=license_info,
                keywords=keywords,
                date_created=date_created,
                resource_url=resource_url
            )

            lesson_data.append(lesson.to_serializable())  # Convert data for JSON serialization

        except Exception as e:
            # Best effort: one malformed or unreachable lesson must not abort the run.
            print(f"Error processing lesson: {e}")

    return lesson_data



## 6. Creating a JSON file to save the scraped data


In [5]:
import json

def save_data_to_json(data, file_name='lessons.json'):
    """Write ``data`` to ``file_name`` as JSON indented by four spaces.

    Args:
        data: Any JSON-serializable object (here, the list of lesson dicts).
        file_name: Destination path; an existing file is overwritten.

    Prints a confirmation message once the data has been dumped.
    """
    with open(file_name, mode='w') as json_file:
        json.dump(data, fp=json_file, indent=4)  # Data is now serializable
        print(f"Data saved to {file_name}")


## 7. Finally, scraping data from OER Commons and saving it to a JSON file


In [6]:
def run_etl():
    """Run the pipeline end to end: scrape the lessons, then write them to JSON.

    A progress message is printed before each stage. The scraped records are
    handed to ``save_data_to_json`` with its default file name.
    """
    print("Scraping lessons...")
    scraped_lessons = scrape_lessons()

    print("Saving data to JSON...")
    save_data_to_json(scraped_lessons)

# Run the ETL process: scrape the lessons, then save them to the default JSON file
run_etl()


Scraping lessons...
Saving data to JSON...
Data saved to lessons.json
