In [1]:
import requests
import os
import json
import time
import json
import pandas as pd

from typing import Dict, List
from requests.auth import HTTPBasicAuth
from datetime import datetime
from bs4 import BeautifulSoup
from dotenv import load_dotenv, find_dotenv
from tenacity import retry, wait_exponential

In [2]:
# find .env automagically by walking up directories until it's found
dotenv_path = find_dotenv()

# load up the entries as environment variables
load_dotenv(dotenv_path)

CLIENT_ID = os.environ.get("UDEMY_CLIENT_ID")
CLIENT_SECRET = os.environ.get("UDEMY_CLIENT_SECRET")

# Helper functions

In [3]:
@retry(wait=wait_exponential(multiplier=1, min=4, max=10))
def request_data(url, current_session) -> Dict: 
    try:
        response = current_session.get(url, auth=HTTPBasicAuth(CLIENT_ID, CLIENT_SECRET), timeout=30)

        response.raise_for_status()
        return response.json()
    except requests.exceptions.HTTPError as errh:
        print(errh)
        raise
    except requests.exceptions.ConnectionError as errc:
        print(errc)
        raise
    except requests.exceptions.Timeout as errt:
        print(errt)
        raise
    except requests.exceptions.RequestException as err:
        print(err)
        raise

In [4]:
def write_data_to_json_on_disk(courses: List, file_location: str) -> None:
    with open(file_location, 'w') as output_file:
        json.dump(courses, output_file)

In [5]:
def write_data_to_disk(data: List, file_location: str) -> None:
    if os.path.exists(file_location):
        print(f'File {file_location} exists on disk')
        with open(file_location, 'r') as f:
            print('Read file from disk')
            existing_data = json.load(f)
            existing_data.extend(data)
        
        with open(file_location, 'w') as output_file:
            print('Writing new data on disk')
            json.dump(existing_data, output_file)  
        
    else:
        print(f"File {file_location} doesn't exist. Writing the data on disk...")
        with open(file_location, 'w') as output_file:
            json.dump(data, output_file)

# Extract which data I need from the courses

In [6]:
def get_courses_list():
    """Calls the courses endpoint, gets all the pages with the courses and writes it on disk"""
    session = requests.Session()

    courses_list_endpoint = 'https://www.udemy.com/api-2.0/courses/?page_size=6'

    # get the list of courses available on Udemy
    response = request_data(courses_list_endpoint, session)
    print(response)
    
    courses_list = response['results']
    i = 0
    while response['next'] is not None:
        print(response['next'])
        time.sleep(10)
        response = request_data(response['next'], session)
        courses_list.extend(response['results'])
        
        print(f'i = {i}')
        if i == 20:
            break
        else:
            i+=1
        
    # write first set of results on disk
    file_name = f'courses_list_{str(datetime.now().date())}.json'
    file_location = os.path.join(os.path.abspath(os.path.join(os.getcwd(), os.pardir)), 'data/raw', file_name)
    
    print(f"Writing the results of the courses list in {file_location}")
    write_data_to_json_on_disk(courses=courses_list, file_location=file_location)
    
    return courses_list

In [7]:
def get_course_details(course_id: str) -> Dict:
    session = requests.Session()
    course_endpoint = f"https://www.udemy.com/api-2.0/courses/{course_id}/?fields[course]=title,headline,description,url,visible_instructors,primary_category,primary_subcategory,status_label"
    print(course_endpoint)
    """Gets a course id, calls the course details endpoint and returns the dictionary with the course details for this specific course"""
    single_courses_details = request_data(course_endpoint, session)
    return single_courses_details

In [8]:
def get_course_curriculum(course_id: str):
    """Gets a course id and returns the curriculum for this course

    Args:
        course_id (str): _description_

    Returns:
        _type_: _description_
    """
    session = requests.Session()
    course_curriculum_endpoint = f'https://www.udemy.com/api-2.0/courses/{course_id}/public-curriculum-items/?page_size=6'
    response = request_data(course_curriculum_endpoint, session)
    print(response)
    
    courses_curriculum = response['results']
    
    while response['next'] is not None:
        print(response['next'])
        time.sleep(2)
        response = request_data(response['next'], session)
        courses_curriculum.extend(response['results'])
    
    return courses_curriculum

# Calling everything

In [9]:
with open('../data/raw/courses_list_2023-06-27.json', 'r') as f:
    all_courses = json.load(f)

In [None]:
# get the list of courses from the udemy API
all_courses = get_courses_list()

In [13]:
course_details_list = []

for course in all_courses:
    course_id = course['id']
    
    print(f'Getting the course details for course: {course_id}')
    course_details = get_course_details(course_id=course_id)
    course_details_list.append(course_details)
    
file_name = f'courses_details_{str(datetime.now().date())}.json'
file_location = os.path.join(os.path.abspath(os.path.join(os.getcwd(), os.pardir)), 'data/raw', file_name)
write_data_to_json_on_disk(courses=course_details_list, file_location=file_location)

Getting the course details for course: 567828
Getting the course details for course: 473160
Getting the course details for course: 793796
Getting the course details for course: 1565838
Getting the course details for course: 2776760
Getting the course details for course: 3633804
Getting the course details for course: 433798
Getting the course details for course: 24823
Getting the course details for course: 53600
Getting the course details for course: 851712
Getting the course details for course: 2196488
Getting the course details for course: 625204
Getting the course details for course: 3142166
Getting the course details for course: 1362070
Getting the course details for course: 950390
Getting the course details for course: 1331946
Getting the course details for course: 762616
Getting the course details for course: 4290300
Getting the course details for course: 756150
Getting the course details for course: 533682
Getting the course details for course: 914296
Getting the course details f

TypeError: write_data_to_json_on_disk() got an unexpected keyword argument 'data'

In [10]:
# get the details and the public curriculum for each course
course_curriculum_list = []

for course in all_courses:
    course_id = course['id']
    time.sleep(5)
    
    print(f'Getting the curriculum for course: {course_id}')
    course_curriculum = get_course_curriculum(course_id=course_id)
    public_curriculum_items = {'course_id': course_id, 'curriculum': course_curriculum}
    course_curriculum_list.append(public_curriculum_items)
    
file_name = f'courses_curriculum_{str(datetime.now().date())}.json'
file_location = os.path.join(os.path.abspath(os.path.join(os.getcwd(), os.pardir)), 'data/raw', file_name)
write_data_to_json_on_disk(courses=course_details_list, file_location=file_location)

Getting the curriculum for course: 567828
{'count': 205, 'next': 'https://www.udemy.com/api-2.0/courses/567828/public-curriculum-items/?page=2&page_size=6', 'previous': None, 'results': [{'_class': 'chapter', 'id': 891116, 'created': '2015-08-10T21:26:19Z', 'sort_order': 308, 'title': 'Course Overview', 'description': 'Get an introduction to the course!', 'is_published': True}, {'_class': 'lecture', 'id': 20205526, 'title': 'Auto-Welcome Message', 'created': '2020-06-01T18:27:30Z', 'description': '<p>Welcome to the Complete Python Bootcamp</p>', 'title_cleaned': 'auto-welcome-message', 'is_published': True, 'transcript': '', 'is_downloadable': True, 'is_free': False, 'asset': {'_class': 'asset', 'id': 25768676, 'asset_type': 'Article', 'title': '', 'created': '2020-06-25T17:15:12Z'}, 'sort_order': 307, 'can_be_previewed': False}, {'_class': 'lecture', 'id': 3421822, 'title': 'Course Introduction', 'created': '2015-08-10T21:26:19Z', 'description': '<p>The Complete Python Bootcamp. </p>'

NameError: name 'course_details_list' is not defined

In [11]:
write_data_to_json_on_disk(courses=course_curriculum_list, file_location=file_location)

In [12]:
len(course_curriculum_list)

132

In [None]:
course_details_list

In [14]:
len(course_details_list)

132

In [15]:
title = []
url = []
id = []
headline = []
description = []
primary_category = []
primary_subcategory = []

for course in course_details_list:
    title.append(course['title'])
    url.append(course['url'])
    id.append(course['id'])
    headline.append(course['headline'])
    description.append(course['description'])
    primary_category.append(course['primary_category']['title_cleaned'])
    primary_subcategory.append(course['primary_subcategory']['title_cleaned'])

    
d = {'title': title, 'url': url, 'id': id, 'headline': headline, 'description': description, 'primary_category': primary_category, 'primary_subcategory': primary_subcategory}
df_courses = pd.DataFrame.from_dict(d, orient='columns')

In [16]:
df_courses.head()

Unnamed: 0,title,url,id,headline,description,primary_category,primary_subcategory
0,The Complete Python Bootcamp From Zero to Hero...,/course/complete-python-bootcamp/,567828,Learn Python like a Professional Start from t...,<p><strong>Become a Python Programmer and lear...,development,programming-languages
1,Web Design for Web Developers: Build Beautiful...,/course/web-design-secrets/,473160,Learn web design in 1 hour with 25+ simple-to-...,<p><strong><em>IMPORTANT NOTE: The material of...,development,web-development
2,Microsoft Excel - Excel from Beginner to Advanced,/course/microsoft-excel-2013-from-beginner-to-...,793796,Excel with this A-Z Microsoft Excel Course. Mi...,<p><strong>Microsoft Excel all in One Package<...,office-productivity,microsoft
3,The Complete 2023 Web Development Bootcamp,/course/the-complete-web-development-bootcamp/,1565838,Become a Full-Stack Web Developer with just ON...,<p>Welcome to the Complete Web Development Boo...,development,web-development
4,100 Days of Code: The Complete Python Pro Boot...,/course/100-days-of-code/,2776760,Master Python by building 100 projects in 100 ...,<p>Welcome to the 100 Days of Code - The Compl...,development,programming-languages


In [17]:
df_courses.shape

(132, 7)

# Remove html tags from description

In [None]:
def remove_html_tags(text):
    soup = BeautifulSoup(text, 'html.parser')
    clean_text = soup.get_text()
    return clean_text

In [None]:
df_courses['description_cleaned'] = df_courses['description'].apply(remove_html_tags)