In [2]:
import requests
import os
import json
import time
import json
import pandas as pd

from typing import Dict, List
from requests.auth import HTTPBasicAuth
from datetime import datetime
from bs4 import BeautifulSoup
from dotenv import load_dotenv, find_dotenv
# from src.utils import write_data_to_json_on_disk

In [3]:
# find .env automagically by walking up directories until it's found
dotenv_path = find_dotenv()

# load up the entries as environment variables
load_dotenv(dotenv_path)

CLIENT_ID = os.environ.get("UDEMY_CLIENT_ID")
CLIENT_SECRET = os.environ.get("UDEMY_CLIENT_SECRET")

# Helper functions

In [12]:
def request_data(url, current_session) -> Dict:
    try:
        response = current_session.get(url, auth=HTTPBasicAuth(CLIENT_ID, CLIENT_SECRET))

        response.raise_for_status()
        return response.json()
    except requests.exceptions.HTTPError as errh:
        print(errh)
    except requests.exceptions.ConnectionError as errc:
        print(errc)
    except requests.exceptions.Timeout as errt:
        print(errt)
    except requests.exceptions.RequestException as err:
        print(err)

In [5]:
def write_data_to_json_on_disk(courses: List, file_location: str) -> None:
    with open(file_location, 'w') as output_file:
        json.dump(courses, output_file)

# Extract which data I need from the courses

In [33]:
def get_courses_list():
    """Calls the courses endpoint, gets all the pages with the courses and writes it on disk"""
    session = requests.Session()

    courses_list_endpoint = 'https://www.udemy.com/api-2.0/courses/'

    # get the list of courses available on Udemy
    response = request_data(courses_list_endpoint, session)
    print(response)
    
    courses_list = response['results']
    i = 0
    while response['next'] is not None:
        print(response['next'])
        time.sleep(2)
        response = request_data(response['next'], session)
        courses_list.extend(response['results'])
        
        print(f'i = {i}')
        if i == 10:
            break
        else:
            i+=1
        
    # write first set of results on disk
    file_name = f'courses_list_{str(datetime.now().date())}.json'
    file_location = os.path.join(os.path.abspath(os.path.join(os.getcwd(), os.pardir)), 'data/raw', file_name)
    
    print(f"Writing the results of the courses list in {file_location}")
    write_data_to_json_on_disk(courses=courses_list, file_location=file_location)
    
    return courses_list

In [24]:
def get_course_details(course_id: str) -> Dict:
    session = requests.Session()
    course_endpoint = f"https://www.udemy.com/api-2.0/courses/{course_id}/?fields[course]=title,headline,description,url,visible_instructors,primary_category,primary_subcategory,status_label"

    """Gets a course id, calls the course details endpoint and returns the dictionary with the course details for this specific course"""
    single_courses_details = request_data(course_endpoint, session)
    return single_courses_details

In [31]:
def get_course_curriculum(course_id: str):
    """Gets a course id and returns the curriculum for this course

    Args:
        course_id (str): _description_

    Returns:
        _type_: _description_
    """
    session = requests.Session()
    course_curriculum_endpoint = f'https://www.udemy.com/api-2.0/courses/{course_id}/public-curriculum-items/'
    response = request_data(course_curriculum_endpoint, session)
    print(response)
    
    courses_curriculum = response['results']
    
    while response['next'] is not None:
        print(response['next'])
        time.sleep(2)
        response = request_data(response['next'], session)
        courses_curriculum.extend(response['results'])
    
    return courses_curriculum

# Calling everything

In [34]:
all_courses = get_courses_list()

# let's get the details and the public curriculum for each course
course_details_list = []
course_curriculum_list = []

for course in all_courses:
    course_id = course['id']
    
    print(f'Getting the course details for course: {course_id}')
    course_details = get_course_details(course_id=course_id)
    course_details_list.append(course_details)
    
    print(f'Getting the curriculum for course: {course_id}')
    course_curriculum = get_course_curriculum(course_id=course_id)
    public_curriculum_items = {'course_id': course_id, 'curriculum': course_curriculum}
    course_curriculum_list.append(public_curriculum_items)

    

{'count': 10000, 'next': 'https://www.udemy.com/api-2.0/courses/?page=2&page_size=12', 'previous': None, 'results': [{'_class': 'course', 'id': 567828, 'title': 'The Complete Python Bootcamp From Zero to Hero in Python', 'url': '/course/complete-python-bootcamp/', 'is_paid': True, 'price': '£54.99', 'price_detail': {'amount': 54.99, 'currency': 'GBP', 'price_string': '£54.99', 'currency_symbol': '£'}, 'price_serve_tracking_id': '7nL36tdaRuWNN_2wSFxPBg', 'visible_instructors': [{'_class': 'user', 'title': 'Jose Portilla', 'name': 'Jose', 'display_name': 'Jose Portilla', 'job_title': 'Head of Data Science at Pierian Training', 'image_50x50': 'https://img-c.udemycdn.com/user/50x50/9685726_67e7_4.jpg', 'image_100x100': 'https://img-c.udemycdn.com/user/100x100/9685726_67e7_4.jpg', 'initials': 'JP', 'url': '/user/joseportilla/'}], 'image_125_H': 'https://img-c.udemycdn.com/course/125_H/567828_67d0.jpg', 'image_240x135': 'https://img-c.udemycdn.com/course/240x135/567828_67d0.jpg', 'is_practic

KeyboardInterrupt: 

In [37]:
course_details_list

[{'_class': 'course',
  'id': 567828,
  'title': 'The Complete Python Bootcamp From Zero to Hero in Python',
  'url': '/course/complete-python-bootcamp/',
  'visible_instructors': [{'_class': 'user',
    'title': 'Jose Portilla',
    'name': 'Jose',
    'display_name': 'Jose Portilla',
    'job_title': 'Head of Data Science at Pierian Training',
    'image_50x50': 'https://img-c.udemycdn.com/user/50x50/9685726_67e7_4.jpg',
    'image_100x100': 'https://img-c.udemycdn.com/user/100x100/9685726_67e7_4.jpg',
    'initials': 'JP',
    'url': '/user/joseportilla/'}],
  'description': "<p><strong>Become a Python Programmer and learn one of employer's most requested skills of 2023!</strong><br></p><p>This is the <strong>most comprehensive, yet straight-forward, course for the Python programming language on Udemy!</strong> Whether you have never programmed before, already know basic syntax, or want to learn about the advanced features of Python, this course is for you! In this course we will&nb

In [40]:
course_curriculum_list

[{'course_id': 567828,
  'curriculum': [{'_class': 'chapter',
    'id': 891116,
    'created': '2015-08-10T21:26:19Z',
    'sort_order': 308,
    'title': 'Course Overview',
    'description': 'Get an introduction to the course!',
    'is_published': True},
   {'_class': 'lecture',
    'id': 20205526,
    'title': 'Auto-Welcome Message',
    'created': '2020-06-01T18:27:30Z',
    'description': '<p>Welcome to the Complete Python Bootcamp</p>',
    'title_cleaned': 'auto-welcome-message',
    'is_published': True,
    'transcript': '',
    'is_downloadable': True,
    'is_free': False,
    'asset': {'_class': 'asset',
     'id': 25768676,
     'asset_type': 'Article',
     'title': '',
     'created': '2020-06-25T17:15:12Z'},
    'sort_order': 307,
    'can_be_previewed': False},
   {'_class': 'lecture',
    'id': 3421822,
    'title': 'Course Introduction',
    'created': '2015-08-10T21:26:19Z',
    'description': '<p>The Complete Python Bootcamp. </p>',
    'title_cleaned': 'course-in

In [None]:
title = []
url = []
id = []
headline = []
description = []
primary_category = []
primary_subcategory = []

for course in course_details:
    title.append(course['title'])
    url.append(course['url'])
    id.append(course['id'])
    headline.append(course['headline'])
    description.append(course['description'])
    primary_category.append(course['primary_category']['title_cleaned'])
    primary_subcategory.append(course['primary_subcategory']['title_cleaned'])

    
d = {'title': title, 'url': url, 'id': id, 'headline': headline, 'description': description, 'primary_category': primary_category, 'primary_subcategory': primary_subcategory}
df_courses = pd.DataFrame.from_dict(d, orient='columns')

# Earlier implementations

In [15]:
with open('../data/raw/2023-06-20.json', 'r') as f:
    all_courses = json.load(f)

In [None]:
title = []
url = []
id = []
headline = []
for course in all_courses:
    title.append(course['title'])
    url.append(course['url'])
    id.append(course['id'])
    headline.append(course['headline'])
    
d = {'title': title, 'url': url, 'id': id, 'headline': headline}
df_courses = pd.DataFrame.from_dict(d, orient='columns')

# Get the courses public curriculum

In [None]:
public_curriculum = []
session = requests.Session()
for course in all_courses:
    course_id = course['id']
    print(course_id)
    course_curriculum = request_data(f'https://www.udemy.com/api-2.0/courses/{course_id}/public-curriculum-items/', session)
    print(course_curriculum)

    public_curriculum_items = {'course_id': course_id, 'curriculum': course_curriculum['results']}
    while course_curriculum['next'] is not None:
        next_page = course_curriculum['next']
        print(next_page)
        time.sleep(5)
        course_curriculum = request_data(next_page, session)
        print(course_curriculum)
        public_curriculum_items['curriculum'].extend(course_curriculum['results'])
    public_curriculum.append(public_curriculum_items)

In [None]:
dataframes = []
for curriculum_item in public_curriculum:
    print(curriculum_item['course_id'])
    print(curriculum_item['curriculum'])
    
    class_item = []
    id = []
    title = []
    description = []

    for curriculum_item_inner in curriculum_item['curriculum']:
        class_item.append(curriculum_item_inner['_class'])
        id.append(curriculum_item_inner['id'])
        title.append(curriculum_item_inner['title'])
        # description.append(curriculum_item_inner['description'])
        
    d = {'class': class_item, 'id': curriculum_item['course_id'], 'title': title}
    df_curriculum = pd.DataFrame.from_dict(d, orient='columns')
    dataframes.append(df_curriculum)

In [None]:
df_curriculum = pd.concat(dataframes)

In [None]:
df_curriculum.head()

In [None]:
df_curriculum.shape

# Combine the courses with the curriculum data

In [None]:
df = df_courses.merge(df_curriculum, on='id', how='left')

# Get the course details

In [16]:
course_details = []
session = requests.Session()
for course_info in all_courses:
    print(course_info)
    course_id = course_info['id']
    
    course_endpoint = f"https://www.udemy.com/api-2.0/courses/{course_id}/?fields[course]=title,headline,description,url,visible_instructors,primary_category,primary_subcategory,status_label"
    single_courses_details = request_data(course_endpoint, session)
    course_details.append(single_courses_details)
    break
    

{'_class': 'course', 'id': 473160, 'title': 'Web Design for Web Developers: Build Beautiful Websites!', 'url': '/course/web-design-secrets/', 'is_paid': False, 'price': 'Free', 'price_detail': None, 'price_serve_tracking_id': 'YNFQd2mGQzOCDYX443_5Fg', 'visible_instructors': [{'_class': 'user', 'title': 'Jonas Schmedtmann', 'name': 'Jonas', 'display_name': 'Jonas Schmedtmann', 'job_title': 'Web Developer, Designer, and Teacher', 'image_50x50': 'https://img-c.udemycdn.com/user/50x50/7799204_2091_5.jpg', 'image_100x100': 'https://img-c.udemycdn.com/user/100x100/7799204_2091_5.jpg', 'initials': 'JS', 'url': '/user/jonasschmedtmann/'}], 'image_125_H': 'https://img-c.udemycdn.com/course/125_H/473160_d929_3.jpg', 'image_240x135': 'https://img-c.udemycdn.com/course/240x135/473160_d929_3.jpg', 'is_practice_test_course': False, 'image_480x270': 'https://img-c.udemycdn.com/course/480x270/473160_d929_3.jpg', 'published_title': 'web-design-secrets', 'tracking_id': '', 'locale': {'_class': 'locale',

In [None]:
len(course_details)

In [None]:
for course in course_details:
    print(course['primary_category']['title_cleaned'])
    print(course['primary_subcategory']['title_cleaned'])

In [None]:
title = []
url = []
id = []
headline = []
description = []
primary_category = []
primary_subcategory = []

for course in course_details:
    title.append(course['title'])
    url.append(course['url'])
    id.append(course['id'])
    headline.append(course['headline'])
    description.append(course['description'])
    primary_category.append(course['primary_category']['title_cleaned'])
    primary_subcategory.append(course['primary_subcategory']['title_cleaned'])

    
d = {'title': title, 'url': url, 'id': id, 'headline': headline, 'description': description, 'primary_category': primary_category, 'primary_subcategory': primary_subcategory}
df_courses = pd.DataFrame.from_dict(d, orient='columns')

In [None]:
df_courses.head()

In [None]:
df_courses.tail()

In [None]:
df_courses.to_csv('../data/processed/courses.csv', index=False)

# Remove html tags from description

In [None]:
df_courses = pd.read_csv('../data/processed/courses.csv')

In [None]:
def remove_html_tags(text):
    soup = BeautifulSoup(text, 'html.parser')
    clean_text = soup.get_text()
    return clean_text

In [None]:
df_courses['description_cleaned'] = df_courses['description'].apply(remove_html_tags)

In [None]:
df_courses.head()

In [None]:
df_courses.loc[2, 'url']

In [None]:
df_courses.loc[2, 'description_cleaned']

In [None]:
df_courses.loc[2, 'headline']

In [None]:
df_courses.loc[2, 'id']

In [None]:
public_curriculum = []
session = requests.Session()
course_id = 793796
course_curriculum = request_data(f'https://www.udemy.com/api-2.0/courses/{course_id}/public-curriculum-items/', session)
print(course_curriculum)

public_curriculum_items = {'course_id': course_id, 'curriculum': course_curriculum['results']}
while course_curriculum['next'] is not None:
    next_page = course_curriculum['next']
    print(next_page)
    time.sleep(5)
    course_curriculum = request_data(next_page, session)
    print(course_curriculum)
    public_curriculum_items['curriculum'].extend(course_curriculum['results'])
public_curriculum.append(public_curriculum_items)

In [None]:
public_curriculum

In [None]:
dataframes = []
for curriculum_item in public_curriculum:
    print(curriculum_item['course_id'])
    print(curriculum_item['curriculum'])
    
    class_item = []
    id = []
    title = []
    description = []

    for curriculum_item_inner in curriculum_item['curriculum']:
        class_item.append(curriculum_item_inner['_class'])
        id.append(curriculum_item_inner['id'])
        title.append(curriculum_item_inner['title'])
        description.append(curriculum_item_inner['description'])
        
    d = {'class': class_item, 'id': curriculum_item['course_id'], 'title': title}
    df_curriculum = pd.DataFrame.from_dict(d, orient='columns')
    dataframes.append(df_curriculum)

In [None]:
test = pd.concat(dataframes)

In [None]:
test

In [None]:
test['title'].head(50)

In [None]:
test

In [None]:
curriculum = []
for index, title in test.iterrows():
    curriculum.append(title['title'])

In [None]:
print(".".join(curriculum))