In [None]:
import json
import os
import time
from datetime import datetime
from typing import Dict, List, Optional

import pandas as pd
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv, find_dotenv
from requests.auth import HTTPBasicAuth

In [None]:
# Locate the nearest .env file (the search walks up the directory tree) and
# export its entries as environment variables.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

# Udemy API credentials; each one is None when its variable is not set.
CLIENT_ID, CLIENT_SECRET = (
    os.getenv(variable) for variable in ("UDEMY_CLIENT_ID", "UDEMY_CLIENT_SECRET")
)

# Call the API and get a list of courses

In [None]:
def request_data(url: str, session, timeout: float = 30.0) -> Optional[Dict]:
    """Fetch ``url`` with HTTP basic auth and return the decoded JSON body.

    Args:
        url: Address of the endpoint to request.
        session: Object whose ``get`` method performs the request
            (a ``requests.Session`` in this notebook).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The parsed JSON payload, or ``None`` when the request fails. The
        error is printed so the caller can decide how to proceed.
    """
    try:
        response = session.get(url,
                               auth=HTTPBasicAuth(CLIENT_ID, CLIENT_SECRET),
                               timeout=timeout)
        # Turn 4xx/5xx answers into exceptions so they are reported below.
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as err:
        # HTTPError, ConnectionError and Timeout are all subclasses of
        # RequestException, so one handler covers every case.
        print(err)
        return None

In [None]:
def write_data_to_json_on_disk(courses: List, file_location: str) -> None:
    """Serialize ``courses`` as JSON into the file at ``file_location``.

    Args:
        courses: JSON-serializable list to store.
        file_location: Path of the output file. Missing parent directories
            are created; an existing file is overwritten.
    """
    parent_dir = os.path.dirname(file_location)
    # dirname is '' for a bare file name, which makedirs would reject.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_location, 'w', encoding='utf-8') as output_file:
        json.dump(courses, output_file)

In [None]:
session = requests.Session()
root_endpoint = 'https://www.udemy.com/api-2.0/courses/'
courses_list = request_data(root_endpoint, session)
# request_data prints the error and yields None when a request fails; stop
# with a clear message instead of a TypeError on the subscript below.
if courses_list is None:
    raise RuntimeError(f'Could not fetch the first page of courses from {root_endpoint}')

all_courses = courses_list['results']
# Summarize instead of dumping the whole payload into the cell output.
print(f'Fetched {len(all_courses)} courses from the first page')

# Follow the "next" links until the API reports that no page is left.
while courses_list['next'] is not None:
    next_page = courses_list['next']
    print(next_page)
    time.sleep(2)  # pause between page requests
    courses_list = request_data(next_page, session)
    if courses_list is None:
        raise RuntimeError(f'Could not fetch courses page {next_page}')
    all_courses.extend(courses_list['results'])

In [None]:
# The raw dump is named after today's date and stored under data/raw in the
# parent of the current working directory.
parent_dir = os.path.abspath(os.pardir)
file_name = f'{datetime.now().date()}.json'
file_location = os.path.join(parent_dir, 'data/raw', file_name)

write_data_to_json_on_disk(all_courses, file_location)

# Extract the data I need from the courses

In [None]:
# Load a saved raw dump from disk.
# NOTE(review): the date in the file name is hard-coded - confirm it is the intended snapshot.
with open('../data/raw/2023-06-20.json', 'r') as raw_file:
    all_courses = json.loads(raw_file.read())

In [None]:
# Keep only the listing fields used later, one record per course. Building
# records directly avoids parallel scratch lists and a global named `id`,
# which would shadow the builtin for the rest of the notebook.
course_columns = ['title', 'url', 'id', 'headline']
df_courses = pd.DataFrame(
    [{column: course[column] for column in course_columns} for course in all_courses],
    columns=course_columns,  # keeps the columns even when there are no courses
)

In [None]:
# (rows, columns) of the course table.
df_courses.shape

In [None]:
# Preview the first rows.
df_courses.head()

In [None]:
# Save the course table as CSV without the index column.
df_courses.to_csv('../data/processed/courses.csv', index=False)

# Get the courses' public curriculum

In [None]:
# Collect the public curriculum items of every course, following pagination.
public_curriculum = []
session = requests.Session()
for course in all_courses:
    course_id = course['id']
    print(course_id)
    course_curriculum = request_data(f'https://www.udemy.com/api-2.0/courses/{course_id}/public-curriculum-items/', session)
    if course_curriculum is None:
        # request_data already printed the error; skip this course instead of
        # aborting the whole crawl with a TypeError on the subscript below.
        print(f'Skipping course {course_id}: curriculum request failed')
        continue

    public_curriculum_items = {'course_id': course_id, 'curriculum': course_curriculum['results']}
    while course_curriculum['next'] is not None:
        next_page = course_curriculum['next']
        print(next_page)
        time.sleep(5)  # pause between page requests
        course_curriculum = request_data(next_page, session)
        if course_curriculum is None:
            # Keep the pages fetched so far and move on to the next course.
            print(f'Course {course_id}: paging stopped after a failed request; curriculum is incomplete')
            break
        public_curriculum_items['curriculum'].extend(course_curriculum['results'])
    public_curriculum.append(public_curriculum_items)

In [None]:
# Build one frame per course with the class and title of each curriculum item.
dataframes = []
for curriculum_item in public_curriculum:
    course_id = curriculum_item['course_id']
    print(course_id)

    items = curriculum_item['curriculum']
    d = {
        'class': [item['_class'] for item in items],
        # Scalar: pandas repeats the course id on every row of this frame.
        'id': course_id,
        'title': [item['title'] for item in items],
    }
    dataframes.append(pd.DataFrame.from_dict(d, orient='columns'))

In [None]:
# Stack the per-course frames into a single curriculum table.
# No ignore_index is passed, so each frame's own row labels are kept.
df_curriculum = pd.concat(dataframes)

In [None]:
# Preview the first rows.
df_curriculum.head()

In [None]:
# (rows, columns) of the combined curriculum table.
df_curriculum.shape

# Combine the courses with the curriculum data

In [None]:
# Sizes and previews of both tables before joining them.
df_curriculum.shape

In [None]:
df_courses.shape

In [None]:
df_curriculum.head()

In [None]:
df_courses.head()

In [None]:
# Left join on the course id: every course row is kept. Both tables have a
# 'title' column, so the result holds title_x (course) and title_y (item).
df = df_courses.merge(df_curriculum, on='id', how='left')

In [None]:
# (rows, columns) of the merged table.
df.shape

In [None]:
# Preview the first rows of the merged table.
df.head()

# Exploring the data

In [None]:
# Number of merged rows per course title (title_x comes from df_courses).
df['title_x'].value_counts()

In [None]:
# All merged rows that belong to one specific course title.
df[df['title_x'] == 'ChatGPT, Midjourney, Firefly, Bard, DALL-E, AI Crash Course']

In [None]:
# NOTE(review): 3081 is a hard-coded row label - presumably one of the rows
# returned by the filter above; confirm it still exists after re-running.
df.loc[3081, 'headline']

In [None]:
# URL of the same hard-coded row.
df.loc[3081, 'url']

# Get the course details

In [None]:
# Request the detail fields of every course, one API call per course.
course_details = []
session = requests.Session()
for course_info in all_courses:
    course_id = course_info['id']
    print(course_id)

    course_endpoint = f"https://www.udemy.com/api-2.0/courses/{course_id}/?fields[course]=title,headline,description,url,visible_instructors,primary_category,primary_subcategory,status_label"
    single_courses_details = request_data(course_endpoint, session)
    if single_courses_details is None:
        # request_data already printed the error; a None entry would break
        # every later cell that subscripts the collected details.
        print(f'Skipping course {course_id}: details request failed')
        continue
    course_details.append(single_courses_details)
    

In [None]:
# Number of course detail payloads collected.
len(course_details)

In [None]:
# Print the cleaned category title, then the cleaned subcategory title, of each course.
for course in course_details:
    for level in ('primary_category', 'primary_subcategory'):
        print(course[level]['title_cleaned'])

In [None]:
# Flatten each detail payload into one record. Building records directly
# avoids parallel scratch lists and a global named `id`, which would shadow
# the builtin for the rest of the notebook.
df_courses = pd.DataFrame(
    [
        {
            'title': course['title'],
            'url': course['url'],
            'id': course['id'],
            'headline': course['headline'],
            'description': course['description'],
            'primary_category': course['primary_category']['title_cleaned'],
            'primary_subcategory': course['primary_subcategory']['title_cleaned'],
        }
        for course in course_details
    ],
    # Explicit columns keep the layout even when course_details is empty.
    columns=['title', 'url', 'id', 'headline', 'description', 'primary_category', 'primary_subcategory'],
)

In [None]:
# Preview the first rows of the detailed course table.
df_courses.head()

In [None]:
# Preview the last rows.
df_courses.tail()

In [None]:
# Save the detailed table as CSV without the index column.
# Note: this is the same path the earlier course table was written to.
df_courses.to_csv('../data/processed/courses.csv', index=False)

# Remove html tags from description

In [None]:
def remove_html_tags(text):
    """Strip HTML markup from ``text`` and return the remaining plain text.

    Args:
        text: HTML markup to clean.

    Returns:
        The text content of the markup. Missing values (``None`` or a float
        ``NaN``) are returned unchanged, because the parser cannot handle
        them and the value should stay recognizably missing.
    """
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return text
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()

In [None]:
# Add a column holding each description passed through remove_html_tags.
df_courses['description_cleaned'] = df_courses['description'].apply(remove_html_tags)

In [None]:
# Preview the table with the new column.
df_courses.head()

In [None]:
# Spot-check the row labelled 2: its URL, cleaned description and headline.
df_courses.loc[2, 'url']

In [None]:
df_courses.loc[2, 'description_cleaned']

In [None]:
df_courses.loc[2, 'headline']