In [None]:
import requests
import os
import json
import time
import json
import pandas as pd

from typing import Dict, List
from requests.auth import HTTPBasicAuth
from datetime import datetime
from bs4 import BeautifulSoup
from dotenv import load_dotenv, find_dotenv

from sklearn.feature_extraction.text import TfidfVectorizer
import re
import spacy

In [None]:
class UdemyAPI:
    """Small client for the Udemy API using HTTP Basic authentication.

    Credentials are taken from the ``UDEMY_CLIENT_ID`` and
    ``UDEMY_CLIENT_SECRET`` environment variables (after loading the nearest
    ``.env`` file), and one ``requests.Session`` is reused for every call.
    """

    # Seconds to wait on the server before a request is abandoned. Without a
    # timeout, ``requests`` waits forever and a stalled connection would hang
    # the notebook.
    DEFAULT_TIMEOUT = 30

    def __init__(self) -> None:
        # find .env automagically by walking up directories until it's found
        dotenv_path = find_dotenv()

        # load up the entries as environment variables
        load_dotenv(dotenv_path)

        # Either value is None when the corresponding variable is not set.
        self.CLIENT_ID = os.environ.get("UDEMY_CLIENT_ID")
        self.CLIENT_SECRET = os.environ.get("UDEMY_CLIENT_SECRET")
        self.session = requests.Session()

    def request_data(self, url, timeout=DEFAULT_TIMEOUT):
        """GET ``url`` with the client credentials and return the decoded JSON.

        Args:
            url: URL to fetch.
            timeout: Seconds to wait for the server; forwarded unchanged to
                ``Session.get``. Defaults to ``DEFAULT_TIMEOUT``.

        Returns:
            The parsed JSON body, or ``None`` when the request fails. Failures
            are printed rather than raised so that a batch of requests can
            keep going.
        """
        try:
            response = self.session.get(
                url,
                auth=HTTPBasicAuth(self.CLIENT_ID, self.CLIENT_SECRET),
                timeout=timeout,
            )
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as err:
            # HTTPError, ConnectionError and Timeout all derive from
            # RequestException, so one handler covers every case that was
            # previously listed separately.
            print(err)
            return None

    def request_course_data(self):
        """Placeholder: not implemented yet."""
        pass

    def request_course_details(self):
        """Placeholder: not implemented yet."""
        pass

    def request_course_curriculum(self):
        """Placeholder: not implemented yet."""
        pass
    

# Read the course details data

In [None]:
# Load the raw course-details export. The encoding is pinned to UTF-8 so the
# result does not depend on the platform's default text encoding.
with open('../data/raw/courses_details_2023-06-28.json', 'r', encoding='utf-8') as f:
    course_details = json.load(f)

In [None]:
# One list per output column, each built in a single pass over the records.
# NOTE: `id` shadows the builtin of the same name from this cell onwards.
title = [course['title'] for course in course_details]
url = [course['url'] for course in course_details]
id = [course['id'] for course in course_details]
headline = [course['headline'] for course in course_details]
description = [course['description'] for course in course_details]

# The category fields are nested objects; only `title_cleaned` is kept.
primary_category = [
    course['primary_category']['title_cleaned'] for course in course_details
]
primary_subcategory = [
    course['primary_subcategory']['title_cleaned'] for course in course_details
]

# Column name -> values, in the order the columns should appear.
d = {
    'title': title,
    'url': url,
    'id': id,
    'headline': headline,
    'description': description,
    'primary_category': primary_category,
    'primary_subcategory': primary_subcategory,
}
df_courses = pd.DataFrame.from_dict(d, orient='columns')

In [None]:
# Show the assembled course table as this cell's output.
df_courses

In [None]:
# Write the table to the interim data folder, omitting the index column.
# NOTE(review): this runs before `description_cleaned` is added further down,
# so the CSV only contains the original `description` text — confirm intended.
df_courses.to_csv('../data/interim/courses_sample.csv', index=False)

In [None]:
def remove_html_tags(text, separator=''):
    """Strip HTML markup from ``text`` and return the remaining plain text.

    Args:
        text: HTML fragment to clean. Anything that is not ``str`` or
            ``bytes`` (for example ``None`` or a pandas ``NaN`` standing in
            for a missing description) is treated as empty instead of being
            handed to the parser.
        separator: String placed between adjacent text nodes when they are
            joined. The default ``''`` keeps the existing output; pass ``' '``
            to stop words from neighbouring tags running together.

    Returns:
        The text content with non-breaking spaces (``\\xa0``) replaced by
        regular spaces, or ``''`` when there is nothing to parse.
    """
    if not isinstance(text, (str, bytes)):
        return ''
    soup = BeautifulSoup(text, 'html.parser')
    clean_text = soup.get_text(separator=separator)
    return clean_text.replace('\xa0', ' ')
# Store the tag-stripped description in a new column; `description` is left as is.
df_courses['description_cleaned'] = df_courses['description'].apply(remove_html_tags)

In [None]:
def extract_keywords(text, top_n=5):
    """Return the highest-scoring uni-, bi- and trigrams found in ``text``.

    The vectorizer is fitted on this single text only, so every term has the
    same IDF weight and the ranking reflects normalized in-text frequency.
    Terms with equal scores keep the vectorizer's feature order.

    Args:
        text: The text to extract keywords from.
        top_n: Maximum number of keywords to return (default 5). ``None``
            returns every scored term.

    Returns:
        A list of at most ``top_n`` keywords, best first. The list is empty
        when the text yields no vocabulary (empty or only stop words).
    """
    # n-gram range 1..3 covers unigrams, bigrams and trigrams.
    vectorizer = TfidfVectorizer(ngram_range=(1, 3), stop_words='english')

    try:
        tfidf_matrix = vectorizer.fit_transform([text])
    except ValueError:
        # Raised for an empty vocabulary; no usable terms means no keywords.
        return []

    feature_names = vectorizer.get_feature_names_out()

    # Only one document was fitted, so row 0 holds every score. Reading it
    # once as a dense row avoids indexing the sparse matrix per element.
    row_scores = tfidf_matrix.toarray()[0]
    scored_terms = [
        (feature_names[col], score)
        for col, score in enumerate(row_scores)
        if score != 0
    ]

    # Stable sort: ties stay in feature order, as before.
    scored_terms.sort(key=lambda pair: pair[1], reverse=True)

    return [keyword for keyword, _ in scored_terms[:top_n]]

In [None]:
# Try the extractor on one course (row label 3): title and headline joined by a space.
text = " ".join((df_courses.loc[3, 'title'], df_courses.loc[3, 'headline']))
keywords = extract_keywords(text)
print(keywords)