In [1]:
import requests
import os
import json
import time
import json
import pandas as pd

from typing import Dict, List
from requests.auth import HTTPBasicAuth
from datetime import datetime
from bs4 import BeautifulSoup
from dotenv import load_dotenv, find_dotenv

from sklearn.feature_extraction.text import TfidfVectorizer
from elasticsearch import Elasticsearch
import re
import spacy

In [5]:
# find .env automagically by walking up directories until it's found
dotenv_path = find_dotenv()

# load up the entries as environment variables
load_dotenv(dotenv_path)

ELASTIC_PASSWORD = os.environ.get("ELASTIC_USER_PASSWORD")

In [3]:
df_courses = pd.read_csv('../data/interim/courses_sample.csv')

In [4]:
df_courses.head()

Unnamed: 0,title,url,id,headline,description,primary_category,primary_subcategory,description_cleaned
0,The Complete Python Bootcamp From Zero to Hero...,/course/complete-python-bootcamp/,567828,Learn Python like a Professional Start from t...,<p><strong>Become a Python Programmer and lear...,development,programming-languages,Become a Python Programmer and learn one of em...
1,Web Design for Web Developers: Build Beautiful...,/course/web-design-secrets/,473160,Learn web design in 1 hour with 25+ simple-to-...,<p><strong><em>IMPORTANT NOTE: The material of...,development,web-development,IMPORTANT NOTE: The material of this course is...
2,Microsoft Excel - Excel from Beginner to Advanced,/course/microsoft-excel-2013-from-beginner-to-...,793796,Excel with this A-Z Microsoft Excel Course. Mi...,<p><strong>Microsoft Excel all in One Package<...,office-productivity,microsoft,Microsoft Excel all in One PackageThis Microso...
3,The Complete 2023 Web Development Bootcamp,/course/the-complete-web-development-bootcamp/,1565838,Become a Full-Stack Web Developer with just ON...,<p>Welcome to the Complete Web Development Boo...,development,web-development,Welcome to the Complete Web Development Bootca...
4,100 Days of Code: The Complete Python Pro Boot...,/course/100-days-of-code/,2776760,Master Python by building 100 projects in 100 ...,<p>Welcome to the 100 Days of Code - The Compl...,development,programming-languages,Welcome to the 100 Days of Code - The Complete...


# Running Elasticsearch

In [9]:
INDEX = 'udemy_data'
DOMAIN = '0.0.0.0'
es = Elasticsearch(
    hosts="https://localhost:9200",
    ca_certs="../http_ca.crt",
    http_auth=("elastic", ELASTIC_PASSWORD)
)

In [10]:
es

<Elasticsearch([{'host': 'localhost', 'port': 9200, 'use_ssl': True}])>

## Create the index

In [11]:
def check_and_create_index(es, index: str):
    # define data model
    mappings = {
        'mappings': {
            'properties': {
                'title': {'type': 'text'},
                'headline': {'type': 'text'},
                'description': {'type': 'text'},
                'primary_subcategory': {'type': 'text'}
            }
        }
    }
    if not es.indices.exists(index=index):
        es.indices.create(index=index, body=mappings, ignore=400)

In [12]:
check_and_create_index(es, index=INDEX)

  es.indices.create(index=index, body=mappings, ignore=400)


## Populate the index

In [14]:
# add data to the index
for index, course in df_courses.iterrows():
    print(index)
    
    doc = {
        'description': course['description'],
        'headline': course['headline'],
        'title': course['title'],
        'primary_subcategory': course['primary_subcategory']
    }
    
    resp = es.index(index=INDEX, id=index, document=doc)
    print(resp['result'])

0
created
1
created
2
created
3
created
4
created
5
created
6
created
7
created
8
created
9
created
10
created
11
created
12
created
13
created
14
created
15
created
16
created
17
created
18
created
19
created
20
created
21
created
22
created
23
created
24
created
25
created
26
created
27
created
28
created
29
created
30
created
31
created
32
created
33
created
34
created
35
created
36
created
37
created
38
created
39
created
40
created
41
created
42
created
43
created
44
created
45
created
46
created
47
created
48
created
49
created
50
created
51
created
52
created
53
created
54
created
55
created
56
created
57
created
58
created
59
created
60
created
61
created
62
created
63
created
64
created
65
created
66
created
67
created
68
created
69
created
70
created
71
created
72
created
73
created
74
created
75
created
76
created
77
created
78
created
79
created
80
created
81
created
82
created
83
created
84
created
85
created
86
created
87
created
88
created
89
created
90
created
91
create

## Search the index

In [16]:
resp = es.search(index=INDEX, query={"match_all": {}})
print("Got %d Hits:" % resp['hits']['total']['value'])
for hit in resp['hits']['hits']:
    print(hit)

Got 132 Hits:
{'_index': 'udemy_data', '_id': '0', '_score': 1.0, '_source': {'description': "<p><strong>Become a Python Programmer and learn one of employer's most requested skills of 2023!</strong><br></p><p>This is the <strong>most comprehensive, yet straight-forward, course for the Python programming language on Udemy!</strong> Whether you have never programmed before, already know basic syntax, or want to learn about the advanced features of Python, this course is for you! In this course we will&nbsp;<strong>teach you Python 3. </strong></p><p>With <strong>over 100 lectures</strong> and more than 21 hours of video this comprehensive course leaves no stone unturned! This course includes quizzes, tests, coding exercises and homework assignments as well as 3 major projects to create a Python project portfolio!</p><p><strong>Learn how to use Python for real-world tasks, such as working with PDF Files, sending emails, reading Excel files, Scraping websites for informations, working wit

In [17]:
from elasticsearch_dsl import Search

In [31]:
s = Search(using=es, index=INDEX).query("match", title="excel")

In [32]:
response = s.execute()

for hit in response:
    print(hit.meta.score, hit.title)

4.590371 Microsoft Excel - Advanced Excel Formulas & Functions
4.4090395 Microsoft Excel - Excel from Beginner to Advanced
4.2414894 Microsoft Excel - Data Analysis with Excel Pivot Tables
3.888488 Useful Excel for Beginners
3.4128861 Microsoft Excel - Improve your skills quickly


In [26]:
response

<Response: [<Hit(udemy_data/0): {'description': "<p><strong>Become a Python Programmer and l...}>, <Hit(udemy_data/6): {'description': '<p>  \tDo you want to become a programmer? ...}>, <Hit(udemy_data/63): {'description': '<p>Whether you want to:</p><p>- build the s...}>, <Hit(udemy_data/115): {'description': "<p>Welcome to the ultimate Python learning ...}>, <Hit(udemy_data/39): {'description': '<p style="">If you\'re an office worker, st...}>, <Hit(udemy_data/42): {'description': "<p>This course provides an introduction to ...}>, <Hit(udemy_data/28): {'description': "<p>Are you ready to start your path to beco...}>, <Hit(udemy_data/88): {'description': "<p><strong>Just updated for 2023! It's time...}>, <Hit(udemy_data/23): {'description': '<p>Welcome! This is <strong>Deep Learning, ...}>, <Hit(udemy_data/4): {'description': '<p>Welcome to the 100 Days of Code - The Co...}>]>