In [None]:
import requests
import os
import json
import time
import json
import pandas as pd

from typing import Dict, List
from requests.auth import HTTPBasicAuth
from datetime import datetime
from bs4 import BeautifulSoup
from dotenv import load_dotenv, find_dotenv

from sklearn.feature_extraction.text import TfidfVectorizer
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
import re
import spacy

In [None]:
# Locate the nearest .env file by walking up the directory tree.
dotenv_path = find_dotenv()

# Export the entries of that file as environment variables.
load_dotenv(dotenv_path)

# Fail fast with an actionable message: without this check a missing password
# would only surface later, when the Elasticsearch client is built or used.
ELASTIC_PASSWORD = os.environ.get("ELASTIC_USER_PASSWORD")
if not ELASTIC_PASSWORD:
    raise RuntimeError(
        "ELASTIC_USER_PASSWORD is not set; define it in a .env file or in the environment."
    )

In [None]:
# Load the course sample that the rest of the notebook indexes and searches.
courses_csv_path = '../data/interim/courses_sample.csv'
df_courses = pd.read_csv(courses_csv_path)

In [None]:
# Preview the first rows of the loaded course table.
df_courses.head()

# Running Elasticsearch

In [None]:
# Name of the index used by every cell below.
INDEX = 'udemy_data'
# NOTE(review): DOMAIN is not referenced by any cell visible here - confirm it is still needed.
DOMAIN = '0.0.0.0'

# Client for the HTTPS endpoint on localhost, verified against the CA
# certificate file and authenticated as the "elastic" user.
es_credentials = ("elastic", ELASTIC_PASSWORD)
es = Elasticsearch(hosts="https://localhost:9200", ca_certs="../http_ca.crt", http_auth=es_credentials)

In [None]:
# Echo the client object so the notebook renders its repr.
es

## Create the index

In [None]:
def check_and_create_index(es, index: str):
    """Create ``index`` with the course data model unless it already exists.

    Args:
        es: Client object exposing ``indices.exists`` and ``indices.create``.
        index: Name of the index to look for and, when absent, create.

    Returns:
        None.
    """
    # Nothing to do when the index is already present.
    if es.indices.exists(index=index):
        return

    # Data model: every course attribute is mapped as a ``text`` field.
    text_fields = ('title', 'headline', 'description', 'primary_subcategory', 'url')
    index_body = {
        'mappings': {
            'properties': {name: {'type': 'text'} for name in text_fields}
        }
    }

    # ignore=400 is passed through to the client along with the create request.
    es.indices.create(index=index, body=index_body, ignore=400)

In [None]:
# Create the index (no-op when it already exists).
check_and_create_index(es, index=INDEX)

In [None]:
# Reset step: drop the index together with all of its documents, then recreate
# it straight away. Deleting alone would leave the notebook without the
# explicit mapping, and the first es.index() call further down would bring the
# index back with dynamically inferred field mappings instead.
# ignore_unavailable=True keeps the cell re-runnable when the index is absent.
es.indices.delete(index=INDEX, ignore_unavailable=True)
check_and_create_index(es, index=INDEX)

## Populate the index

In [None]:
# Add every course to the index, one document per DataFrame row; the row label
# doubles as the document id.
doc_fields = ('description', 'headline', 'title', 'primary_subcategory', 'url')

for index, course in df_courses.iterrows():
    print(index)

    # Missing CSV cells arrive as NaN, which serialises to the non-JSON token
    # `NaN`; send None (JSON null) for those fields instead.
    doc = {
        field: None if pd.isna(course[field]) else course[field]
        for field in doc_fields
    }

    resp = es.index(index=INDEX, id=index, document=doc)
    print(resp['result'])

## Search the index

In [None]:
# Run a match_all query, report the total the server returns, and dump each
# returned hit. No `size` is passed, so the server-side default applies.
resp = es.search(index=INDEX, query={"match_all": {}})
hits_section = resp['hits']
print("Got %d Hits:" % hits_section['total']['value'])
for hit in hits_section['hits']:
    print(hit)

In [None]:
# Build a DSL search bound to the client and index, then narrow it to courses
# whose title matches "excel".
s = Search(using=es, index=INDEX)
s = s.query("match", title="excel")
response = s.execute()

# Show the URL, title and headline of every returned course.
for hit in response:
    print(hit.url, hit.title, hit.headline)