In [1]:
import pandas as pd
import numpy as np
import time
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

## Step 0: Load Data

In [9]:
course_info = pd.read_csv('data/course_catalog.csv')
course_info.head()

Unnamed: 0,Code,Department,Title,Units,Description,Prerequisites,Level,URL
0,AIP 97,AIP,Academic Internship,"2, 4",Individual placements for field learning. Must...,"lower-division standing, completion of thirty ...",Lower Division,https://cape.ucsd.edu/responses/Results.aspx?N...
1,AIP 197,AIP,Academic Internship Program,"2, 4, 6, 8, 10, 12",Individual internship placements integrated wi...,upper-division standing; department approval.,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...
2,AIP 197DC,AIP,"UCDC: Washington, DC Internship","6, 8, 10",This internship is attached to the University ...,upper-division standing; department approval.,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...
3,AIP 197P,AIP,Public Service Internship,"4, 8, 12",Individual placements for field learning perfo...,ninety units completed; 2.5 minimum cumulative...,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...
4,AIP 197T,AIP,Academic Internship Program—Special Programs,2,Individual placements for field learning assoc...,ninety units minimum completed; 2.5 minimum cu...,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...


In [10]:
course_info = course_info.astype(str)
course_info = course_info.drop_duplicates()

## Step 1: Set up Elasticsearch

In [11]:
es = Elasticsearch("http://localhost:9200")
es.info()

{'name': '08f217c09854',
 'cluster_name': 'docker-cluster',
 'cluster_uuid': 'z5t6rmgsShCl67KFk2KefQ',
 'version': {'number': '8.7.0',
  'build_flavor': 'default',
  'build_type': 'docker',
  'build_hash': '09520b59b6bc1057340b55750186466ea715e30e',
  'build_date': '2023-03-27T16:31:09.816451435Z',
  'build_snapshot': False,
  'lucene_version': '9.5.0',
  'minimum_wire_compatibility_version': '7.17.0',
  'minimum_index_compatibility_version': '7.0.0'},
 'tagline': 'You Know, for Search'}

In [12]:
# creating mappings 
mappings = {
    "properties": {
        'Code': {"type": "text"}, 
        'Department': {"type": "text"}, 
        'Title': {"type": "text"}, 
        'Units': {"type": "text"}, 
        'Description': {"type": "text"}, 
        'Prerequisites': {"type": "text"}, 
    }
}

In [13]:
try:
    es.indices.create(index="courses", mappings=mappings)
except:
    pass

## Step 2: Adding Data into ElasticSearch

In [14]:
# Creating a list of dictionaries with all the data to be added in to the ElasticSearch index
bulk_data = []
for i, row in course_info.iterrows():
    bulk_data.append(
        {
            "_index": "courses",
            "_id": i,
            "_source": {
                "Code": row['Code'],
                'Department': row['Department'], 
                'Title': row['Title'], 
                'Units': row['Units'], 
                'Description': row['Description'], 
                'Prerequisites': row['Prerequisites']
            }
        }
    )

bulk(es, bulk_data)

(7169, [])

In [15]:
# verifying that all data has been read into the python index properly
es.indices.refresh(index="courses")
es.cat.count(index="courses", format="json")

[{'epoch': '1706418215', 'timestamp': '05:03:35', 'count': '7169'}]

## Step 3: Performing Search

In [16]:
def es_search(query, k=10):
    """
    Searches the data using ElasticSearch to find the k most similar documents to the query.
    Returns a list of the k most similar functions, along with their GitHub URLs and their similarity scores to the query
    """

    es_query = {
        "query": {
            "bool": {
                "must": {
                    "query_string": {
                        "query": query,
                        "fields": [
                            'Code',
                            'Department',
                            'Title^1.5',
                            'Description^2', #boost 2x
                            'Prerequisites'
                        ],
                        "phrase_slop": 2  # still considered a match if they are up to two terms apart
                    }
                },
            }
        },
        "size": k
    }
    
    response = es.search(index="courses", body=es_query)
    
    results = []
    # for each result, add the function name, the GitHub URL of the function, and the similarity score to the results list
    for hit in response['hits']['hits']:
        row = hit['_source']
        results.append((row['Code'], row['Title'], hit['_score']))
        
    return results

In [22]:
# example
es_search('introduction to calculus')

  response = es.search(index="courses", body=es_query)


[('SIOB 276', 'Quantitative Theory of Populations and Communities', 19.165262),
 ('MATH 294', 'The Mathematics of Finance', 17.016047),
 ('ECON 220A', 'Econometrics A', 16.614532),
 ('MAE 208', 'Mathematics for Engineers', 15.338562),
 ('ECON 205', 'Mathematics for Economists', 15.333879),
 ('MATH 20A', 'Calculus for Science and Engineering', 14.663015),
 ('MATH 10B', 'Calculus II', 14.53306),
 ('NEUG 240',
  'Mathematical Foundations for Computational Neuroscience',
  13.986247),
 ('CSE 273', 'Computational Photography', 13.376045),
 ('PHYS 110A', 'Mechanics I', 12.993957)]

## Step 4: Creating a Web Server

In [None]:
from flask import Flask, request, jsonify
app = Flask(__name__)

In [None]:
def es_search_print(query):
    print(query)
    return [1, 2, 3, 5]

In [None]:
@app.route('/search', methods=['POST'])

def search():
    query = request.json.get('query')
    results = es_search(query)
    return jsonify(results)

if __name__ == '__main__':
    app.run(debug=True)

In [2]:
%tb

SystemExit: 1

In [1]:
from flask import Flask
app = Flask(__name__)

@app.route('/')
def hello_world():
    return 'Hello, World!'

if __name__ == '__main__':
    app.run(debug=True)

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
 * Restarting with watchdog (windowsapi)


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
