In [34]:
import pandas as pd
import string
import numpy as np
import nltk
import pickle
import torch
from nltk.stem import WordNetLemmatizer

In [2]:
# Make sure the WordNet corpus is available locally; WordNetLemmatizer (used in
# tokenize) looks words up in it. Re-running is harmless when it is already present.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\misaf\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [78]:
# Load the cleaned course catalog.
# A forward slash is used because "\c" is not a valid escape sequence: the old
# backslash path only resolved on Windows and triggers an invalid-escape warning
# on recent Python versions. Forward slashes work on every platform.
data = pd.read_csv("data/cleaned_course_catalog.csv")

# Drop the first column of the CSV (presumably an index column written when the
# file was saved -- confirm against the file).
del data[data.columns[0]]
data.head()

Unnamed: 0,Code,Department,Title,Units,Description,Prerequisites
0,AIP 97,AIP,Academic Internship,"2, 4",Individual placements for field learning. Must...,"lower-division standing, completion of thirty ..."
1,AIP 197,AIP,Academic Internship Program,"2, 4, 6, 8, 10, 12",Individual internship placements integrated wi...,upper-division standing; department approval.
2,AIP 197DC,AIP,"UCDC: Washington, DC Internship","6, 8, 10",This internship is attached to the University ...,upper-division standing; department approval.
3,AIP 197P,AIP,Public Service Internship,"4, 8, 12",Individual placements for field learning perfo...,ninety units completed; 2.5 minimum cumulative...
4,AIP 197T,AIP,Academic Internship Program—Special Programs,2,Individual placements for field learning assoc...,ninety units minimum completed; 2.5 minimum cu...


In [79]:
# Show every row that is missing a value in at least one column.
data.loc[data.isna().any(axis="columns")]

Unnamed: 0,Code,Department,Title,Units,Description,Prerequisites
796,CHEM 299,CHEM,Research in Chemistry,1–12,,graduate-student standing and consent of instr...
1927,BENG 296,BENG,Independent Study,4,,"consent of instructor. (F,W,S)"
2760,SE 296,SE,Independent Study,4,,consent of instructor.
3799,JAPN 180,JAPN,Special Topics in Japanese Studies,4,,none
4753,Electives,,Varies,12,Students enroll in twelve elective units of co...,none
4774,MATS 296,MATS,Independent Study,4,,consent of instructor.
5187,NEU 298,NEU,Neurosciences Independent Study Project (ISP),1–12,,"approved ISP proposal. (F,W,S)"
5474,PHYS 258,PHYS,Astrophysics and Space Physics Special Topics ...,0–1,,none
5634,POLI 132,POLI,Political Development and Modern China,4,,none


In [80]:
# Discard rows with any missing field, then renumber the rows from 0 so that
# positional lookups (data.iloc[...]) line up with the remaining courses.
data = data.dropna(axis="index", how="any")
data = data.reset_index(drop=True)
data

Unnamed: 0,Code,Department,Title,Units,Description,Prerequisites
0,AIP 97,AIP,Academic Internship,"2, 4",Individual placements for field learning. Must...,"lower-division standing, completion of thirty ..."
1,AIP 197,AIP,Academic Internship Program,"2, 4, 6, 8, 10, 12",Individual internship placements integrated wi...,upper-division standing; department approval.
2,AIP 197DC,AIP,"UCDC: Washington, DC Internship","6, 8, 10",This internship is attached to the University ...,upper-division standing; department approval.
3,AIP 197P,AIP,Public Service Internship,"4, 8, 12",Individual placements for field learning perfo...,ninety units completed; 2.5 minimum cumulative...
4,AIP 197T,AIP,Academic Internship Program—Special Programs,2,Individual placements for field learning assoc...,ninety units minimum completed; 2.5 minimum cu...
...,...,...,...,...,...,...
7156,WCWP 100,WCWP,Academic Writing,4,An upper-division workshop course in argumenta...,junior/senior standing and must be a Warren Co...
7157,WCWP 160,WCWP,Technical Writing for Scientists and Engineers,4,An upper-division workshop-style writing cours...,junior/senior standing.
7158,WARR 189,WARR,Academic Mentoring and the Writing Process,2,Students will gain a fundamental understanding...,permission of instructor is required to enroll.
7159,WCWP 198,WCWP,Group Study,2,A directed group study involving research and ...,none


In [6]:
# """
# Pre-processing of descriptions includes:
# - lowercase
# - remove punctuation
# - tokenization
# - lemmatization
# - (optional) remove stopwords
# """

In [7]:
def tokenize(text):
    """Split ``text`` into whitespace-separated tokens and lemmatize each one.

    Parameters
    ----------
    text : str
        Text to tokenize.

    Returns
    -------
    list of str
        One lemmatized token per whitespace-separated word; an empty list for
        empty or whitespace-only input.
    """
    # split() with no argument collapses runs of whitespace. split(" ") returned
    # an empty-string token for every extra space, e.g. the double space left
    # behind once a standalone punctuation mark has been stripped.
    tokens = text.split()

    # lemmatization
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]

In [24]:
# processing description
course_desc = data['Description'].copy()

# lowercase
course_desc = course_desc.str.lower()

# remove punctuation
# Every punctuation character is backslash-escaped before being placed in the
# character class. In the unescaped form '[' + string.punctuation + ']' the
# sequence "\]" is read as an escaped bracket, so the backslash itself was never
# part of the class and survived the clean-up.
punc_to_remove = '[' + ''.join('\\' + ch for ch in string.punctuation) + ']'
course_desc = course_desc.str.replace(punc_to_remove, '', regex=True)

# description_tokens = course_desc.apply(tokenize)

In [96]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/msmarco-distilbert-base-v4')

# One-off step that (presumably) produced the cache file loaded below; kept for reference.
# description_embeddings = model.encode(course_desc)
# pickle.dump(description_embeddings, open('description_tokens.pkl', 'wb'))

# NOTE(security): pickle.load can execute arbitrary code -- only load a cache
# file that you generated yourself.
with open("description_tokens.pkl", "rb") as f:
    # Bound to `description_embeddings` because that is the name the tensor
    # conversion cell reads; loading into `description_tokens` only left that
    # cell failing with a NameError on a fresh kernel.
    description_embeddings = pickle.load(f)

# Keep the earlier name bound as well so any code still using it keeps working.
description_tokens = description_embeddings

In [97]:
# Wrap every embedding in a torch tensor so the similarity math can run in torch.
description_tensors = [torch.tensor(embedding) for embedding in description_embeddings]

# Key each tensor by its position in the embedding list; that position is the
# identifier used to look the course up again later.
desc_tensor_dict = dict(enumerate(description_tensors))

In [101]:
def cos_sim(q_tensor, tensor_dict, threshold=0.2):
    """Score every tensor in ``tensor_dict`` by cosine similarity to the query.

    Parameters
    ----------
    q_tensor : torch.Tensor
        Query embedding. Size-1 dimensions are squeezed away before the dot
        product, so a ``(1, d)`` tensor is accepted as well as ``(d,)``.
    tensor_dict : dict
        Maps an identifier to a 1-D ``torch.Tensor`` of the same length as the
        query.
    threshold : float, optional
        Minimum similarity an entry needs to be kept. Defaults to 0.2.

    Returns
    -------
    dict
        ``{identifier: similarity}`` with the similarity as a Python float,
        containing only entries whose similarity is ``>= threshold``, in the
        iteration order of ``tensor_dict``.
    """
    # Both values depend only on the query, so compute them once rather than
    # once per document.
    query = q_tensor.squeeze()
    query_norm = q_tensor.norm()

    scores = {}
    for doc_id, tensor in tensor_dict.items():
        similarity = torch.dot(query, tensor) / (query_norm * tensor.norm())
        # A zero-length vector yields NaN here; NaN fails the comparison and is
        # therefore left out of the result.
        if similarity >= threshold:
            scores[doc_id] = similarity.item()
    return scores

In [87]:
def parse_query(q):
    """Encode a query with the notebook-level sentence-transformer ``model``.

    Parameters
    ----------
    q : str
        Free-text search query.

    Returns
    -------
    torch.Tensor
        The query embedding, placed on the CPU.
    """
    encoded = model.encode(q, convert_to_tensor=True)
    # encode() leaves the tensor on the device the model runs on, whereas the
    # description tensors are created with torch.tensor(...) on the CPU.
    # torch.dot rejects operands on different devices, so move the query to the
    # CPU (a no-op when it is already there).
    return encoded.cpu()

In [94]:
def search(q):
    """Rank catalog courses by how closely their description matches ``q``.

    The query is embedded with ``parse_query`` and compared against every entry
    of ``desc_tensor_dict`` with ``cos_sim``; only courses that pass
    ``cos_sim``'s cutoff are returned. The number of results is printed as a
    side effect.

    Parameters
    ----------
    q : str
        Free-text search query.

    Returns
    -------
    list of dict
        One dict per matching course, best match first, with the keys
        ``'query'``, ``'course code'`` and ``'course title'``.
    """
    parsed = parse_query(q)

    # description score, best match first
    docu_scores = cos_sim(parsed, desc_tensor_dict)
    ranked = sorted(docu_scores.items(), key=lambda item: item[1], reverse=True)

    out = []
    # Keys of docu_scores are already unique, so no duplicate filtering is needed.
    for doc_id, _score in ranked:
        # doc_id is a position in the embedding list; this assumes `data` rows
        # are in the same order as the embeddings (verify if `data` is
        # re-filtered after the embeddings were computed).
        course = data.iloc[doc_id]
        out.append({'query': q, 'course code': course['Code'], 'course title': course['Title']})

    print(f"{len(out)} results")
    return out

In [104]:
# Example query: list the courses whose descriptions score above the similarity
# cutoff for this phrase, best match first.
search('lost languages')

102 results


[{'query': 'lost languages',
  'course code': 'LIGN 5',
  'course title': 'The Linguistics of Invented Languages'},
 {'query': 'lost languages',
  'course code': 'COGS 156',
  'course title': 'Language Development'},
 {'query': 'lost languages',
  'course code': 'LIGN 279',
  'course title': 'Topics in Language Acquisition'},
 {'query': 'lost languages',
  'course code': 'LIGN 155',
  'course title': 'Evolution of Language'},
 {'query': 'lost languages',
  'course code': 'PSYC 244',
  'course title': 'Special Topics in Psycholinguistics'},
 {'query': 'lost languages',
  'course code': 'LIGN 255',
  'course title': 'Topics in Language Evolution'},
 {'query': 'lost languages',
  'course code': 'COMM 112G',
  'course title': 'IM: Language and Globalization'},
 {'query': 'lost languages',
  'course code': 'ANSC 116',
  'course title': 'Languages of the Americas: Mayan'},
 {'query': 'lost languages',
  'course code': 'ANSC 194',
  'course title': 'Language, Migration, Borders'},
 {'query': 