In [1]:
import pandas as pd
import numpy as np
import sklearn
import faiss

import spacy

# nlp = spacy.load('en_core_web_md')

from sentence_transformers import SentenceTransformer
import scipy.spatial

import pickle

In [2]:
def load_df(course_type):
    """Load the text corpus for one catalogue and name its cached pickle files.

    Parameters
    ----------
    course_type : str
        One of ``'Fall 2021'``, ``'Winter 2022'`` or ``'online'``.

    Returns
    -------
    tuple
        ``(corpus_embeddings_file, embedder_file, corpus)`` where the first
        two items are ``.pkl`` file names (relative paths) and ``corpus`` is
        the list of values read from the catalogue's text column.

    Raises
    ------
    ValueError
        If ``course_type`` is not one of the supported values. Previously an
        unknown value fell through every branch and failed at the ``return``
        with an unrelated ``UnboundLocalError``.
    """
    # course_type -> (csv path, column holding the text, suffix of the pickle names)
    sources = {
        'Fall 2021': ('assets/free_text_f_21.csv', 'text', 'f_21'),
        'Winter 2022': ('assets/free_text_w_22.csv', 'text', 'w_22'),
        'online': ('assets/original/2021-10-19-MichiganOnline-courses.csv',
                   'description', 'online'),
    }

    if course_type not in sources:
        raise ValueError(
            "Unknown course_type %r; expected one of %s"
            % (course_type, ", ".join(repr(name) for name in sources))
        )

    csv_path, text_column, suffix = sources[course_type]
    corpus = pd.read_csv(csv_path)[text_column].tolist()

    corpus_embeddings_file = 'corpus_embeddings_%s.pkl' % suffix
    embedder_file = 'embedder_%s.pkl' % suffix
    return corpus_embeddings_file, embedder_file, corpus

In [3]:
def load_sentence_transformer(course_type):
    """Read the cached embeddings and the cached embedder for a catalogue.

    ``course_type`` is forwarded unchanged to ``load_df``, which supplies the
    two pickle paths and the corpus read from CSV.

    Returns a 5-tuple:
    ``(stored_data, stored_corpus, stored_embeddings, stored_embedder, corpus)``
    where ``stored_data`` is the unpickled object from the embeddings file,
    ``stored_corpus`` / ``stored_embeddings`` are its ``'corpus'`` and
    ``'embeddings'`` entries, ``stored_embedder`` is the unpickled object from
    the embedder file, and ``corpus`` is the list returned by ``load_df``.
    """
    embeddings_path, embedder_path, corpus = load_df(course_type)

    # NOTE(security): pickle.load executes arbitrary code from the file, so
    # these two files must only ever come from a trusted source.
    with open(embeddings_path, "rb") as handle:
        stored_data = pickle.load(handle)
    stored_corpus, stored_embeddings = stored_data['corpus'], stored_data['embeddings']

    with open(embedder_path, "rb") as handle:
        stored_embedder = pickle.load(handle)

    return stored_data, stored_corpus, stored_embeddings, stored_embedder, corpus

In [4]:
def recommender(course_type, queries, k):
    """Print the ``k`` corpus entries closest to each free-text query.

    Parameters
    ----------
    course_type : str
        Catalogue selector, forwarded to ``load_sentence_transformer``.
    queries : list of str or str
        Free-text interest statements. A single string is treated as a
        one-element list.
    k : int
        Number of nearest neighbours to show per query.

    Returns
    -------
    None
        Results are printed; nothing is returned.

    Notes
    -----
    Neighbours are ranked with an exact L2 index (``faiss.IndexFlatL2``) built
    from the stored embeddings on every call. The printed text comes from the
    corpus list returned by the loader, indexed by the positions FAISS returns,
    so that list is assumed to be in the same order as the stored embeddings
    (TODO confirm against how the pickles were produced).
    """
    # A bare string would otherwise be iterated character by character below.
    if isinstance(queries, str):
        queries = [queries]

    _, _, stored_embeddings, stored_embedder, corpus = load_sentence_transformer(course_type)

    # FAISS works on contiguous float32 matrices; take the vector width from
    # the data instead of assuming a 768-dimensional model.
    corpus_matrix = np.ascontiguousarray(np.stack(stored_embeddings, axis=0), dtype=np.float32)
    index = faiss.IndexFlatL2(corpus_matrix.shape[1])
    index.add(corpus_matrix)

    query_embeddings = np.ascontiguousarray(stored_embedder.encode(queries), dtype=np.float32)
    # One batched search covers every query; row i of each result belongs to queries[i].
    distances, indices = index.search(query_embeddings, k)

    for query, query_distances, query_indices in zip(queries, distances, indices):
        print("\n======================\n")
        print("Query:", query)
        print("\nTop %d most similar sentences in corpus:" % k)
        for distance, corpus_idx in zip(query_distances, query_indices):
            # FAISS pads with -1 when fewer than k vectors exist; corpus[-1]
            # would silently print the last entry, so skip the padding.
            if corpus_idx < 0:
                continue
            print(corpus[corpus_idx], "(Distance: %.4f)" % distance)

In [5]:
# Free-text interest statements; each one is embedded and matched against the
# Fall 2021 corpus, printing the 5 nearest entries per query.
queries = [
    "I am interested in computer science",
    "I like pop music",
    "I like Asian culture, especially Japanese history",
    "I like to use computer skill to resolve biological problems",
]
recommender('Fall 2021', queries, 5)

FileNotFoundError: [Errno 2] No such file or directory: 'assets/free_text_f_21.csv'

In [None]:
def recommander_(df, queries, k):
    """Return the ``k`` nearest courses, and their distances, for each query.

    Parameters
    ----------
    df : pandas.DataFrame
        Course table containing at least the columns ``'Class Nbr'``,
        ``'course'``, ``'Course Title'`` and ``'description'``. Rows are
        selected by position, so the row order is assumed to match the order
        in which vectors were added to ``index`` (TODO confirm).
    queries : list of str or str
        Free-text interest statements. A single string is treated as a
        one-element list.
    k : int
        Number of neighbours requested per query.

    Returns
    -------
    tuple
        ``(res, l_dis)``: ``res`` is a list with one DataFrame per query (the
        four columns above, nearest first) and ``l_dis`` is a list with one
        list of distances per query, aligned with the rows of the matching
        frame.

    Notes
    -----
    This function reads two module-level names, ``embedder`` (must provide
    ``encode``) and ``index`` (must provide ``search``). They have to be
    defined in the kernel before the call.
    """
    # A bare string would otherwise be treated as a sequence of characters.
    if isinstance(queries, str):
        queries = [queries]

    df_1 = df[['Class Nbr', 'course', 'Course Title', 'description']]
    query_embeddings = embedder.encode(queries)
    # A single batched search already yields distances and positions for every
    # query (row i <-> queries[i]); no per-query re-search is needed.
    D, I = index.search(query_embeddings, k)

    res = []
    l_dis = []
    for row_ids, row_distances in zip(I, D):
        row_ids = np.asarray(row_ids)
        # Positions of -1 mean "no neighbour found"; iloc[-1] would wrongly
        # pick the last course, so drop them together with their distances.
        found = row_ids >= 0
        res.append(df_1.iloc[row_ids[found]])
        l_dis.append(list(np.asarray(row_distances)[found]))

    return (res, l_dis)

In [None]:
queries = ["I am interested in computer science",
           "I like pop music"]

# recommander_ returns (result frames, distances); unpack a single call rather
# than encoding and searching twice to read each element separately.
# NOTE: `f_21` must already exist in the kernel; it is not created in the cells above.
dfs, dis = recommander_(f_21, queries, 5)

In [None]:
queries = ["I am interested in computer science",
           "I like pop music"]

# A single call provides both outputs, so the queries are encoded and searched once:
#   dfs - list of DataFrames, one per query
#   dis - list of distance lists, one per query
dfs, dis = recommander_(f_21, queries, 5)

In [None]:
# Display the recommendation table for the second query (position 1 in `dfs`).
dfs[1]

In [None]:
import matplotlib.pyplot as plt
def vis(df, dis, title):
    """Plot the distances of one query's recommendations as a line chart.

    df    -- table whose 'Course Title' column supplies the x tick labels
    dis   -- sequence of distances, one per tick, drawn in the given order
    title -- text quoted in the chart title ("Distance to '<title>'")

    Shows the figure and returns None.
    """
    fig, ax = plt.subplots()

    # One tick per distance value, labelled with the matching course title.
    tick_positions = range(len(dis))
    ax.set_xticks(tick_positions)
    course_titles = list(df['Course Title'])
    ax.set_xticklabels(course_titles, minor=False, rotation=45)

    ax.plot(dis, linewidth=2.0)

    # `ax` is the current axes right after subplots(), i.e. the same axes a
    # pyplot-level title call would target.
    heading = "Distance to '" + title + "'"
    ax.set_title(heading)

    plt.show()

In [None]:
# Results, distances and title must all come from the same query (position 1);
# the title previously used queries[0], mislabelling the chart.
vis(dfs[1], dis[1], queries[1])