## This notebook:
- Try a deep learning method for content-based filtering


----------------------
# Pipeline

### 1. Read files into dataframe

### 2. concat_prepare(f_df, w_df)
- Concat f_21, w_21

### 3. store_model(df) - only once for a new dataframe
- Train a SentenceTransformer model
- Save embedder, embeddings, and corpus

### 4. Read the stored embedder, embeddings, and corpus

### 5. dl_content(course_title, k = 10, filter_level = 'subject', semester = 'fall', lsa = None)
- input
    - df: dataset (loaded inside the function by load_sentence_transformer(), not passed in)
    - embeddings, corpus, embedder: stored embeddings, stored corpus, stored embedder (also loaded by load_sentence_transformer())
    - course_title: input course title
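    - k: number of recommendations
    - filter_level, semester, lsa: optional filters on department level ('subject' or 'academic_group'), semester, and LSA requirement/distribution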
- output
    - recommended courses in df

In [8]:
import pandas as pd
import numpy as np
import pickle
import sklearn
import faiss
import spacy
from sentence_transformers import SentenceTransformer
import scipy.spatial

## Deep learning content based filtering

In [33]:
def load_sentence_transformer(data_path='assets/fw.csv',
                              embeddings_path='corpus_embeddings.pkl',
                              embedder_path='embedder.pkl'):
    """Load the course dataset and the pickled corpus, embeddings and embedder.

    Parameters
    ----------
    data_path : str
        CSV file holding the course dataset; it must contain a 'text' column.
    embeddings_path : str
        Pickle file holding a dict with the keys 'corpus' and 'embeddings'.
    embedder_path : str
        Pickle file holding the embedder object.

    Returns
    -------
    tuple
        (df, stored_data, stored_corpus, stored_embeddings, stored_embedder,
        corpus, embedder). `corpus` is df['text'] as a list. The last element
        is the same object as `stored_embedder`; it is kept so callers that
        unpack seven values keep working.
    """
    df = pd.read_csv(data_path)
    corpus = df['text'].tolist()

    # NOTE: pickle.load can execute arbitrary code contained in the file.
    # Only load pickles that were produced by this project.
    with open(embeddings_path, "rb") as f_in:
        stored_data = pickle.load(f_in)
    stored_corpus = stored_data['corpus']
    stored_embeddings = stored_data['embeddings']

    with open(embedder_path, "rb") as f_in:
        stored_embedder = pickle.load(f_in)

    # The original returned an undefined name `embedder` in the last slot,
    # which raised NameError; the loaded embedder is what was meant.
    return (df, stored_data, stored_corpus, stored_embeddings,
            stored_embedder, corpus, stored_embedder)

In [83]:
def filter_results(rec_df, k, filter_level, lsa):            
    
    cols_to_filter = ['course', 'Term', 'Acad Group', 'Subject', 'Course Title', 'description', 'credits', 'requirements_distribution']

    if filter_level == 'academic_group':
        rec_df = rec_df[rec_df['Acad Group'].isin(input_ag)]

    elif filter_level == 'subject':
        rec_df = rec_df[(rec_df['Subject'].isin(input_sub)) | (rec_df['Course Title'].isin(input_course))]

    req_dis = list(rec_df['requirements_distribution'].unique())

    # Filter the df with lsa
    if lsa == '':
        pass

    elif lsa in req_dis:
        rec_df = rec_df[rec_df['requirements_distribution'] == lsa]
    else:
        # Give error message or no df
        pass
    
    return rec_df[:k]

In [82]:
def dl_content(course_title, k = 10, filter_level = 'subject', semester = 'fall', lsa = None):
    
    # df: dataset
    # embeddings: stored_embeddings
    # corpus: stored_corpus or df['text'].tolist() -- should be the same
    # embedder: stored_embedder
    # course_title = input course title
    # k = number of recommendation
    # filter_level = 'subject', semester = 'fall', lsa = None
    
    df, stored_data, stored_corpus, stored_embeddings, stored_embedder, corpus, embedder = load_sentence_transformer()
    
    # If the len of corpus doesn't match the len of input df text, can't process the rec sys properly. 
    if len(corpus) != len(df['text']):
        print('Stored corpus and the text of the input dataset are different.')
        return None
    
    else:
        
        # Specify valid courses
        valid_courses = df['course'].unique().tolist()

        if course_title not in valid_courses:
            print(f'Please enter a valid course choice. Course {course_title} is not in our list.')
            
        else:

            #input_ag = df.loc[df['course'] == course_title, 'Acad Group'].unique()
            input_sub = df.loc[df['course'] == course_title, 'Subject'].unique()
            input_course = df.loc[df['course'] == course_title, 'Course Title'].unique()
            input_subtitle = df.loc[df['course'] == course_title, 'sub_title'].unique()
            input_des = df.loc[df['course'] == course_title, 'description'].unique()

            query = [' '.join(input_sub + input_course + input_subtitle + input_des)]

            if len(query[0]) == 0:
                print('')
                return 

            d = 768
            index = faiss.IndexFlatL2(d)

            index.add(np.stack(stored_embeddings, axis=0))

            query_embedding = stored_embedder.encode(query)
            D, I = index.search(query_embedding, k)     # actual search

            distances, indices = index.search(np.asarray(query_embedding).reshape(1,768),k)

            print("Input Course:", course_title)
            print("Course Title:", input_course[0])
            print("Query:", query[0])

            rec_df = df.iloc[indices[0],:]
            
            cols_to_filter = ['course', 'Term', 'Acad Group', 'Subject', 'Course Title', 'description', 'credits', 'requirements_distribution']
            
            if 'semester' == None:
                result = filter_results(rec_df, k, filter_level, lsa)[cols_to_filter]
                
            else:
                rec_df = rec_df[rec_df['semester'] == semester]
                result = filter_results(rec_df, k, filter_level, lsa)[cols_to_filter]

            return result

In [84]:
%%time
# Example run: up to 10 winter-semester courses similar to 'EECS 587',
# with no subject / academic-group filter (filter_level=None).
dl_content('EECS 587', k = 10, filter_level = None, semester = 'winter')

Input Course: EECS 587
Course Title: Parallel Computing
Query: Electrical Engineering And Computer Science (EECS) Open SectionsParallel Computing The development of programs for parallel computers. Basic concepts such as speedup, load balancing, latency, system taxonomies. Design of algorithms for idealized models. Programming on parallel systems such as shared or distributed memory machines, networks. Grid computing. Performance analysis.
CPU times: user 755 ms, sys: 415 ms, total: 1.17 s
Wall time: 893 ms


Unnamed: 0,course,Term,Acad Group,Subject,Course Title,description,credits,requirements_distribution
6110,EECS 570,Winter 2022,Engineering,Electrical Engineering And Computer Science (E...,Parallel Compt Arch,Architectures for explicit parallelism. Multit...,4,BS
6103,EECS 561,Winter 2022,Engineering,Electrical Engineering And Computer Science (E...,Des Dig Cont Sys,Sampling and data reconstruction. Z-transforms...,3,BS
