## This notebook:
- Try deep learning method on content based filtering


----------------------
### 1. Read files into dataframe

### 2. concat_prepare(f_df, w_df)
- Concat f_21, w_21

### 3. store_model(df) - only once for a new dataframe
- Train a SentenceTransformer model
- Save embedder, embeddings, and corpus

### 4. Read the stored embedder, embeddings, and corpus

### 5. dl_content(df, embeddings, corpus, embedder, course_title, k = 10, filter_level = 'subject', semester = 'fall', lsa = None)
- input
    - df: dataset
    - embeddings, corpus, embedder: stored embeddings, stored corpus, stored embedder
    - course_title: input course title
    - k: number of recommendation
    - filter_lever, semester, lsa
- output
    - recommended courses in df

In [1]:
import pandas as pd
import numpy as np
import pickle
import sklearn
import faiss
import spacy
from sentence_transformers import SentenceTransformer
import scipy.spatial

In [2]:
online = pd.read_csv('assets/original/2021-10-19-MichiganOnline-courses.csv')
f_21 = pd.read_csv('assets/f_21_merge.csv')
w_22 = pd.read_csv('assets/w_22_merge.csv')

In [3]:
def concat_prepare(f_df, w_df):
    f_df['semester'] = 'fall'
    w_df['semester'] = 'winter'
    
    # Concat
    df = pd.concat([f_df, w_df])
    
    # Clean
    df = df.fillna('').drop_duplicates(subset=['course']).reset_index().drop(columns='index')

    # Remove description with no information
    df['description'].replace('(Hybrid, Synchronous)', '', inplace = True)
    
    # Merge all the text data
    df['text'] = df['Subject'] + ' ' \
                + df['Course Title'] + ' ' \
                + df['sub_title'] +' '\
                + df['description']
    
    return df

fw = concat_prepare(f_21, w_22)


In [4]:
def store_model(df):
    corpus = df['text'].tolist()
    embedder = SentenceTransformer('bert-base-nli-mean-tokens')
    corpus_embeddings = embedder.encode(corpus)
    with open('corpus_embeddings.pkl', "wb") as fOut:
        pickle.dump({'corpus': corpus, 'embeddings': corpus_embeddings}, fOut, protocol=pickle.HIGHEST_PROTOCOL)
    with open('embedder.pkl', "wb") as fOut:
        pickle.dump(embedder, fOut, protocol=pickle.HIGHEST_PROTOCOL)
#store_model(fw)

## Bert Sentence Transformer 

In [5]:
%%time

#Load sentences & embeddings from disc
with open('corpus_embeddings.pkl', "rb") as fIn:
    stored_data = pickle.load(fIn)
    stored_corpus = stored_data['corpus']
    stored_embeddings = stored_data['embeddings']
    
with open('embedder.pkl', "rb") as fIn:
    stored_embedder = pickle.load(fIn)

CPU times: user 293 ms, sys: 770 ms, total: 1.06 s
Wall time: 3.09 s


In [6]:
len(stored_corpus), len(fw['text'].to_list())

(8334, 8334)

## Deep learning content based filtering

In [14]:
def dl_content(df, embeddings, corpus, embedder, course_title, k = 10, filter_level = 'subject', semester = 'fall', lsa = None):
    # df: dataset
    # embeddings: stored_embeddings
    # corpus: stored_corpus or df['text'].tolist() -- should be the same
    # embedder: stored_embedder
    # course_title = input course title
    # k = number of recommendation
    # filter_level = 'subject', semester = 'fall', lsa = None
    
    
    # If the len of corpus doesn't match the len of input df text, can't process the rec sys properly. 
    if len(corpus) != len(df['text']):
        print('Stored corpus and the text of the input dataset are different.')
        return None
    
    else:
        input_ag = df.loc[df['course'] == course_title, 'Acad Group'].unique()
        input_sub = df.loc[df['course'] == course_title, 'Subject'].unique()
        input_course = df.loc[df['course'] == course_title, 'Course Title'].unique()
        input_subtitle = df.loc[df['course'] == course_title, 'sub_title'].unique()
        input_des = df.loc[df['course'] == course_title, 'description'].unique()
        
        query = [' '.join(input_sub + input_course + input_subtitle + input_des)]
        
        if len(query[0]) == 0:
            print('No text information was provided for the recommender system')
            return None
        

        d = 768
        index = faiss.IndexFlatL2(d)

        index.add(np.stack(embeddings, axis=0))

        query_embedding = embedder.encode(query)
        D, I = index.search(query_embedding, k)     # actual search


        distances, indices = index.search(np.asarray(query_embedding).reshape(1,768),k)

        #print("Query:", query)

        rec_df = df.iloc[indices[0],:]
        
                
        # Filter the df
        
        # Filter df with semester 
        if semester in ['fall', 'winter']:
            df = df[df['semester'] == semester]
        else:
            pass
        
        # Filter df with acad_group
        if filter_level == 'academic_group':
            rec_df = rec_df[rec_df['Acad Group'].isin(input_ag)] 
        elif filter_level == 'subject':
            rec_df = rec_df[(rec_df['Subject'].isin(input_sub)) | (rec_df['Course Title'].isin(input_course))]
        else:
            pass

        req_dis = list(rec_df['requirements_distribution'].unique())

        # Filter the df with lsa
        if lsa in req_dis:
            rec_df = rec_df[rec_df['requirements_distribution'] == lsa]
        else:
            # Give error message or no df
            pass
        
        return rec_df[:k]

In [9]:
%%time
dl_content(fw, stored_embeddings, stored_corpus, stored_embedder, 'EECS 587', k = 10, filter_level = None, semester = 'fall', lsa = 'BS')


Query: ['Electrical Engineering And Computer Science (EECS) Open SectionsParallel ComputingThe development of programs for parallel computers. Basic concepts such as speedup, load balancing, latency, system taxonomies. Design of algorithms for idealized models. Programming on parallel systems such as shared or distributed memory machines, networks. Grid computing. Performance analysis.']
CPU times: user 621 ms, sys: 880 ms, total: 1.5 s
Wall time: 2.72 s


Unnamed: 0,Class Nbr,course,Term,Session,Acad Group,Subject,Course Title,description,Component,Time,...,Units,sub_title,credits,requirements_distribution,consent,advisory_prerequisites,other_course_info,repeatability,semester,text
1511,23751,EECS 587,Fall 2021,Regular Academic Session,Engineering,Electrical Engineering And Computer Science (E...,Parallel Computing,The development of programs for parallel compu...,LAB,12-1PM,...,4.0,,4,BS,,EECS 281; graduate standing.,F.,May not be repeated for credit.,fall,Electrical Engineering And Computer Science (E...
6110,19925,EECS 570,Winter 2022,Regular Academic Session,Engineering,Electrical Engineering And Computer Science (E...,Parallel Compt Arch,Architectures for explicit parallelism. Multit...,LEC,130-230PM,...,4.0,,4,BS,,EECS 470.,F.,May not be repeated for credit.,winter,Electrical Engineering And Computer Science (E...
1473,30539,EECS 478,Fall 2021,Regular Academic Session,Engineering,Electrical Engineering And Computer Science (E...,Logic Ckt Syn&Opt,Advanced design of logic circuits. Technology ...,DIS,1130-1230PM,...,4.0,,4,BS,,,"F, W.",May not be repeated for credit.,fall,Electrical Engineering And Computer Science (E...
1462,27445,EECS 445,Fall 2021,Regular Academic Session,Engineering,Electrical Engineering And Computer Science (E...,Intro Machine Learn,Theory and implementation of state of the art ...,LEC,130-3PM,...,4.0,,4,BS,,STATS 250 or equivalent.,,May not be repeated for credit.,fall,Electrical Engineering And Computer Science (E...
6103,17977,EECS 561,Winter 2022,Regular Academic Session,Engineering,Electrical Engineering And Computer Science (E...,Des Dig Cont Sys,Sampling and data reconstruction. Z-transforms...,LEC,1030-12PM,...,3.0,,3,BS,,EECS 460 or ME 461.,,May not be repeated for credit.,winter,Electrical Engineering And Computer Science (E...
1437,16200,EECS 312,Fall 2021,Regular Academic Session,Engineering,Electrical Engineering And Computer Science (E...,Digit Integrat Circ,Review of MOSFET device operation. Design of d...,LEC,130-3PM,...,4.0,,4,BS,,,(non-LSA).,May not be repeated for credit.,fall,Electrical Engineering And Computer Science (E...
1425,27701,EECS 201,Fall 2021,Regular Academic Session,Engineering,Electrical Engineering And Computer Science (E...,Comp Sci Pragmatics,Essential tools for computer programming: She...,LEC,1030-12PM,...,1.0,,1,BS,,,,May not be repeated for credit.,fall,Electrical Engineering And Computer Science (E...
1467,15844,EECS 461,Fall 2021,Regular Academic Session,Engineering,Electrical Engineering And Computer Science (E...,Embedded Control,Basic interdisciplinary concepts needed to imp...,LEC,12-130PM,...,4.0,,4,BS,,,(non-LSA).,May not be repeated for credit.,fall,Electrical Engineering And Computer Science (E...


In [15]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.ldamulticore import LdaMulticore

courses = fw['course'].unique()

def calc_topic_coherence(df):
    def gen_words(texts):
        final = []
        for text in texts:
            new = gensim.utils.simple_preprocess(text, deacc=True)
            final.append(new)
        return (final)
    
    texts = gen_words(df['description'])
    
    num_topics = 1
    id2word = corpora.Dictionary(texts)
    corpus = [id2word.doc2bow(text) for text in texts]
    
    try:
        model = LdaMulticore(corpus=corpus,id2word = id2word, num_topics = num_topics, alpha=.1, eta=0.1, random_state = 42)
        #print('Model created')
        coherencemodel = CoherenceModel(model = model, texts = texts, dictionary = id2word, coherence = 'c_v')
        #print("Topic coherence: ",coherencemodel.get_coherence())
        coherence_value = coherencemodel.get_coherence()
    except:
        coherence_value = None
    return coherence_value

def coh(func):
    coh_val = []
    i = 0
    while i <100:
        input_course = np.random.choice(courses, 1)[0]
        rec_df = func(fw, stored_embeddings, stored_corpus, stored_embedder, input_course, k = 10, filter_level = None, semester = '', lsa = '')
        rec_df = rec_df.append(fw[fw['course'] == input_course])
        rec_df['description'] = rec_df['description'].fillna('').astype(str)
        val = calc_topic_coherence(rec_df)
        if val != None:
            coh_val.append(val)
            i+=1

    avg_coh_sk = np.average(coh_val)
    return avg_coh_sk

In [16]:
%%time
dl_coh = coh(dl_content)

CPU times: user 2min 39s, sys: 26.1 s, total: 3min 5s
Wall time: 7min 44s


In [17]:
dl_coh

0.42304488996551476