In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel, sigmoid_kernel

import numpy as np
import json
import glob

from collections import Counter, defaultdict

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#spacy
import spacy
import nltk
# nltk.download()
from nltk.corpus import stopwords


In [2]:
# Load both merged CSV files from the local assets/ directory (paths are
# relative to the notebook's working directory). f_21 is the table passed to
# user_input_rec further down.
f_21, w_22 = (
    pd.read_csv(csv_path)
    for csv_path in ('assets/f_21_merge.csv', 'assets/w_22_merge.csv')
)

In [3]:
def _load_spacy_pipeline():
    """Return the spaCy pipeline used for lemmatization, loading it only once.

    Loading the model is slow, so the loaded pipeline is kept on the function
    object and reused by every later call instead of being re-loaded.
    """
    if not hasattr(_load_spacy_pipeline, "_nlp"):
        # Parser and NER are disabled: only POS tags and lemmas are read below.
        _load_spacy_pipeline._nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    return _load_spacy_pipeline._nlp


def prepare_text(data):
    """Normalize raw texts into space-joined tokens with merged phrases.

    Steps, applied to every text in ``data``:

    1. Keep only the lemmas of tokens tagged NOUN or ADJ by spaCy.
    2. Tokenize with ``gensim.utils.simple_preprocess`` (``deacc=True``).
    3. Merge frequent collocations using gensim bigram/trigram ``Phrases``
       models that are fitted on ``data`` itself.

    No stop-word removal is performed.

    Parameters
    ----------
    data : iterable of str
        Raw texts. Each element is passed to the spaCy pipeline as-is, so
        missing values must be replaced by strings before calling.

    Returns
    -------
    list of str
        One processed string per input text, in input order.
    """
    nlp = _load_spacy_pipeline()
    # Immutable on purpose: the original used a mutable list as a default argument.
    allowed_postags = ("NOUN", "ADJ")

    # 1. Lemmatize, keeping nouns and adjectives only.
    lemmatized_texts = [
        " ".join(token.lemma_ for token in nlp(text) if token.pos_ in allowed_postags)
        for text in data
    ]

    # 2. Tokenize each lemmatized text into a list of words.
    data_words = [
        gensim.utils.simple_preprocess(text, deacc=True)
        for text in lemmatized_texts
    ]

    # 3. Fit the phrase models on this corpus; the trigram model is fitted on
    #    the bigram-transformed corpus.
    bigram_phrases = gensim.models.Phrases(data_words, min_count=5, threshold=100)
    trigram_phrases = gensim.models.Phrases(bigram_phrases[data_words], threshold=100)

    bigram = gensim.models.phrases.Phraser(bigram_phrases)
    trigram = gensim.models.phrases.Phraser(trigram_phrases)

    data_bigrams = [bigram[doc] for doc in data_words]
    # The bigram model is applied a second time to the already-merged tokens
    # before the trigram model; kept exactly as before so the output is
    # unchanged. NOTE(review): confirm whether the second pass is intended.
    data_bigrams_trigrams = [trigram[bigram[doc]] for doc in data_bigrams]

    return [" ".join(tokens) for tokens in data_bigrams_trigrams]

In [4]:
# TODO: preprocess the course descriptions once and store the result in the dataframe; currently every call re-processes all descriptions, which is slow.

def user_input_rec(df, user_text, num_of_rec = 10):
    """Recommend the courses whose descriptions best match a free-text query.

    The course descriptions and ``user_text`` are cleaned together with
    ``prepare_text``, converted to bag-of-words count vectors, and every
    course is scored by cosine similarity against the query. Each call
    re-processes all descriptions.

    Parameters
    ----------
    df : pandas.DataFrame
        Course table containing at least the columns ``'course'`` and
        ``'description'``. It is not modified.
    user_text : str
        Free-text query to match against the descriptions.
    num_of_rec : int, default 10
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``num_of_rec`` rows of the cleaned course table (NaN replaced by
        ``''``, one row per ``'course'``), ordered by descending similarity,
        with an extra ``'similarity_scores'`` column. Ties keep table order.
    """
    # One row per course, NaN replaced by '' so every description is a string,
    # and a fresh 0..n-1 index so row labels equal row positions.
    courses = df.fillna('').drop_duplicates(subset=['course']).reset_index(drop=True)

    # Put the query after the descriptions so it is always the last document.
    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    texts = pd.concat([courses['description'], pd.Series([user_text])], ignore_index=True)

    # Process text
    texts = prepare_text(texts)

    # Vectorize our text
    count_vect = CountVectorizer()
    cv_mat = count_vect.fit_transform(texts)

    # Only the query's row of the similarity matrix is needed. Score the query
    # against every document, then drop the final entry (query vs. itself).
    # Excluding it by position, rather than assuming it sorts first, keeps the
    # query's out-of-range index away from iloc when its self-similarity is 0
    # or a course ties with it.
    scores = cosine_similarity(cv_mat[-1:], cv_mat).ravel()[:-1]

    # Descending by score; the stable sort keeps table order among ties.
    ranked = np.argsort(-scores, kind='stable')[:num_of_rec]

    # Copy so the new column is not written onto a slice of `courses`.
    rec_df = courses.iloc[ranked].copy()
    rec_df['similarity_scores'] = scores[ranked]

    return rec_df

In [None]:
# Demo: the ten best-matching rows of the f_21 table for a sample query.
user_input_rec(df=f_21, user_text='African american history', num_of_rec=10)