In [None]:
import numpy as np
import pandas as pd
from utils import read
from search import helpers

from sklearn.feature_extraction.text import TfidfVectorizer
# Note that the tf-idf functionality in sklearn.feature_extraction.text can produce
# normalized vectors, in which case cosine_similarity is equivalent to linear_kernel, only slower.
# https://scikit-learn.org/stable/modules/metrics.html#cosine-similarity
from sklearn.metrics.pairwise import linear_kernel

%reload_ext autoreload
%autoreload 2

In [None]:
# Text fields per course. Per the original note, the transposed frame is indexed
# by course slug; name that index and promote it to a regular column.
courses_text = read.read_json_processed('courses-text')
slug_keywords = (
    pd.DataFrame(courses_text)
    .T[['summary', 'content', 'keywords']]
    .rename_axis('slug')
    .reset_index()
)

# Course metadata: only the identifying columns are needed here.
courses = read.read_json_processed('courses')
slug_code_name = pd.DataFrame(courses)[['slug', 'code', 'name']]

# 'slug' is the only column the two frames share, so it is the join key.
df = pd.merge(slug_keywords, slug_code_name, on='slug')

# One searchable string per course: the text fields joined with single spaces
# (Series.str.cat leaves out missing values), then trimmed.
text_columns = ['name', 'content', 'summary', 'keywords']
df['token_string'] = (
    df[text_columns]
    .apply(lambda fields: fields.str.cat(sep=' '), axis=1)
    .str.strip()
)
# Same string with the slug appended as one more token.
df['token_string_with_slug'] = df['token_string'].str.cat(df['slug'], sep=' ')

In [None]:
# Two feature matrices are built from the same vectorizer settings:
#   * `features`          -- course text only, used for course-to-course similarity
#   * `features_withslug` -- course text plus slug, used together with `tfidf`
# Each corpus gets its own vectorizer instance. Re-fitting a single instance
# overwrites its vocabulary, which would make the pairing between `tfidf` and
# `features_withslug` depend on the order of the statements in this cell.
vectorizer_params = dict(analyzer='word', preprocessor=helpers.clean_text)

# Vectorizer for the slug-free text is only needed to produce `features`.
features = TfidfVectorizer(**vectorizer_params).fit_transform(df.token_string)

# `tfidf` is the vectorizer fitted on the with-slug text, so its vocabulary
# lines up with the columns of `features_withslug`.
tfidf = TfidfVectorizer(**vectorizer_params)
features_withslug = tfidf.fit_transform(df.token_string_with_slug)

In [None]:
# Row position -> course slug lookup, saved under the name 'index'.
# NOTE(review): `name=None` looks like a no-op here -- pandas appears to fall back
# to the input Series' own name ('slug'); confirm whether an unnamed Series was intended.
courseindex = pd.Series(df['slug'], name=None)
helpers.save_df(courseindex, 'index')

In [None]:
# Pairwise dot products between all course feature vectors. As the note next to
# the linear_kernel import explains, this equals cosine similarity when the
# tf-idf vectors are normalized.
cos = linear_kernel(features)
# Keep only the strictly lower triangle: k=-1 zeroes the diagonal and everything
# above it, so each pair appears once and no course is paired with itself.
lower_triangle = np.tril(cos, k=-1)
cosdf = pd.DataFrame(lower_triangle).round(2)

In [None]:
# Save one sparse similarity matrix per threshold (0.20, 0.25, ..., 0.95) as
# 'sim20', 'sim25', ..., 'sim95'.
# Every pass derives its matrix from the untouched `cosdf`, so the cell can be
# re-run (or the range edited) without a previous, stricter threshold leaking
# into the files written for the looser ones.
for i in range(20, 100, 5):
    sim_threshold = i/100
    # `mask` returns a new frame with every similarity below the threshold set to 0
    thresholded = cosdf.mask(cosdf < sim_threshold, 0)
    # zeros become the implicit fill value of the sparse representation
    scosdf = thresholded.astype(pd.SparseDtype('float', fill_value=0))
    print(f'Similarity threshold: {sim_threshold} --- Sparse matrix density: {scosdf.sparse.density}')
    filename = f'sim{i}'
    helpers.save_sparsemat(scosdf.sparse.to_coo(), filename)

In [None]:
# Persist the merged course table under the name 'courses_df'.
helpers.save_df(df, 'courses_df')
# include slug in features for query engine (enables search by course code)
# -> the matrix stored as 'features' is the with-slug one, not `features`.
helpers.save_sparsemat(features_withslug, 'features')
# `tfidf` was (last) fitted on the with-slug text, so the pickled vectorizer
# matches the feature matrix saved above.
helpers.save_pickle(tfidf, 'tfidf')