# Recommender System
## Content-based filtering

In [1]:
import numpy as np
import pandas as pd
import pickle
from nltk.tokenize import word_tokenize
from topic_modeling import prep_text
from sklearn.metrics import pairwise_distances

In [2]:
# Load preprocessed text.
# The context manager closes the file even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by this project.
with open('processed_text.pkl', 'rb') as infile:
    text_l = pickle.load(infile)

In [3]:
# Pass the loaded text through the project's prep_text helper (imported from
# topic_modeling; its exact transformation is not visible in this notebook).
text = prep_text(text_l)

In [4]:
# Load vectorizer.
# The context manager closes the file even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by this project.
with open('tfidf.pkl', 'rb') as infile:
    tfidf = pickle.load(infile)

In [5]:
# Load model.
# The context manager closes the file even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by this project.
with open('nmf_model.pkl', 'rb') as infile:
    nmf_model = pickle.load(infile)

In [6]:
# Abstract labels (PMIDs).
# The context manager closes the file even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by this project.
with open('pmids.pkl', 'rb') as infile:
    pmids = pickle.load(infile)

In [7]:
# Topic labels, listed in topic-index order (position i names topic i)
topic_labels = [
    'clinical recommendations',
    'pathophysiology',
    'prognosis',
    'children',
    'cancer',
    'mental health',
    'diagnosis/serology',
    'resources & technology',
    'vaccine',
    'predictions/forecasting',
    'transmission',
    'global impact',
    'pregnancy',
    'surgery',
    'treatment',
    'microbiology',
    'education',
    'clinical features',
    'deaths',
    'healthcare workers & PPE',
]

# Topics: map each topic index (0-19) to its label
topic_names = dict(enumerate(topic_labels))

In [8]:
# Create document-term matrix
# NOTE(review): fit_transform re-fits the unpickled vectorizer on `text`; if
# it was saved already fitted, transform() would reuse its vocabulary -- confirm.
doc_word = tfidf.fit_transform(text)

# Vocabulary terms in column order. Presumably `tfidf` is a scikit-learn
# vectorizer: get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2, so prefer get_feature_names_out() and fall back only on
# older releases that lack it.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# Convert to dataframe (dense: one row per PMID, one column per term)
doc_word_df = pd.DataFrame(doc_word.toarray(), index=pmids, columns=feature_names)
doc_word_df.head()



Unnamed: 0,abbott,abdomen,abdominal_pain,ability,abnormality,absence,absent,absolute,absorption,abstract,...,yield,youth,zealand,zhejiang,zhejiang_province,zinc,zip_code,zone,zoom,zoonosis
33152797,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33152773,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33152771,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33152743,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33152729,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Build the document-topic matrix from the document-term matrix.
# NOTE(review): fit_transform re-fits the unpickled model here; if a re-fit
# can order topics differently from the saved model, the hand-written topic
# labels would no longer line up -- confirm.
doc_topic = nmf_model.fit_transform(doc_word)

# Label rows with PMIDs and columns with topic names; weights rounded to 5 d.p.
doc_topic_df = pd.DataFrame(
    data=np.round(doc_topic, 5),
    index=pmids,
    columns=topic_labels,
)
doc_topic_df.head()



Unnamed: 0,clinical recommendations,pathophysiology,prognosis,children,cancer,mental health,diagnosis/serology,resources & technology,vaccine,predictions/forecasting,transmission,global impact,pregnancy,surgery,treatment,microbiology,education,clinical features,deaths,healthcare workers & PPE
33152797,0.0,0.0,0.03711,0.00023,0.0,0.0,0.0,0.0,0.0,0.0,0.00068,0.0,0.06743,0.0038,0.0,0.0,0.0,0.00854,0.00124,0.00073
33152773,0.00882,0.00016,0.0,0.00027,0.00042,0.0,0.0,0.01005,0.0,0.00311,0.0,0.00276,0.00123,0.0,0.0,0.0,0.00107,0.00682,0.00265,0.0
33152771,0.00445,0.0017,0.0046,0.0,0.00082,0.0,0.00062,0.01005,0.00105,0.00423,0.00385,0.00558,0.0,0.04077,0.00442,0.0,0.0,0.00148,0.0,0.00658
33152743,9e-05,0.00049,0.01045,0.05756,0.00031,0.00193,0.0,0.00532,0.00054,1e-05,0.00226,0.02224,0.0,0.00024,0.0,0.0,0.00504,0.0,0.0,0.0
33152729,0.0,0.0,0.00019,0.0,0.0,0.04259,0.0,0.01168,0.0,0.0,0.0,0.0,0.00113,0.0,0.0,0.0,0.0,0.0008,0.0,0.11183


### Get distances

In [None]:
# Calculate pairwise cosine distances between the rows of doc_topic_df
# (one row per abstract, one column per topic)
dists = pairwise_distances(doc_topic_df, metric='cosine')
# Display the shape of the resulting distance array
dists.shape

In [28]:
# Wrap the raw distance array in a DataFrame whose rows and columns are both
# labelled with the abstracts' index labels (taken from doc_word_df)
dists = pd.DataFrame(dists, doc_word_df.index, doc_word_df.index)

# Preview the top-left 5 x 5 corner
dists.iloc[:5, :5]

Unnamed: 0,33152797,33152773,33152771,33152743,33152729
33152797,0.0,0.8827256,0.8987429,0.9175295,0.981585
33152773,0.882726,1.110223e-16,0.7434067,0.8635391,0.934988
33152771,0.898743,0.7434067,1.110223e-16,0.9113495,0.838428
33152743,0.91753,0.8635391,0.9113495,1.110223e-16,0.980711
33152729,0.981585,0.9349881,0.8384279,0.9807115,0.0


In [None]:
# Save/pickle distances


In [58]:
# Test: abstract I am interested in (PMID given as a one-element list so that
# dists[...] below returns a DataFrame rather than a Series)
chosen_abstract = ['32986153']
# Distances from every abstract to the chosen one (first five rows)
dists[chosen_abstract].head()

Unnamed: 0,32986153
33152797,0.984622
33152773,0.971178
33152771,0.991794
33152743,0.082001
33152729,0.992679


In [59]:
# Total distance from each abstract to the chosen abstract(s), smallest first
abstracts_summed = dists[chosen_abstract].sum(axis='columns').sort_values()
abstracts_summed.head()

32986153    0.000000
32437077    0.000505
32249943    0.000595
32910826    0.000734
32702413    0.000753
dtype: float64

In [60]:
# Drop the input abstracts themselves from the ranking
mask = np.logical_not(abstracts_summed.index.isin(chosen_abstract))
ranked_abstracts = abstracts_summed.index[mask].tolist()
ranked_abstracts[0:10]

['32437077',
 '32249943',
 '32910826',
 '32702413',
 '32812796',
 '32719757',
 '32441753',
 '32953347',
 '32372803',
 '32732461']

In [61]:
# Evaluate results: keep the five top-ranked abstracts as the recommendations
recommendations = ranked_abstracts[:5]
recommendations

['32437077', '32249943', '32910826', '32702413', '32812796']

In [62]:
# Copy the document-term matrix and tag every row as the chosen abstract,
# a recommendation, or neither (the first matching condition wins)
df_eval = doc_word_df.copy()
df_eval['rec_label'] = np.select(
    [df_eval.index.isin(chosen_abstract), df_eval.index.isin(recommendations)],
    ['Chosen', 'Recommended'],
    default='Other',
)

In [63]:
# Show only the rows labelled as the chosen abstract or as a recommendation
df_eval[df_eval.rec_label.isin(['Chosen', 'Recommended'])]

Unnamed: 0,abbott,abdomen,abdominal_pain,ability,abnormality,absence,absent,absolute,absorption,abstract,...,youth,zealand,zhejiang,zhejiang_province,zinc,zip_code,zone,zoom,zoonosis,rec_label
32986153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Chosen
32910826,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Recommended
32812796,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Recommended
32702413,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Recommended
32437077,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Recommended
32249943,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Recommended


In [None]:
# Function
def recommend_abstracts(chosen_abstract, num_recs=5, metric='cosine'):
    """Recommend the abstracts closest in topic space to the chosen one(s).

    Distances are computed between rows of the module-level ``doc_topic_df``
    and summed over all chosen abstracts, so the lowest total ranks first.
    The recommendations are also printed, one per line.

    Assumes ``doc_topic_df`` and ``doc_word_df`` share the same index (both
    are built from the same PMID list in this notebook).

    Parameters
    ----------
    chosen_abstract : label or list-like of labels
        Index label(s) to base the recommendations on. A single label may be
        passed bare instead of wrapped in a list.
    num_recs : int, default 5
        Number of recommendations to return.
    metric : str or callable, default 'cosine'
        Passed through to ``pairwise_distances``.

    Returns
    -------
    pandas.DataFrame
        Rows of ``doc_word_df`` for the chosen and recommended abstracts, in
        ``doc_word_df`` order, plus a ``rec_label`` column holding 'Chosen'
        or 'Recommended'.

    Raises
    ------
    ValueError
        If no abstract is given.
    KeyError
        If any chosen label is missing from ``doc_topic_df``.
    """
    # Accept a bare label (e.g. a single PMID string) as well as a list-like.
    if np.isscalar(chosen_abstract):
        chosen = [chosen_abstract]
    else:
        chosen = list(chosen_abstract)

    if not chosen:
        raise ValueError("chosen_abstract must contain at least one label")

    missing = [label for label in chosen if label not in doc_topic_df.index]
    if missing:
        raise KeyError("Abstract(s) not found: {}".format(missing))

    # Only the chosen rows are needed as the second operand, so this avoids
    # building the full N x N distance matrix on every call.
    chosen_vectors = doc_topic_df.loc[chosen]
    dists = pairwise_distances(doc_topic_df, chosen_vectors, metric=metric)

    # Sum through pandas so NaN distances are skipped, as a DataFrame sum does.
    abstracts_summed = pd.DataFrame(dists, index=doc_topic_df.index).sum(axis=1)
    abstracts_summed = abstracts_summed.sort_values(ascending=True)

    # The chosen abstracts have zero distance to themselves; drop them.
    ranked_abstracts = abstracts_summed.index[~abstracts_summed.index.isin(chosen)]
    recommendations = ranked_abstracts[:num_recs].tolist()

    print("Recommendations: ")
    # str() keeps the join working when the index labels are not strings.
    print('\t' + '\n\t'.join(map(str, recommendations)))

    # Select the few relevant rows first instead of copying the whole
    # document-term matrix just to label and then discard most of it.
    is_chosen = doc_word_df.index.isin(chosen)
    keep = is_chosen | doc_word_df.index.isin(recommendations)
    df_eval = doc_word_df[keep].copy()
    df_eval['rec_label'] = np.where(is_chosen[keep], 'Chosen', 'Recommended')

    return df_eval