Explore TF-IDF embeddings for each item in the Zotero library

In [None]:
import sys
sys.path.append("./../") # To include ZoteroDB

from zoterodb import ZoteroDB
import pprint as pp

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Function to create summary from library
def build_summary(library):
    """Return a dict mapping each item key to its "<title>; <abstract>" text.

    ``library`` is a mapping of item key -> metadata mapping. An item is
    included only when both its 'title' and its 'abstractNote' entries are
    present and truthy; every other item is left out of the result.
    """
    texts = {}
    for key, entry in library.items():
        title = entry.get('title')
        abstract = entry.get('abstractNote')
        # Skip items that lack either piece of text
        if not (title and abstract):
            continue
        texts[key] = title + '; ' + abstract

    return texts

In [None]:
# Relative path of the data directory; passed to ZoteroDB when the database is opened
datadir = "./../data/"

In [None]:
# Create the ZoteroDB wrapper for the data directory
zotdb = ZoteroDB(datadir)

# Fetch the library. The code below treats it as a mapping of
# item id -> metadata dict (it calls .items() on it and .get() on each value).
# NOTE(review): exact key/field schema comes from ZoteroDB.get_library — confirm there.
library = zotdb.get_library()

In [None]:
# One "title; abstract" string per item; items missing either field are left out
summary = build_summary(library)

In [None]:
# Fit a TF-IDF model on the item summaries and embed every item
mapper = TfidfVectorizer(
    # --- text handling ---
    input='content',           # documents are given directly as strings
    analyzer='word',           # features are word tokens
    lowercase=True,
    strip_accents='unicode',
    stop_words='english',      # drop scikit-learn's built-in English stop words
    # --- vocabulary pruning ---
    min_df=10,                 # ignore terms appearing in fewer than 10 documents
    max_df=0.25,               # ignore terms appearing in more than 25% of documents
    # --- weighting ---
    use_idf=True,
    norm='l2',                 # each document vector is scaled to unit L2 norm
)

# Document-term matrix: row order follows the iteration order of `summary`
library_data = mapper.fit_transform(summary.values())

In [None]:
# Term -> column index, as learned by the fitted vectorizer
word_index_mapping = mapper.vocabulary_
# Reverse lookup: column index -> term
index_word_mapping = dict(
    zip(word_index_mapping.values(), word_index_mapping.keys())
)

In [None]:
# Choose one item at random (i is its row index in the TF-IDF matrix)
i = np.random.randint(low=0, high=len(summary))
# Column indices of the terms with a non-zero weight for item i
_, feature_idx = library_data[i, :].nonzero()

# Show the full library entry of the chosen item.
# Matrix rows were built from summary's values, so position i in its keys
# identifies the same item.
docid = list(summary)[i]
pp.PrettyPrinter(indent=2).pprint(library[docid])

# List every term of the item alongside its TF-IDF weight
for fidx in feature_idx:
    print(
        "{:20s}: {:.6f}".format(
            index_word_mapping[fidx],
            library_data[i, fidx],
        )
    )