In [None]:
import os
import json

import pandas as pd
from gensim.corpora import Dictionary
from gensim.models.wrappers import LdaMallet

In [None]:
def transform_dt(dt):
    """Flatten a topic x document matrix into three parallel lists.

    Adapted from prepare-data
    (https://github.com/agoldst/dfr-browser/blob/master/bin/prepare-data).

    Parameters
    ----------
    dt : sequence of sequences
        One row per topic; each row is indexed by document position. The
        number of documents is taken from the length of the first row.

    Returns
    -------
    dict
        ``{"i": i, "p": p, "x": x}`` where

        * ``x`` holds the non-zero weights (converted with ``int()``),
        * ``i`` holds the document position of each weight in ``x``,
        * ``p`` holds the row bounds: ``p[t + 1] - p[t]`` is the number of
          documents with a non-zero weight in topic ``t``.

        An empty ``dt`` yields ``{"i": [], "p": [0], "x": []}``.
    """
    # An empty matrix has no first row to measure; return the empty encoding
    # instead of failing on dt[0].
    if len(dt) == 0:
        return {"i": [], "p": [0], "x": []}

    D = len(dt[0])
    i = []
    x = []
    p = [0]
    for topic_docs in dt:
        for d in range(D):
            weight = topic_docs[d]
            if weight != 0:
                i.append(int(d))
                # int() truncates: a fractional weight in (-1, 1) is kept as
                # an entry but stored as 0.
                x.append(int(weight))
        # Everything appended so far belongs to the topics seen up to here.
        p.append(len(i))

    return {"i": i, "p": p, "x": x}


def get_tw(model, n=50):
    """Collect the top ``n`` words and weights of every topic in ``model``.

    Parameters
    ----------
    model : object
        Must expose ``num_topics``, ``topic_threshold`` and
        ``show_topic(topic_id, topn=...)`` returning ``(word, weight)`` pairs.
    n : int, default 50
        Number of words requested per topic.

    Returns
    -------
    dict
        ``{"alpha": [...], "tw": [...]}``; ``tw`` has one
        ``{"words": [...], "weights": [...]}`` entry per topic, and ``alpha``
        repeats ``model.topic_threshold`` once per topic.
    """
    def _topic_entry(topic_id):
        # Split the (word, weight) pairs into two parallel lists.
        terms, scores = zip(*model.show_topic(topic_id, topn=n))
        return dict(words=list(terms), weights=list(scores))

    per_topic = [_topic_entry(topic_id) for topic_id in range(model.num_topics)]

    # NOTE(review): alpha is filled with model.topic_threshold rather than a
    # per-topic hyperparameter -- confirm this is intended.
    alpha = [model.topic_threshold] * model.num_topics

    return dict(alpha=alpha, tw=per_topic)

In [None]:
def get_top_words(word_topics, topic=None, topn=50):
    """Return the ``topn`` highest-weighted words for one or more topics.

    Parameters
    ----------
    word_topics : pandas.DataFrame
        One row per topic (selected with ``.loc``); the column labels become
        the ``word`` values and the cells the ``weight`` values.
    topic : label or iterable of labels, optional
        Topic(s) to report. ``None`` (default) reports every row of
        ``word_topics``. A single label, including a string label, is treated
        as one topic.
    topn : int, default 50
        Maximum number of words kept per topic.

    Returns
    -------
    pandas.DataFrame
        Columns ``topic``, ``word``, ``weight`` (weight cast to ``int``), with
        each topic's rows ordered by descending weight. The per-topic
        positional index is preserved, so index values repeat across topics.
        When no topics are selected an empty frame with these columns is
        returned.
    """
    if topic is None:
        topics = list(word_topics.index)
    elif isinstance(topic, (str, bytes)) or not hasattr(topic, '__iter__'):
        # Strings are iterable but denote a single label, not many topics.
        topics = [topic]
    else:
        topics = list(topic)

    columns = ['topic', 'word', 'weight']

    frames = []
    # Iterate the requested topics (not the whole index) so that the
    # ``topic`` argument actually restricts the output.
    for i in topics:
        words = word_topics.loc[i].sort_values(ascending=False)
        words = words.head(topn).reset_index()
        words.columns = ['word', 'weight']
        words['topic'] = i
        frames.append(words[columns])

    if not frames:
        return pd.DataFrame(columns=columns).astype({'weight': int})

    # Concatenate once instead of growing a frame inside the loop.
    top_words = pd.concat(frames, axis=0)
    top_words['weight'] = top_words['weight'].astype(int)
    return top_words

# Similarity functions

In [None]:
from sklearn.metrics import euclidean_distances

In [None]:
def print_result_links(docs, id, similarity):
    """Print the id, title and links of one document, plus its similarity.

    Parameters
    ----------
    docs : object
        Must expose ``doclist``, a DataFrame with at least the columns
        ``id``, ``title``, ``txt_url`` and ``pdf_url``.
    id : label
        Value looked up in ``doclist['id']``; the first matching row is used.
        An ``IndexError`` propagates from ``.iloc[0]`` when nothing matches.
    similarity : float or None
        When not ``None``, printed as a percentage (``100 * similarity``) with
        two decimals; when ``None`` the similarity line is omitted.
    """
    doclist = docs.doclist
    wanted_columns = ['id', 'title', 'txt_url', 'pdf_url']
    doc = doclist.loc[doclist['id'] == id, wanted_columns].iloc[0]

    header = (
        f'id: {doc["id"]} \n'
        f'title: {doc["title"]} \n'
        f'url: {doc["txt_url"]} \n'
        f'pdf_url: {doc["pdf_url"]}\n'
    )

    if similarity is None:
        footer = ''
    else:
        footer = f'similarity: {100 * similarity:0.2f}%\n'

    print(header + footer)
    

def close_docs(docs, doc_id, num_docs, report=False, **kwargs):
    """Return the ids of the ``num_docs`` rows of ``dt`` nearest to ``doc_id``.

    Nearness is the Euclidean distance between the row of ``dt`` labelled
    ``doc_id`` and every row of ``dt``, so the query row is itself one of the
    candidates.

    Parameters
    ----------
    docs : object
        Only used when ``report`` is true; passed through to
        ``print_result_links``.
    doc_id : label
        Index label of the query row in ``dt``.
    num_docs : int
        Number of nearest rows to return.
    report : bool, default False
        When true, print the links of every returned document.
    **kwargs
        Must contain ``dt``, a DataFrame indexed by document id (presumably a
        document x topic matrix -- confirm against the caller). A missing
        ``dt`` raises ``KeyError``.

    Returns
    -------
    pandas.Index
        Labels of the nearest rows, in ascending order of distance.
    """
    dt = kwargs['dt']

    # euclidean_distances expects 2-D input, hence the single-row reshape.
    query = dt.loc[doc_id].values.reshape(1, -1)
    distances = euclidean_distances(query, dt.values)

    nearest_positions = distances.argsort()[0, :num_docs]
    doc_ids = dt.index[nearest_positions]

    if report:
        for neighbour_id in doc_ids:
            print_result_links(docs, neighbour_id, None)

    return doc_ids