In [1]:
# Set the MongoDB connection string for this kernel session (local instance,
# default port). Presumably read by the project's `articles` module when it
# opens its connection — TODO confirm.
%env MONGODB_URI=mongodb://localhost:27017/

env: MONGODB_URI=mongodb://localhost:27017/


In [2]:
from scipy.cluster.hierarchy import ward, dendrogram
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

from articles import articles
from tokenizer.tokenize_and_stem import tokenize_and_stem
from tokenizer.remove_html import remove_html
from vectorization import tf_idf
from performance.benchmarks import Benchmarks

In [3]:
def run_pipeline(parameters=None):
    """Fetch articles, vectorise them with TF-IDF and plot a Ward dendrogram.

    Each stage is timed through a ``Benchmarks`` instance, and the resulting
    figure is written to ``data/ward_clusters.png``.

    Args:
        parameters: mapping with at least these entries:
            ``'documents'`` -- mapping with ``'query'`` (passed to
            ``articles.get_articles``) and ``'limit'`` (passed to
            ``.limit()`` on its result);
            ``'tf_idf'`` -- passed through unchanged to ``tf_idf.fit_texts``.
            Any other keys are ignored by this function.

    Returns:
        None. The function is called for its side effects (benchmarks,
        the matplotlib figure and the saved PNG).

    Raises:
        KeyError: if ``'documents'`` or ``'tf_idf'`` (or the nested
            ``'query'`` / ``'limit'`` keys) are missing.
    """
    if parameters is None:
        # Avoid a shared mutable default argument. An empty mapping still
        # fails below with KeyError, exactly as the former `{}` default did.
        parameters = {}

    benchmarks = Benchmarks()

    ####################################################
    # 1. get the documents and process them
    ####################################################

    doc_params = parameters['documents']
    documents = articles.get_articles(doc_params['query']).limit(doc_params['limit'])
    article_docs = list(documents)  # materialise the result set once
    benchmarks.add_benchmark('1-get-articles')

    # 1a. get just body documents (and the titles used as leaf labels)
    texts = articles.get_document_texts(article_docs)
    titles = articles.get_document_titles(article_docs)
    benchmarks.add_benchmark('2-get-body-documents')

    # 1b. remove html
    texts = [remove_html(text) for text in texts]
    benchmarks.add_benchmark('3-remove-html')

    ####################################################
    # 2. tf_idf
    ####################################################

    # The fitted vectorizer is returned as well but is not needed here.
    _vectorizer, matrix = tf_idf.fit_texts(texts, tokenize_and_stem,
                                           parameters['tf_idf'])
    benchmarks.add_benchmark('4-tf-idf')

    ####################################################
    # 3. hierarchical clustering
    ####################################################

    dist = 1 - cosine_similarity(matrix)

    # NOTE(review): `dist` is a square (n x n) matrix. scipy's `ward` treats a
    # 2-D input as n observation vectors, not as precomputed distances, so the
    # rows of `dist` are clustered as feature vectors. If precomputed
    # distances were intended, a condensed matrix is required (e.g. via
    # scipy.spatial.distance.squareform). Left unchanged because fixing it
    # alters the clustering result — confirm the intent before changing.
    linkage_matrix = ward(dist)
    benchmarks.add_benchmark('5-hiearchical-clustering')

    fig, ax = plt.subplots(figsize=(15, 20))  # set size
    # Draw on the axes created above; the dict returned by dendrogram() is
    # not needed, so `ax` is no longer overwritten with it.
    dendrogram(linkage_matrix, orientation="right", labels=titles, ax=ax)

    # Hide the x-axis ticks and tick labels. These must be real booleans:
    # the strings 'off' are truthy and leave the ticks visible in current
    # matplotlib releases.
    ax.tick_params(
        axis='x',           # changes apply to the x-axis
        which='both',       # both major and minor ticks are affected
        bottom=False,       # ticks along the bottom edge are off
        top=False,          # ticks along the top edge are off
        labelbottom=False)  # labels along the bottom edge are off

    fig.tight_layout()  # show plot with tight layout

    benchmarks.add_benchmark('6-displaying-result')

    # Save the figure as ward_clusters.png (assumes the `data/` directory
    # already exists relative to the working directory).
    fig.savefig('data/ward_clusters.png', dpi=300)

In [4]:
# Keep only articles from the two news sections (an `$or` filter over sectionId).
news_only_query = {'$or': [{'sectionId': section} for section in ('world', 'uk-news')]}

parameters = dict(
    # Which articles to load and how many.
    documents=dict(query=news_only_query, limit=10),
    # Options forwarded to the TF-IDF step.
    tf_idf=dict(ngram_range=(1, 2), min_df=2, max_df=1.0, max_features=None),
    # Not read by run_pipeline as defined in this notebook; kept so the same
    # parameter structure can be shared with other pipelines.
    lda=dict(
        num_topics=20,
        no_below=1,
        no_above=0.8,
        update_every=5,
        chunksize=10000,
        passes=100,
    ),
)

run_pipeline(parameters)

{'id': '1-get-articles', 'time': 0.002642000000000033}
{'id': '2-get-body-documents', 'time': 0.00011300000000002974}
{'id': '3-remove-html', 'time': 0.02399600000000013}
{'id': '4-tf-idf', 'time': 0.2814030000000003}
{'id': '5-hiearchical-clustering', 'time': 0.0021100000000000563}
{'id': '6-displaying-result', 'time': 0.15028799999999976}
