### Using pyLDAvis topic visualization with a non-negative matrix factorization (NMF) model
* input previously obtained corpus of newspaper articles
* perform basic document text preprocessing
* vectorize the corpus using term frequency vectorizer
* fit a non-negative matrix factorization model for topic extraction
* munge NMF data components for input to pyLDAvis interactive topic visualization

Author: Tom Borgstadt
<br>Date: 12-17-2015</br>


In [1]:
import numpy as np 
import pandas as pd
from pandas import Series, DataFrame

import json
import datetime


In [2]:
def lower_case(List_of_Strings):
    """Return a new list containing the lower-cased form of every string.

    The input list is not modified.
    """
    lowered = []
    for text in List_of_Strings:
        lowered.append(text.lower())

    return lowered


In [3]:
def remove_spec_chars(List_of_Strs):
    """Strip links, punctuation and non-ASCII characters from each string.

    For every string, in order:
      1. replace each non-ASCII character with a space
      2. replace newlines with a space
      3. remove http links (done before punctuation is stripped so the
         link is still intact when it is matched)
      4. join hyphenated lower-case words ("diesel-powered" -> "dieselpowered")
      5. remove the characters  ! @ # $ ? & : ; , _ ( ) .  and double quotes
      6. collapse runs of spaces into a single space

    Whitespace is collapsed last because the earlier steps can themselves
    leave consecutive spaces behind.

    Parameters
    ----------
    List_of_Strs : iterable of str

    Returns
    -------
    list of str
        One cleaned string per input string, in the same order.
    """
    import re

    cleaned = []
    for text in List_of_Strs:
        text = ''.join(ch if ord(ch) < 128 else ' ' for ch in text)  # remove non-ascii chars
        text = text.replace("\n", " ")                                # linux new line
        text = re.sub(r"http\S+", "", text)                           # remove http links
        # look-arounds leave the neighbouring letters unconsumed, so chains
        # such as "a-b-c" lose every hyphen
        text = re.sub(r'(?<=[a-z])-(?=[a-z])', '', text)              # remove hyphens in hyphenated words
        text = re.sub("[!@#$?&:;,_().]", '', text)                    # remove general special characters
        text = re.sub('["]', '', text)                                # remove double ticks
        text = re.sub(' +', ' ', text)                                # remove extra whitespace

        cleaned.append(text)

    return cleaned
    

In [4]:
def remove_stop_words(List_of_Strings):
    """Remove stop words from every string in the list.

    The stop word list is scikit-learn's ENGLISH_STOP_WORDS extended with
    corpus-specific tokens (numbers, ticker symbols, boilerplate words).
    It is also published as the module-level global ``cachedStopWords``
    (a list), which is rebuilt on every call.

    Each string is split on whitespace and tokens are compared exactly
    (case-sensitive, punctuation included) against the stop word list;
    the surviving tokens are re-joined with single spaces.

    Parameters
    ----------
    List_of_Strings : iterable of str

    Returns
    -------
    list of str
        One filtered string per input string, in the same order.
    """
    from sklearn.feature_extraction import text

    global cachedStopWords

    skl_stopwords = text.ENGLISH_STOP_WORDS
    cachedStopWords = list(skl_stopwords) + ['00','000','01','02','03','04','05','06','07','08','09','10','100',
            '11', '12', '13', '14', '15', '16', '17', '18','19','20','21','22','23','24','25','26','27','28','29',
            '30','31','345','38','40','44','45','50','65','75','76','78','646','713','800','853','865','999','2000','2001',
            '2002','2003','2004','2005','2006','2007','2008','advertisement','maria','mr','photo','sheahan', 'reading',
            'continue','000270ks','005380ks','mike','ingram','english','112','124','136','150','189','200','main','story',
            '1000','10000','100000','101','1970s','1980s','1990','1990s','1998','1999','20000','2009','2010','2011','2012',
            '2013','2014','2015','288','300','002','004','005','005930ks','006400ks','009150ks','011760ks','066570ks',
            '007','008','010','0130','015','0151','025','027','031','035','042','0500','056','060','061','06564','06989',
            '32','33','34','35','350','36','39','41','42','43','47','48','500','51','52','53','54','55','60','600','6000',
            '600000','62','63','64','65','67','68','69','70','5000','50000','500000','700','7000','7203t','73','80','85',
            '86','87','88','89','90','900','90000','91','92','93','94','95','96','97','98','99','said','did']

    # The global stays a list for its other users; membership tests go
    # through a set so each token lookup is O(1) instead of a list scan.
    stop_word_lookup = set(cachedStopWords)

    z = [' '.join([word for word in string.split() if word not in stop_word_lookup])
         for string in List_of_Strings]

    return(z)


In [5]:
def cosine_similarity(v1,v2):
    """Return the cosine similarity of two numeric vectors.

    Computes (v1 dot v2) / (||v1|| * ||v2||), i.e. the dot product of the
    L2-normalized vectors.

    Parameters
    ----------
    v1, v2 : sequences of numbers
        Expected to have the same length. Elements are paired by position;
        any surplus elements of the longer sequence are ignored.

    Returns
    -------
    float
        The cosine similarity, or 0.0 when either vector has zero
        magnitude (the similarity is undefined there, and dividing would
        raise ZeroDivisionError or produce NaN).
    """
    from math import sqrt

    sumxx, sumxy, sumyy = 0, 0, 0
    for x, y in zip(v1, v2):
        sumxx += x*x
        sumyy += y*y
        sumxy += x*y

    denominator = sqrt(sumxx*sumyy)
    if denominator == 0:
        return 0.0

    return(sumxy/denominator)


In [6]:
def replace_dict_values(List_of_Strings):
    """Replace selected whole words with their canonical form.

    Every whole-word occurrence of a key of the internal mapping
    (e.g. 'diesels', 'cars', 'beijing') is replaced by the mapped value
    (e.g. 'diesel', 'car', 'china'). Matching is case-sensitive and
    respects word boundaries, so 'carson' is left untouched.

    Parameters
    ----------
    List_of_Strings : iterable of str

    Returns
    -------
    list of str
        One rewritten string per input string, in the same order.
    """
    import re

    d = {'diesels':'diesel',
         'dieselpowered':'diesel',
         'automobile':'auto',
         'testing':'test',
         'chinese':'china',
         'germany':'german',
         'korean':'korea',
         'cars':'car',
         'tests':'test',
         'australian':'australia',
         'beijing':'china'}

    # The pattern depends only on the mapping, so build it once rather
    # than once per input string.
    pattern = re.compile(r'\b(' + '|'.join(d.keys()) + r')\b')

    return [pattern.sub(lambda match: d[match.group()], s) for s in List_of_Strings]


## Main Line

In [10]:
# Locations of the input article files and of the project output directory
pathname = "/home/tom/gdrive/data science pgm/6304 Text Mining/project/data-volkswagen-diesel/"
begin_date = '20150920'   # only articles published on or after this date are kept

pathnameout = "/home/tom/gdrive/data science pgm/6304 Text Mining/project/"

# Input articles
articles_nytimes = pathname + "articles.text.ny.csv"
articles_reuters = pathname + "articles.text.others.csv"

# Load staging area (2 csv input files)
# pd.concat is used to stack the two files; DataFrame.append is no longer
# available in current pandas releases.
staged_df = pd.concat([pd.read_csv(articles_nytimes), pd.read_csv(articles_reuters)],
                      ignore_index=True)
print("\nnumber of staged articles: " + str(len(staged_df)))

# Load articles: parse publication dates, then filter on begin_date
staged_df['pub_date'] = pd.to_datetime(staged_df['pub_date'])
articles_df = staged_df.loc[staged_df.pub_date >= begin_date]
print("number of articles:        " + str(len(articles_df)))

# Take articles to a list of documents (one entry per article's 'text' column)
corpus = list(articles_df['text'])
print("number of documents:       " + str(len(corpus)))


number of staged articles: 610
number of articles:        578
number of documents:       578


In [11]:
# Preprocessing: run the corpus through each cleaning step in turn.
# The order matters -- every step receives the output of the previous one.
for preprocessing_step in (lower_case,
                           remove_stop_words,
                           remove_spec_chars,
                           replace_dict_values):
    corpus = preprocessing_step(corpus)


In [12]:
############################
# Vectorize corpus
############################
from sklearn.feature_extraction.text import TfidfVectorizer

# idf weighting and row normalization are both switched off.
# norm=None is the documented way to disable normalization.
v = TfidfVectorizer(binary = False,
                    use_idf = False,
                    norm = None,
                    lowercase = True, 
                    stop_words = cachedStopWords, 
                    min_df = .015,
                    max_df = .750,
                    ngram_range = (1,2))

v_model = v.fit_transform(corpus)
v_model_array = v_model.toarray()

# Newer scikit-learn releases expose get_feature_names_out() instead of
# get_feature_names(); support both.
if hasattr(v, 'get_feature_names_out'):
    v_model_features = list(v.get_feature_names_out())
else:
    v_model_features = v.get_feature_names()

################################################################
# Generate topics using Non-negative Matrix Factorization (NMF)
################################################################
from sklearn.decomposition import NMF

n_topics = 10
n_top_words = 5

nmf = NMF(n_components=n_topics, random_state=1).fit(v_model)

# confirm feature counts of vector and extracted topics are the same
# (checked before the topic indices are used to look up feature names)
if (int(v_model.shape[1]) != int(nmf.components_.shape[1])):
    raise ValueError('Error - number of document features do not equal topic features')

# extract and list topics each with top words
topic_vectors =[]
print('\n------------ NMF Topics with Top Words --------------\n')
# sort topics, pull out the indices then relate the topics back
for topic_idx, topic in enumerate(nmf.components_):
    # indices of the n_top_words largest weights, highest first
    top_word_indices = topic.argsort()[:-n_top_words - 1:-1]
    print("Topic #%d:" % topic_idx)
    print("|".join([v_model_features[i] for i in top_word_indices]))
    topic_vectors.append(top_word_indices)



------------ NMF Topics with Top Words --------------

Topic #0:
company|winterkorn|executive|board|chief
Topic #1:
test|european|europe|road|new
Topic #2:
rise|level|years|told|world
Topic #3:
german|scandal|industry|berlin|europe
Topic #4:
vw|billion|euros|scandal|billion euros
Topic #5:
sales|percent|year|vehicles|month
Topic #6:
china|data|carbon|energy|european
Topic #7:
vehicles|states|united states|united|engines
Topic #8:
software|epa|defeat|regulators|devices
Topic #9:
electric|vehicles|technology|electric car|vehicle


In [13]:
################################################
# Prepare data from NMF and Tfidf for pyLDAvis
################################################

# topic_term_dists
# Work on a copy: normalizing nmf.components_ in place would silently
# modify the fitted NMF model.
topic_term_dists = np.array(nmf.components_, dtype=float)

# normalize topic term vectors (unit L1 norm per topic) required by pyLDAvis
topic_term_dists /= np.abs(topic_term_dists).sum(axis=1, keepdims=True)

# every topic row must sum to 1 (within floating point tolerance);
# a row that could not be normalized (all zeros -> NaN) fails here too
if not np.allclose(topic_term_dists.sum(axis=1), 1.0):
    raise ValueError('Error - problem with topic term vector normalization')

# doc_topic_dists
# similarity of every document (rows of the term matrix) to every NMF topic;
# cosine similarity is scale-invariant, so the normalized topic vectors
# give the same values as the raw components
doc_topic = [[cosine_similarity(doc, topic) for topic in topic_term_dists]
             for doc in v_model_array]

doc_topic_dists = np.array(doc_topic, dtype=float)

# normalize topic features for each doc (unit L1 norm per document) required by pyLDAvis
doc_topic_dists /= np.abs(doc_topic_dists).sum(axis=1, keepdims=True)

# compare with a tolerance: exact float equality on the grand total is
# unreliable, and a per-row check catches individual bad documents
if not np.allclose(doc_topic_dists.sum(axis=1), 1.0):
    raise ValueError('Error - problem with document topic vector normalization')

# doc_lengths: total term weight per document
doc_lengths = np.sum(v_model_array, axis=1)
    
# vocabulary
vocabulary = np.array(v_model_features)

# term_frequency: total weight of each term over all documents
term_frequency = np.sum(v_model_array, axis=0)

pyLDAvis_data = {'topic_term_dists': topic_term_dists, 
                 'doc_topic_dists':  doc_topic_dists,
                 'doc_lengths':      doc_lengths,
                 'vocab':            vocabulary,
                 'term_frequency':   term_frequency}
import pyLDAvis
pyLDAvis.enable_notebook()

vis_data = pyLDAvis.prepare(**pyLDAvis_data)


  def _formatters_default(self):
  def _deferred_printers_default(self):
  def _singleton_printers_default(self):
  def _type_printers_default(self):


## NMF Topic Visualization with pyLDAvis

In [14]:
# Display the visualization data that pyLDAvis.prepare() produced (vis_data)
pyLDAvis.display(vis_data)


  def _ipython_display_formatter_default(self):
  def _singleton_printers_default(self):
  def _type_printers_default(self):
  def _deferred_printers_default(self):
