One vs All Method

Train NMF for each topic separately.

Use all Wiki articles as Background Corpus.

In [1]:
import pandas as pd
import numpy as np
from time import time

import nltk

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF

import matplotlib.pyplot as plt
from math import pi

from omterms.interface import *

import pickle

from ipywidgets import interact, fixed
from IPython.display import display

import requests
import json

import re

import requests
from bs4 import BeautifulSoup

import libs.synonyms as syn
import libs.text_preprocess as tp

import warnings
warnings.filterwarnings("ignore")



## Plots and Prints

In [2]:
# Schwartz value names in model order: position i is used to label the i-th
# per-value model/array (see `categories[i]`, `categories[theme]` and
# `categories.index(...)` in the functions below).
categories=['universalism', 'hedonism', 'achievement', 'power',
       'self-direction', 'benevolence', 'conformity', 'tradition', 'stimulation',
       'security']

# The same ten values in presentation order (radar-chart axes, exported columns);
# code maps a name from this list to model order with `categories.index(name)`.
schwartz =['universalism', 'benevolence', 'conformity', 'tradition',
       'security', 'power', 'achievement', 'hedonism', 'stimulation',
       'self-direction']

def plot_radar_chart(doc_topic_cumul, doc, doc_names):
    """Draw, save and show a radar (spider) chart of one document's value scores.

    :param doc_topic_cumul: per-document scores, indexed as
        ``doc_topic_cumul[doc][k]`` with ``k`` in ``categories`` order
    :param doc: index of the document to plot
    :param doc_names: indexable of document names; ``doc_names[doc]`` goes in the title
    :return: None; the figure is written to ``"Schwartz_Chart_<doc>"`` and displayed
    """
    # Re-order the document's scores from `categories` order to `schwartz` order.
    doc_scores = doc_topic_cumul[doc]
    scores = [doc_scores[categories.index(value)] for value in schwartz]

    # One evenly spaced axis per value.
    axis_count = len(schwartz)
    axis_angles = [k / float(axis_count) * 2 * pi for k in range(axis_count)]
    # Repeat the first point at the end so the outline closes on itself.
    closed_angles = axis_angles + axis_angles[:1]
    closed_scores = list(scores) + list(scores[:1])

    # ------- Background: polar axes with the first axis on top, clockwise
    plt.figure(figsize=(8,8))
    ax = plt.subplot(111, polar=True)
    ax.set_theta_offset(pi / 2)
    ax.set_theta_direction(-1)

    # Axis labels and radial grid labels.
    plt.xticks(axis_angles, schwartz)
    ax.set_rlabel_position(0)
    plt.yticks([25,50,75], ["25","50","75"], color="grey", size=7)
    plt.ylim(0,100)

    # ------- Data: a single outlined, lightly filled polygon
    ax.plot(closed_angles, closed_scores, linewidth=1, linestyle='solid')
    ax.fill(closed_angles, closed_scores, 'b', alpha=0.1)

    plt.title("Schwartz Chart - " + doc_names[doc])
    plt.savefig("Schwartz_Chart_" + str(doc))
    plt.show()
    
    
class color:
    """ANSI escape sequences used to colour and emphasise printed text."""

    # Text attributes; END resets everything back to the terminal default.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'

    # Foreground colours.
    DARKCYAN = '\033[36m'
    RED = '\033[91m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    
    
def print_top_words(model, theme, tfidf_vectorizer, n_top_words, n_topics=3):
    """Print the highest-weighted words of the first `n_topics` topics of one value model.

    :param model: indexable of fitted models; ``model[theme].components_`` holds
        one weight row per topic
    :param theme: index into `model` and `categories`
    :param tfidf_vectorizer: fitted vectorizer providing the vocabulary
    :param n_top_words: number of words printed per topic
    :param n_topics: number of leading topics to print
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
    # when it exists and fall back to the old method on older installations.
    name_getter = getattr(tfidf_vectorizer, "get_feature_names_out", None)
    if name_getter is None:
        name_getter = tfidf_vectorizer.get_feature_names
    feature_names = name_getter()

    print(color.CYAN + color.BOLD + categories[theme] + color.END)
    for topic_idx, topic in enumerate(model[theme].components_):
        # Plain comparison instead of `topic_idx / n_topics == 1`, which raised
        # ZeroDivisionError for n_topics == 0.
        if topic_idx == n_topics:
            break
        message = color.BOLD + "Topic #%d: " % topic_idx + color.END
        # argsort is ascending, so walk it backwards to get the largest weights first.
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()
    
def print_cumulative_train_doc_topics(data, doc_topic, doc, n_best):
    """Print the `n_best` highest-scoring values of one training document on one line.

    :param data: DataFrame whose row `doc` has a ``'theme'`` entry (printed as the label)
    :param doc_topic: per-document score arrays; ``doc_topic[doc]`` must support ``argsort``
    :param doc: document index
    :param n_best: number of entries to print
    """
    test_theme = data.iloc[doc]['theme']
    print(color.BOLD + "Doc " + str(doc) + color.RED +  " (" + test_theme + ")\t: " + color.END, end='')
    dt = doc_topic[doc]
    for i in dt.argsort()[:-n_best - 1:-1]:
        # Columns beyond the named values are shown as "General".
        try:
            label = categories[i]
        except IndexError:
            label = "General"
        print("(", end='')
        print(color.CYAN + color.BOLD + label + color.END, end='')
        print(", %d, %.2lf)  " %(i, dt[i]), end='')
    print()
    
def print_cumulative_test_doc_topics(W_test_norms, doc, n_best):
    """Print the `n_best` highest-scoring (value, sub-topic) pairs of one test document.

    :param W_test_norms: array indexed ``[value, doc, sub-topic]``
    :param doc: document index
    :param n_best: number of entries to print
    """
    print(color.BOLD + "Doc " + str(doc) + "\t: " + color.END, end='')
    n_topics = W_test_norms.shape[2]
    # Flattening is value-major: flat index i belongs to value i // n_topics,
    # sub-topic i % n_topics.
    dt = W_test_norms[:,doc,:].flatten()
    # Fixed slice: `[:n_best - 1:-1]` stopped at position n_best and therefore
    # printed len(dt) - n_best entries; the top n_best are `[:-n_best - 1:-1]`.
    for i in dt.argsort()[:-n_best - 1:-1]:
        print("(", end='')

        print(color.CYAN + color.BOLD + categories[i//n_topics] + color.END, end='')

        print(" (%d), %.2lf)  " %(i%n_topics, dt[i]), end='')
    print()

def print_doc_topics(doc_topic, doc, n_best, n_topics=3):
    """Print the `n_best` highest-scoring topic columns of one document on one line.

    :param doc_topic: per-document score arrays; ``doc_topic[doc]`` must support ``argsort``
    :param doc: document index
    :param n_best: number of entries to print
    :param n_topics: number of consecutive columns that belong to each value
        (defaults to 3, the value that used to be hard-coded)
    """
    print(color.BOLD + "Doc " + str(doc) + "\t: " + color.END, end='')
    for i in doc_topic[doc].argsort()[:-n_best - 1:-1]:
        # Columns past the last named value are shown as "General".
        try:
            label = categories[i//n_topics]
        except IndexError:
            label = "General"
        print("(", end='')
        print(color.CYAN + color.BOLD + label + color.END, end='')
        print(", %d, %.2lf)  " %(i, doc_topic[doc][i]), end='')
    print()

def print_train_results(doc_topic, doc, corpus, data, doc_names=None):
    """Print a text excerpt, the value ranking and the radar chart of a training document.

    :param doc_topic: per-document value scores
    :param doc: document index
    :param corpus: indexable of document texts
    :param data: DataFrame passed on to `print_cumulative_train_doc_topics`
    :param doc_names: optional document names for the chart title; when omitted,
        ``"Document <i>"`` is used for every document in `corpus`
    """
    print(color.BOLD + "Document " + str(doc) + color.END)
    print()
    print(color.BOLD + "Text: " + color.END)
    # Show up to 500 characters starting one third of the way into the text.
    excerpt_start = len(corpus[doc])//3
    print("..." + corpus[doc][excerpt_start:excerpt_start+500] + "...")
    print()
    print()

    print(color.BOLD + "Topic Distribution: " + color.END)
    print_cumulative_train_doc_topics(data, doc_topic, doc, 11)
    print()

    # plot_radar_chart requires document names; the previous two-argument call
    # always raised TypeError.
    if doc_names is None:
        doc_names = ["Document " + str(i) for i in range(len(corpus))]
    plot_radar_chart(doc_topic, doc, doc_names)
    
    
def print_test_results(doc, W_test_high, W_test_norms, tfidf_test, pre_nmf_list, pre_tfidf_vectorizer, word_topic_scores, word_topic_sp,
                       doc_names, pre_trained_doc, purity_score, word_count, only_doc_words):
    """Print the ranking, radar chart and word-score table of one test document.

    `pre_nmf_list` and `pre_trained_doc` are accepted but not used in the body.
    `W_test_norms` is indexed ``[value, doc, sub-topic]``; `word_topic_scores`
    and `word_topic_sp` are indexed ``[value, sub-topic, doc, word]``.
    """
    title = "Document " + str(doc) + ": " + doc_names[doc]
    print(color.BOLD + title + color.END)
    print()
    print()

    print(color.BOLD + "Topic Distribution: " + color.END)
    print_cumulative_test_doc_topics(W_test_norms, doc, 11)
    print()

    plot_radar_chart(W_test_high, doc, doc_names)
    print()

    # Slice everything down to the selected document before building the table.
    doc_value_scores = W_test_norms[:,doc,:]
    doc_word_scores = word_topic_scores[:,:,doc,:]
    doc_word_purity = word_topic_sp[:,:,doc,:]
    table = schwartz_word_scores(doc_value_scores, tfidf_test[doc], doc_word_scores, doc_word_purity,
                                 pre_tfidf_vectorizer, purity_score, word_count, only_doc_words)

    display(table)
    

## Helper Functions

In [3]:
def cumulate_W(W, n_topics):
    W_cumul = []
    for d in W:
        temp = []
        for i in range(W.shape[1]//n_topics):
            temp.append(d[i*n_topics:(i+1)*n_topics].sum())
        W_cumul.append(temp)

    W_cumul = np.asarray(W_cumul)
    
    return W_cumul

def normalize_W(W):
    """Rescale each row of `W` so that its entries sum to 100.

    :param W: 2-D array
    :return: new array holding every entry as a percentage of its row total
    """
    row_totals = W.sum(axis=1).reshape(W.shape[0], 1)
    return (W / row_totals) * 100

def prepare_export(W, docs, doc_names, filepath):
    """Build the export table: one row per document, one column per Schwartz value.

    :param W: per-document value scores, columns in `categories` order
    :param docs: document texts, stored in the ``Text`` column
    :param doc_names: document names, stored in the leading ``name`` column
    :param filepath: accepted for signature parity with the export functions; unused here
    :return: DataFrame with columns ``name``, the ten values in `schwartz` order, ``Text``
    """
    # Re-order every document's scores from `categories` order to `schwartz` order.
    rows = np.asarray([
        [W[doc][categories.index(value)] for value in schwartz]
        for doc in range(len(docs))
    ])

    table = pd.DataFrame(data=rows, index=range(len(rows)), columns=schwartz)
    table['Text'] = docs
    table["name"] = doc_names

    # Move the last column ("name") to the front.
    column_order = table.columns.tolist()
    column_order = column_order[-1:] + column_order[:-1]
    return table[column_order]
    
def export_to_excel(W, docs, doc_names, filepath):
    """Write the per-document value table to an Excel file and return it.

    Takes the cumulated W as input; `filepath` should end in ``.xlsx``.
    """
    table = prepare_export(W, docs, doc_names, filepath)
    table.to_excel(filepath)
    return table

def export_to_csv(W, docs, doc_names, filepath):
    """Write the per-document value table to a CSV file and return it.

    Takes the cumulated W as input; `filepath` should end in ``.csv``.
    """
    table = prepare_export(W, docs, doc_names, filepath)
    table.to_csv(filepath)
    return table

def export_word_scores_excel(W_test_norms, W_test_list, tfidf_test, doc_names, pre_trained_doc, filepath, purity_score=False, word_count=10, only_doc_words=False):
    """Write one Excel sheet of Schwartz word scores per document.

    :param W_test_norms: array indexed ``[value, doc, sub-topic]``
    :param W_test_list: raw per-value W matrices, passed to `calculate_word_topic_scores`
    :param tfidf_test: tf-idf matrix of the test documents (row i is document i)
    :param doc_names: document names; used (sanitised and truncated) in the sheet names
    :param pre_trained_doc: path of the pickled ``(nmf_list, tfidf_vectorizer)`` pair
    :param filepath: output workbook path
    :param purity_score: use purity-weighted scores instead of direct scores
    :param word_count: number of words listed per sub-topic
    :param only_doc_words: restrict the lists to words present in the document
    """
    # NOTE(security): pickle.load can execute arbitrary code; only load trusted model files.
    with open(pre_trained_doc, "rb") as model_file:
        pre_nmf_list, pre_tfidf_vectorizer = pickle.load(model_file)
    word_topic_scores, word_topic_sp = calculate_word_topic_scores(pre_nmf_list, W_test_list)

    # The context manager saves and closes the workbook; ExcelWriter.save() no
    # longer exists in pandas 2.x.
    with pd.ExcelWriter(filepath, engine = 'xlsxwriter') as writer:
        for i, dn in enumerate(doc_names):
            df = schwartz_word_scores(W_test_norms[:,i,:], tfidf_test[i], word_topic_scores[:,:,i,:], word_topic_sp[:,:,i,:], pre_tfidf_vectorizer, purity_score, word_count, only_doc_words)
            # Strip characters Excel rejects in sheet names: \ : / * ? [ ]
            dn = re.sub(r'[\\:/*?\[\]]', '', dn)
            df.to_excel(writer, sheet_name=str(i)+'-'+dn[:25])
    
def export_doc_tfidf_scores(test_corpusPP, doc_names, pre_trained_doc, filepath = 'tfidf_docs.xlsx', use_model = True):
    """Write one Excel sheet per document listing its terms with tf and tf-idf scores.

    Terms are sorted by descending term frequency; terms missing from the tf-idf
    vocabulary are skipped.

    :param test_corpusPP: preprocessed document texts
    :param doc_names: document names; used (sanitised and truncated) in the sheet names
    :param pre_trained_doc: path of the pickled ``(nmf_list, tfidf_vectorizer)`` pair
    :param filepath: output workbook path
    :param use_model: if true, reuse the pickled vectorizer's vocabulary and idf
        weights; otherwise refit that vectorizer on `test_corpusPP`
    """
    def vocabulary_of(vectorizer):
        # get_feature_names() was removed in scikit-learn 1.2; fall back to it
        # only when the replacement is unavailable.
        getter = getattr(vectorizer, "get_feature_names_out", None)
        if getter is None:
            getter = vectorizer.get_feature_names
        return getter()

    # NOTE(security): pickle.load can execute arbitrary code; only load trusted model files.
    with open(pre_trained_doc, "rb") as model_file:
        _, tfidf_vectorizer = pickle.load(model_file)
    tf_vectorizer= CountVectorizer(min_df=1, ngram_range=(1,3), max_features=50000)

    if use_model:
        tfidf = tfidf_vectorizer.transform(test_corpusPP)
    else:
        tfidf = tfidf_vectorizer.fit_transform(test_corpusPP)

    tf = tf_vectorizer.fit_transform(test_corpusPP)

    tf_feature_names = vocabulary_of(tf_vectorizer)
    # term -> tf-idf column, replacing a linear list search for every term.
    tfidf_column = {term: col for col, term in enumerate(vocabulary_of(tfidf_vectorizer))}

    # The context manager saves and closes the workbook; ExcelWriter.save() no
    # longer exists in pandas 2.x.
    with pd.ExcelWriter(filepath, engine = 'xlsxwriter') as writer:
        for i, dn in enumerate(doc_names):
            word_list = []
            tfidf_doc = tfidf[i].toarray()[0]
            tf_doc = tf[i].toarray()[0]

            # Highest term frequency first; stop at the first term absent from the document.
            for idx in tf_doc.argsort()[::-1]:
                if tf_doc[idx] <= 0:
                    break
                term = str(tf_feature_names[idx])
                idy = tfidf_column.get(term)
                if idy is None:
                    continue
                word_list.append((term, np.round(tf_doc[idx], 2), np.round(tfidf_doc[idy], 2), len(term.split())))

            df = pd.DataFrame(word_list, columns=["word", "tf", "tf-idf", "ngram"])

            # Strip characters Excel rejects in sheet names: \ : / * ? [ ]
            dn = re.sub(r'[\\:/*?\[\]]', '', dn)
            df.to_excel(writer, sheet_name=str(i)+'-'+dn[:25], index=False)


In [4]:
def getLinksHTMLaref(page):
    """Extract the target of the first ``a href=`` attribute in `page`.

    :param page: markup of a web page, as a string
    :return: ``(url, end)`` where `url` is the text between the first pair of
        double quotes after ``a href=`` and `end` is the index of the closing
        quote; ``(None, 0)`` when `page` contains no ``a href=``
    """
    anchor_at = page.find("a href=")
    if anchor_at == -1:
        return None, 0

    opening = page.find('"', anchor_at)
    closing = page.find('"', opening + 1)
    return page[opening + 1: closing], closing

def getLinksHTML(page):
    """Extract the first double-quoted ``htt...`` value that follows an ``href=`` in `page`.

    :param page: markup of a web page, as a string
    :return: ``(url, end)`` where `url` is the text after the first ``"htt``
        opening quote found at or after the first ``href=`` and `end` is the
        index of the closing quote; ``(None, 0)`` when `page` contains no ``href=``
    """
    href_at = page.find("href=")
    if href_at == -1:
        return None, 0

    opening = page.find('"htt', href_at)
    closing = page.find('"', opening + 1)
    return page[opening + 1: closing], closing

def getLinksXML(page):
    """Extract the URL that follows the first ``<link/>`` marker in `page`.

    :param page: markup of a feed/page, as a string
    :return: ``(url, end)`` where `url` runs from the first ``http`` at or after
        ``<link/>`` up to the next ``<`` and `end` is the index of that ``<``;
        ``(None, 0)`` when `page` contains no ``<link/>``
    """
    marker_at = page.find("<link/>")
    if marker_at == -1:
        return None, 0

    url_start = page.find('http', marker_at)
    url_end = page.find('<', url_start)
    return page[url_start:url_end], url_end


def extractFromURL(surl):
    """Download `surl` and return the distinct links found in it, in order of appearance.

    For URLs ending in ``xml`` only ``<link/>`` entries are collected. Otherwise
    the absolute ``href`` values are collected first, followed by every
    ``a href`` target resolved against `surl`.

    :param surl: URL of the page or feed to scan
    :return: list of unique URL strings
    """
    from urllib.parse import urljoin

    response = requests.get(surl)
    # Work on the serialised parse tree; it is scanned twice, so build it once.
    markup = str(BeautifulSoup(response.content,"lxml"))
    is_XML = surl.endswith('xml')

    url_list = []
    seen = set()

    def remember(link):
        # The previous check intersected the set of collected URLs with the set
        # of *characters* of the new URL, so duplicates were never filtered out.
        if link not in seen:
            seen.add(link)
            url_list.append(link)

    next_link = getLinksXML if is_XML else getLinksHTML
    page = markup
    while True:
        url, n = next_link(page)
        page = page[n:]
        if not url:
            break
        remember(url)

    if not is_XML:
        page = markup
        while True:
            url, n = getLinksHTMLaref(page)
            page = page[n:]
            if not url:
                break
            # urljoin handles absolute, root-relative and relative hrefs; plain
            # `base + url` mangled the first and last kind and cut the final
            # character off a `surl` without a path.
            remember(urljoin(surl, url))

    return url_list

## Main Functions

In [5]:
def preprocess_corpus(corpus):
    """Replace every document by its stemmed terms, each repeated by its term frequency.

    :param corpus: iterable of document strings
    :return: list of preprocessed strings; empty documents stay empty
    """
    PPcorpus = []
    for doc in corpus:
        if doc == '':
            PPcorpus.append('')
            continue
        # Run the extraction once per document; it used to be called twice
        # (once for the 'Stem' column and once for 'TF').
        terms = extract_terms(doc, extra_process = ['stem'])
        # "<stem> " * tf repeats each stem as many times as it occurred.
        PPcorpus.append(' '.join(list((terms['Stem']+' ')*terms['TF'])))
    return PPcorpus
    
def evaluate_docs(docs, nmf, tfidf_test, betaloss = 'kullback-leibler'):
    """Project documents onto an already fitted NMF model.

    :param docs: unused; kept for backward compatibility
    :param nmf: fitted model exposing ``transform``
    :param tfidf_test: document-term matrix to project
    :param betaloss: unused; kept for backward compatibility
    :return: whatever ``nmf.transform(tfidf_test)`` returns (the document-topic matrix W)
    """
    # The model is not refitted here: only the document weights are computed.
    return nmf.transform(tfidf_test)

def evaluate_test_corpus(pretrained_filepath, test_corpus, word_replacements):
    """Score a corpus against every pretrained per-value NMF model.

    :param pretrained_filepath: path of the pickled ``(nmf_list, tfidf_vectorizer)`` pair
    :param test_corpus: preprocessed document texts to evaluate. The function now
        uses this argument; it previously ignored it and read the global
        ``test_corpusPP``
    :param word_replacements: ``[]`` for no synonym replacement, otherwise indexed
        as ``word_replacements[doc][value]`` and handed to ``syn.replace_synoyms``
    :return: tuple ``(W_test_high, W_test_norms, W_test_list, tfidf_test)``:
        the per-document maximum over sub-topics (docs x values), the normalised
        scores indexed ``[value, doc, sub-topic]``, the raw W matrices as an
        array, and the last tf-idf matrix that was computed
    """
    # NOTE(security): pickle.load can execute arbitrary code; only load trusted model files.
    with open(pretrained_filepath, "rb") as model_file:
        nmf_list, tfidf_vectorizer = pickle.load(model_file)

    no_replacements = word_replacements == []
    # Without replacements the tf-idf matrix is the same for every value model.
    tfidf_test = tfidf_vectorizer.transform(test_corpus) if no_replacements else None

    W_test_list = []
    for i, nmf in enumerate(nmf_list):
        print("Fitting NMF for " + str(categories[i]))
        if not no_replacements:
            docs = [syn.replace_synoyms(text, word_replacements[idx][i]) for idx, text in enumerate(test_corpus)]
            tfidf_test = tfidf_vectorizer.transform(docs)

        W_test_list.append(evaluate_docs([], nmf, tfidf_test, betaloss = 'kullback-leibler'))

    # Express each sub-topic weight as a percentage of (sub-topic + last column);
    # the last column of every W is the one left out of the result.
    W_test_norms = []
    with np.errstate(divide='ignore', invalid='ignore'):
        for W_test in W_test_list:
            W_test = np.asarray(W_test)
            topic_weights = W_test[:, :-1]
            last_weight = W_test[:, -1:]
            W_test_norms.append(100*topic_weights/(topic_weights+last_weight))

    # 0/0 (both weights zero) yields NaN; report it as 0.
    W_test_norms = np.nan_to_num(np.asarray(W_test_norms))

    W_test_high = W_test_norms.max(axis=2).T

    # cumulated-normalized and raw
    return W_test_high, W_test_norms, np.asarray(W_test_list), tfidf_test

In [11]:
def print_training_topics(pretrained_filepath):
    """Print the top 8 words of the first 3 topics of every pretrained value model.

    :param pretrained_filepath: path of the pickled ``(nmf_list, tfidf_vectorizer)`` pair
    """
    # NOTE(security): pickle.load can execute arbitrary code; only load trusted model files.
    with open(pretrained_filepath, "rb") as model_file:
        nmf_list, tfidf_vectorizer = pickle.load(model_file)
    print("\nTopics in NMF model:")
    # Iterate over the models actually stored instead of assuming exactly ten.
    for i in range(len(nmf_list)):
        print_top_words(nmf_list, i, tfidf_vectorizer, n_top_words=8, n_topics=3)

def add_corpus_txt(filepath, test_corpus):
    """Append the content of a text file to `test_corpus`.

    Exactly one entry is appended per call: the file's text, or ``""`` (with a
    printed notice) when the file cannot be opened or read.

    :param filepath: path of the text file
    :param test_corpus: list that receives the text (modified in place)
    """
    try:
        # `with` closes the file even when read() fails.
        with open(filepath, "r") as f:
            txt = f.read()
    except (OSError, UnicodeDecodeError):
        # Only I/O and decoding problems are treated as "missing"; programming
        # errors are no longer hidden behind a bare `except`.
        test_corpus.append("")
        print("File not found - " + filepath)
    else:
        test_corpus.append(txt)


def add_corpus_url(url, api_key, test_corpus):
    """Fetch the text of a web page through the InSight scraper API and append it.

    Exactly one entry is appended to `test_corpus` per call: the scraped text,
    or ``""`` (with a printed notice) when the service reports a problem, the
    request fails or the response is not JSON.

    :param url: address of the page to scrape
    :param api_key: InSight API key
    :param test_corpus: list that receives the text (modified in place)
    """
    insightIP = 'http://178.62.229.16'
    insightPort = '8484'
    insightVersion = 'v1.0'

    insightSetting = insightIP + ':' + insightPort + '/api/' + insightVersion
    endpoint = insightSetting + '/text_analytics/url_scraper'

    try:
        # `params` URL-encodes the values; concatenating the raw url broke the
        # query string whenever the url itself contained '&', '?' or '#'.
        res = requests.get(endpoint, params={'url': url, 'api_key': api_key})
        # Parse the body once instead of once per branch.
        payload = res.json()
    except (requests.RequestException, ValueError) as err:
        # Keep test_corpus aligned with the caller's list of document names.
        test_corpus.append("")
        print("Request failed (" + str(err) + ") - " + url)
        return

    if "Unauthorized Connection" in payload:
        test_corpus.append("")
        print(payload["Unauthorized Connection"] + " - " + url)
    elif "Error" in payload:
        test_corpus.append("")
        print(payload["Error"] + " - " + url)
    elif "text" in payload:
        test_corpus.append(payload['text'])
        if payload['text'] == "":
            print("Empty text - " + url)
    else:
        test_corpus.append("")
        print("Empty text - " + url)

def print_interactive_test_results(W_test_high, W_test_norms, W_test_list, tfidf_test, doc_names, pre_trained_doc, purity_score, word_count, only_doc_words):
    """Show `print_test_results` behind an ipywidgets slider that selects the document.

    Every argument except the document index is pinned with ``fixed``; the
    slider ranges over ``0 .. len(W_test_high) - 1``.

    :param pre_trained_doc: path of the pickled ``(nmf_list, tfidf_vectorizer)`` pair
    """
    # NOTE(security): pickle.load can execute arbitrary code; only load trusted model files.
    with open(pre_trained_doc, "rb") as model_file:
        pre_nmf_list, pre_tfidf_vectorizer = pickle.load(model_file)
    word_topic_scores, word_topic_sp = calculate_word_topic_scores(pre_nmf_list, W_test_list)

    interact(print_test_results,
             doc = (0, len(W_test_high)-1, 1),
             W_test_high=fixed(W_test_high),
             W_test_norms=fixed(W_test_norms),
             tfidf_test=fixed(tfidf_test),
             pre_nmf_list=fixed(pre_nmf_list),
             pre_tfidf_vectorizer=fixed(pre_tfidf_vectorizer),
             word_topic_scores=fixed(word_topic_scores),
             word_topic_sp=fixed(word_topic_sp),
             doc_names=fixed(doc_names),
             pre_trained_doc=fixed(pre_trained_doc),
             purity_score=fixed(purity_score),
             word_count=fixed(word_count),
             only_doc_words=fixed(only_doc_words))

### General Model

Nonnegative Matrix Factorization (NMF) was first proposed by Lee and Seung [1]. NMF decomposes a given nonnegative matrix *X* into two factors *W* and *H* that also contain only nonnegative values, such that the product of the two factors is approximately equal to the decomposed matrix. Given a $V \times T$ nonnegative matrix $X = \left \{ x_{\nu, \tau} \right \}$ where $\nu = 1:V \text{ and } \tau = 1:T$, we seek nonnegative matrices *W* and *H*, with inner index $i = 1:I$, such that

\begin{align*}
x_{\nu, \tau} \approx \left [ WH \right ]_{\nu, \tau} = \sum_{i} w_{\nu,i}h_{i,\tau}
\end{align*}

In this paper, we will refer to the $V\times I$ matrix W as the *template matrix*, and $I\times T$ matrix *H* the *excitation matrix*.


$X = WH$

$X$: documents X vocabulary. tf-idf is used for vocabulary.

$W$: documents X topics. Calculate a separate W for each Schwartz Value using the corresponding H.

$H$: topics X vocabulary. Calculate a separate H for each Schwartz Value in the training process.


### Calculating Schwartz Word Scores

* We have a fixed (learned) H matrix for each Schwartz Value that holds word-topic distribution.
* We have W matrix for each document's Schwartz Values that holds topic-document distribution.
* H matrix gave us an idea about the important words for each Schwartz Value (by providing some kind of weights for each word), but actually the weights of those words can be different for each document.
* We propose two different methods to calculate those document-specific weighted word scores.
 * The summary of the approach is as follows: If a word (other than a stopword) appears frequently in a document, it can be considered an important word for this document. If this word occurs only in a specific document, then it is even more important. This is basically tf-idf, which is the essential feature of this model. Moreover, if this word's tf-idf score is obtained more from a specific topic than from background information, then we can accept it as an important indicator of this document and topic.
 * General equation: $X = WH$. Rather than directly using X or H, we factor W into the calculation.
 * Direct Schwartz: Multiply W and H only through the specific Schwartz Value topics, excluding the background.
 * Purity Schwartz: Find the Schwartz Value purity of each word by taking the proportion of the Direct Schwartz Score of this word to the sum of its Direct Schwartz Score and Direct Background Score (computed from the background topics only) for each Schwartz Value. Then multiply this purity score by the Direct Schwartz Score to obtain the Purity Schwartz Score.

**Schwartz Value WH carpimi:**

Her Schwartz Value icin hangi kelimelerin daha onemli oldugunu anlamak icin H matrisini inceleyebiliriz. Her H matrisi bir Schwartz Value ve background corpus icin birden cok sub-topic seviyesinde kelime dagilimlarini barindirmakta. Yani 3 sub-topic seviyesinde Universalism ornegi dusunursek, modelimiz tek bir cesit universalism degil de 3 farkli universalism cesidi ogrenmeye calisiyor. Bu da bize her universalism cesidi icin farkli kelime onemleri sunuyor. Fakat universalism'le alakali en onemli kelimeler ne dendigi zaman sub-topic lerden bahsetmek yerine tek bir cati altinda toplamak genel resmi anlamayi cok daha kolaylastirmakta.

Fakat burda sadece H matrisi uzerinden bir toplam yaptigimiz zaman dokumanlarin hangi Universalism sub-topic iyle alakali oldugu bilgisini atmis olmaktayiz. Bu sebeple her dokumanin neden belirli bir Schwartz Value'ya yoneldigini gosteren kelimeleri highlight etmek icin dokumanlarin sub-topic seviyesinde yoneldikleri Schwartz Value degerleri (W) ile kelimelerin sub-topic seviyesinde gruplandigi Schwartz Value (H) degerlerini carpip topluyoruz. Sonuc olarak bir dokuman icin onu siniflandirmamizda en cok etkileyen kelimeleri Schwartz Value lar arasinda da karsilastirma yapabildigimiz bir skorlama vermis oluyor.

**Schwartz Value Purity**

Yukarida bahsedilen yontem butun Schwartz Value lar ve kelimeler arasinda goreceli bir karsilastirma yontemi saglamakta. Fakat kelimeleri modellemekte kullandigimiz tf-idf ten gelen bir kelimenin bir dokumanda cokca gectigi icin onemi (skorunun) daha fazla gozukmekte. Bir yandan bunun etkisini azaltan ve ayni zamanda Schwartz Value purity konseptini uygulayan bir eklenti yapiyoruz. Kelimelerin her dokuman ve her Schwartz Value icin ne kadar saf oldugunu olcuyoruz. Ve bunu da buldugumuz skorla carpiyoruz. Boylece bu kelime sadece istedigimiz Schwartz Value da geciyorsa skoru goreceli olarak artmis oluyor. Eger bu kelime cogunlukla istedigimiz Schwartz Value da degil de background corpus ta geciyorsa goreceli olarak skoru azalmis oluyor. Bu yontem ile aslinda istedigimiz Schwartz Value ile cok ilgili olmasa da sadece belirli dokumanlarda diger dokumanlara gore daha fazla gectigi icin skoru yuksek olan kelimelerin etkisini azaltmis oluyor.


### Schwartz Value Word Scores

Understanding the behavior of the model is important in order to make deductions from it. Our model uses words to match the Schwartz Values with documents. Training the model produces the $H$ matrix, which holds the word-topic distributions for each Schwartz Value. If we had used a classic, simpler NMF model, then, to find the importance order of the words for each Schwartz Value, we could directly take the marginal of the $H$ matrix for each topic. However, our model offers much more information through the sub-topics of each Schwartz Value and its semi-supervised nature.

#### Direct Word Scores

The direct word score exploits the sub-topic structure of the model to come up with different word importance scores and orders for each document. The $H$ matrix includes different word distributions for each sub-topic of both a Schwartz Value and the Background Corpus. In other words, if there are three sub-topics for the *Power* Schwartz Value in the $H$ matrix, then our model learns three different concepts for the Power Schwartz Value, each of which provides different word scores. However, it is more logical to present a single set of word scores for a Schwartz Value rather than three different word score sets obtained from the sub-topics.

We can sum up values under sub-topics of H matrix to come up with a single word distribution with the cost of losing valuable sub-topic information. Thus, rather than finding a unified word-topic distribution for all documents, we calculate separate word scores for each document to highlight the important words that lead a document to be soft-classified as a specific Schwartz Value by dot product of documents' sub-topic level Schwartz Value scores ($W$) and words sub-topic level Schwartz Value scores ($H$). As a result, we obtain scores for all words under each Schwartz Value for each document that can be comparable with each other.

\begin{align*}
DWS = \sum_{i = 1}^{I/2} w_{\nu,i}h_{i,\tau}
\end{align*}

#### Purity Word Scores

\begin{align*}
DWS &= \sum_{i = 1}^{I/2} w_{\nu,i}h_{i,\tau}\\
BWS &= \sum_{i = I/2 + 1}^{I} w_{\nu,i}h_{i,\tau}\\
Purity &= \frac{DWS}{DWS+BWS} \\
PWS &= DWS \times Purity
\end{align*}




In [7]:
# scores are multiplied by 100000
def calculate_word_topic_scores(pre_nmf_list, W_test_list):
    """Compute direct and purity-weighted word scores for every value, sub-topic and document.

    Every row of each model's H (``components_``) is first normalised to sum
    to 1. All rows except the last are treated as Schwartz sub-topics; the
    remaining row(s) form the background part.

    :param pre_nmf_list: fitted models, one per value, each exposing ``components_``
    :param W_test_list: per-value document-topic matrices, ``W_test_list[i]``
        matching ``pre_nmf_list[i]``
    :return: ``(word_topic_scores, word_topic_sp)``, both indexed
        ``[value, sub-topic, doc, word]`` and multiplied by 100000;
        `word_topic_sp` is the direct score times its purity
        ``direct / (direct + background)``
    """
    n_topics = pre_nmf_list[0].components_.shape[0]-1

    # Row-normalised H per value model.
    H_list = []
    for pnmf in pre_nmf_list:
        aa = pnmf.components_
        H_list.append(aa/np.sum(aa,axis=1)[:, np.newaxis])

    # [value, sub-topic, doc, word]
    word_topic_scores = []
    word_background_scores = []

    # One entry per stored model instead of a hard-coded ten.
    for i, H in enumerate(H_list):
        W = W_test_list[i]
        # The background contribution does not depend on the sub-topic, so it
        # is computed once per value rather than once per sub-topic.
        background = np.dot(W[:,n_topics:], H[n_topics:,:])
        temp_ts = []
        for nt in range(n_topics):
            # Outer product: document weights of sub-topic nt x word weights of sub-topic nt.
            temp_ts.append(np.dot(W[:,nt][:, np.newaxis], H[nt,:][np.newaxis, :]))
        word_topic_scores.append(temp_ts)
        word_background_scores.append([background] * n_topics)

    word_topic_scores = np.asarray(word_topic_scores)
    word_background_scores = np.asarray(word_background_scores)

    # 0/0 (word absent from both parts) yields NaN, which is reported as 0.
    with np.errstate(divide='ignore', invalid='ignore'):
        word_topic_purity = np.nan_to_num(np.divide(word_topic_scores,word_topic_scores+word_background_scores))
    word_topic_sp = word_topic_scores*word_topic_purity

    word_topic_scores *= 100000
    word_topic_sp *= 100000

    return word_topic_scores, word_topic_sp

def find_top_word_scores(pre_tfidf_vectorizer, word_topic, word_count, tfidf_test_doc, only_doc_words):
    """List the highest-scoring words of every sub-topic of every value for one document.

    :param pre_tfidf_vectorizer: fitted vectorizer providing the vocabulary
    :param word_topic: array indexed ``[value, sub-topic, word]``
    :param word_count: maximum number of words kept per sub-topic (a negative
        value keeps every word)
    :param tfidf_test_doc: 2-D single-row tf-idf matrix of the document, read as
        ``tfidf_test_doc[0, word]``
    :param only_doc_words: if true, skip words whose tf-idf weight in the document is 0
    :return: nested list ``[value][sub-topic]`` of ``(word, score rounded to 3 decimals)``
        tuples, best score first
    """
    # get_feature_names() was removed in scikit-learn 1.2; fall back to it only
    # when the replacement is unavailable.
    name_getter = getattr(pre_tfidf_vectorizer, "get_feature_names_out", None)
    if name_getter is None:
        name_getter = pre_tfidf_vectorizer.get_feature_names
    feature_names = name_getter()

    word_list = []
    # Iterate over the values actually present instead of assuming exactly ten.
    for theme_scores in word_topic:
        tmp_word_list = []
        for topic_scores in theme_scores:
            tmp_list = []
            # argsort is ascending; reverse it to visit the best words first.
            for idx in topic_scores.argsort()[::-1]:
                if len(tmp_list) == word_count:
                    break
                if only_doc_words and tfidf_test_doc[0,idx] == 0:
                    # Skipped words do not count towards word_count.
                    continue
                tmp_list.append((str(feature_names[idx]), np.round(topic_scores[idx], 3)))
            tmp_word_list.append(tmp_list)
        word_list.append(tmp_word_list)
    return word_list

def schwartz_word_scores(W_test_doc, tfidf_test_doc, word_topic_scores, word_topic_sp, pre_tfidf_vectorizer, purity_score, word_count, only_doc_words):
    """Build the word-score table of one document, with the values in `schwartz` order.

    For every (value, sub-topic) pair the table gets a ``word`` and a ``score``
    column. The first row holds the document's percentage for that sub-topic;
    the following rows hold its top words.

    :param W_test_doc: the document's scores, indexed ``[value, sub-topic]``
    :param tfidf_test_doc: single-row tf-idf matrix of the document
    :param word_topic_scores: direct word scores, indexed ``[value, sub-topic, word]``
    :param word_topic_sp: purity-weighted word scores, same layout
    :param pre_tfidf_vectorizer: fitted vectorizer providing the vocabulary
    :param purity_score: use `word_topic_sp` instead of `word_topic_scores`
    :param word_count: number of words listed per sub-topic
    :param only_doc_words: restrict the lists to words present in the document
    :return: DataFrame with all (value, sub-topic) column pairs side by side
    """
    chosen_scores = word_topic_sp if purity_score else word_topic_scores
    top_scores = find_top_word_scores(pre_tfidf_vectorizer, chosen_scores, word_count, tfidf_test_doc, only_doc_words)

    # Re-order from `categories` (model) order to `schwartz` (display) order.
    schwartz_word_score = []
    schwartz_W_test = []
    for sch in schwartz:
        schwartz_word_score.append(top_scores[categories.index(sch)])
        schwartz_W_test.append((sch.upper(), np.round(W_test_doc[categories.index(sch)], 2)))

    df_list = []
    for i, sws in enumerate(schwartz_word_score):
        for j, s in enumerate(sws):
            header_row = ("% " + schwartz_W_test[i][0]+" (" + str(j) + ")", schwartz_W_test[i][1][j])
            df_list.append(pd.DataFrame([header_row]+s,
                                        columns=[schwartz[i]+ " (" + str(j)+ ") - word",
                                                 schwartz[i]+ " (" + str(j)+ ") - score"]))

    # Concatenate once after the loops; this used to run inside the outer loop,
    # rebuilding the table on every iteration.
    return pd.concat(df_list, axis=1)

## Print Pretrained Model's Topics

**nmf2_pretrained.p** or **nmf2_pretrained_pruned.p** includes pretrained NMF model generated using **Semi-Supervised-NMF-train-v2.ipynb** notebook. It has the nmf model and tfidf_vectorizer.

For the details of the pruned version, see also **"OMTermz HZ.ipynb"**.

In [8]:
def get_pretrained_words(pre_trained_doc, word_count, normalized=False, anti=0):
    """Return a table of the top words of every sub-topic of every pretrained value model.

    :param pre_trained_doc: path of the pickled ``(nmf_list, tfidf_vectorizer)`` pair
    :param word_count: number of words per sub-topic; a negative value lists the
        whole vocabulary
    :param normalized: if true, each topic row of H is divided by its sum and
        multiplied by 1000 before ranking
    :param anti: unused; kept for backward compatibility
    :return: DataFrame with a ``word`` and a ``score`` column per (value, sub-topic),
        values in `schwartz` order, scores rounded to 2 decimals
    """
    # NOTE(security): pickle.load can execute arbitrary code; only load trusted model files.
    with open(pre_trained_doc, "rb") as model_file:
        pre_nmf_list, pre_tfidf_vectorizer = pickle.load(model_file)

    # All rows of H except the last are listed as sub-topics.
    n_topics = pre_nmf_list[0].components_.shape[0]-1

    # get_feature_names() was removed in scikit-learn 1.2; fall back to it only
    # when the replacement is unavailable.
    name_getter = getattr(pre_tfidf_vectorizer, "get_feature_names_out", None)
    if name_getter is None:
        name_getter = pre_tfidf_vectorizer.get_feature_names
    feature_names = name_getter()

    # word_list[n_topics * value + sub_topic] -> [(word, score), ...]
    word_list = []
    for pnmf in pre_nmf_list:
        comps = pnmf.components_
        if normalized:
            comps = 1000*comps/np.sum(comps,axis=1)[:, np.newaxis]
        for nt in range(n_topics):
            word_topic = comps[nt]
            ranked = word_topic.argsort()[::-1]
            if word_count >= 0:
                ranked = ranked[:word_count]
            word_list.append([(str(feature_names[idx]), np.round(word_topic[idx], 2)) for idx in ranked])

    # Re-order from `categories` (model) order to `schwartz` (display) order.
    schwartz_word_score = []
    for sch in schwartz:
        for nt in range(n_topics):
            schwartz_word_score.append(word_list[n_topics*categories.index(sch)+nt])

    df_list = []
    for i, a in enumerate(schwartz_word_score):
        df_list.append(pd.DataFrame(a, columns=[schwartz[i//n_topics]+ " (" + str(i%n_topics)+ ") - word",
                                                schwartz[i//n_topics]+ " (" + str(i%n_topics)+ ") - score"]))
    score_df = pd.concat(df_list, axis=1)

    return score_df

def export_pretrained_excel(pre_trained_doc, filepath, word_count=-1, anti=0):
    """Write the pretrained word-score table to an Excel file.

    :param pre_trained_doc: path of the pickled pretrained model
    :param filepath: output workbook path
    :param word_count: words per sub-topic; the default -1 exports the whole vocabulary
    :param anti: forwarded to `get_pretrained_words`
    """
    # Pass `anti` by keyword: positionally it landed in the `normalized` slot.
    df = get_pretrained_words(pre_trained_doc, word_count, anti=anti)
    df.to_excel(filepath)

In [12]:
# Path of the pickled (nmf_list, tfidf_vectorizer) pair loaded by the functions above.
pre_trained_doc = "pretrained_v3_t3_h10_1409.p"
print_training_topics(pre_trained_doc)


Topics in NMF model:
[96m[1muniversalism[0m
[1mTopic #0: [0menvironmental movement state marriage social party samesex green
[1mTopic #1: [0mright environmental law social human peace war state
[1mTopic #2: [0menergy ecology peace use human think system one

[96m[1mhedonism[0m
[1mTopic #0: [0mpain love orgasm one empathy people may happiness
[1mTopic #1: [0mone happiness pleasure social desire life also may
[1mTopic #2: [0mmay one experience also emotion shame person pleasure

[96m[1machievement[0m
[1mTopic #0: [0msocial class capital society labour inequality work system
[1mTopic #1: [0mwork hour individual social goal high management time
[1mTopic #2: [0mcapital status social human need individual people high

[96m[1mpower[0m
[1mTopic #0: [0mpower use experiment milgram make control process machine
[1mTopic #1: [0mtime state wealth power collapse class may also
[1mTopic #2: [0mauthority power veto bill social state individual may

[96m[1mself-di

If normalized is True, then word-topic matrix is normalized on topics and values are multiplied by 1000 to make them readable.

In [10]:
# Top 10 words per sub-topic, ranked on the row-normalised (x1000) topic weights.
get_pretrained_words(pre_trained_doc, normalized=True, word_count=10)

Unnamed: 0,universalism (0) - word,universalism (0) - score,universalism (1) - word,universalism (1) - score,universalism (2) - word,universalism (2) - score,benevolence (0) - word,benevolence (0) - score,benevolence (1) - word,benevolence (1) - score,...,stimulation (1) - word,stimulation (1) - score,stimulation (2) - word,stimulation (2) - score,self-direction (0) - word,self-direction (0) - score,self-direction (1) - word,self-direction (1) - score,self-direction (2) - word,self-direction (2) - score
0,environmental,6.07,right,5.45,energy,5.33,law,8.46,good,6.02,...,tourism,35.28,sport,26.21,creativity,19.15,innovation,8.64,yes,11.11
1,movement,4.95,environmental,4.9,ecology,4.84,truth,7.45,evil,5.85,...,travel,13.86,travel,7.28,play,8.73,idea,6.75,independence,7.19
2,state,4.77,law,4.03,peace,4.7,ethic,6.61,one,5.41,...,million,8.09,adventure,6.45,creative,8.35,unite,6.06,invention,5.59
3,marriage,4.28,social,3.85,use,4.08,forgiveness,6.51,justice,4.89,...,tourist,7.75,exploration,6.39,intelligence,4.58,intelligence,5.43,bully,5.02
4,social,4.27,human,3.68,human,3.98,theory,6.18,pardon,4.89,...,international,7.63,use,5.54,new,4.15,territory,5.36,positive,4.75
5,party,4.11,peace,3.65,think,3.64,good,5.79,lie,4.74,...,country,7.42,include,5.29,process,4.0,state,4.92,task,4.7
6,samesex,4.05,war,3.45,system,3.62,one,5.47,trust,4.43,...,billion,6.32,game,5.09,theory,3.96,new,4.75,individual,4.54
7,green,3.74,state,3.44,one,3.42,natural,4.96,individual,4.27,...,world,6.11,may,5.0,work,3.87,group,4.72,emotion,4.38
8,woman,3.27,use,3.23,social,3.39,may,3.91,moral,4.07,...,destination,5.29,also,4.86,also,3.58,curiosity,4.35,yes yes,4.33
9,right,3.26,specie,3.22,theory,3.2,natural law,3.67,social,3.99,...,unite,5.21,explorer,4.81,state,3.53,music,4.11,performance,4.22


Exports all word-score pairs in vocabulary (50000 words)

In [None]:
#export_pretrained_excel(pre_trained_doc, filepath='pretrained_words.xlsx')

## Evaluating Different Documents

Adding two example documents to the test_corpus.

In [12]:
# Documents to evaluate; each entry is either a local text file or an http(s) URL.
#   pope.txt - Pope Francis TED talk, https://www.ted.com/speakers/pope_francis
#   dod.txt  - US Department of Defense, https://www.defense.gov/About/
# The last two entries ("https://sadasd" and "asdasd") cannot be loaded;
# presumably they are included to exercise the missing-URL and missing-file paths.
doc_names = ["pope.txt", "dod.txt", "https://www.nationalgeographic.com/science/space/solar-system/earth/", "https://sadasd", "asdasd"]
# Alternative: evaluate only the two local files.
#doc_names = ["pope.txt", "dod.txt"]

In [13]:
def add_corpus_docs(doc_names, test_corpus, insigth_api_key):
    """Add every document in ``doc_names`` to ``test_corpus``.

    A name that starts with ``http://`` or ``https://`` is handed to
    ``add_corpus_url`` together with the API key; any other name is handed
    to ``add_corpus_txt``.

    Args:
        doc_names: Iterable of document names (file names or URLs).
        test_corpus: Corpus container passed through to the loaders.
        insigth_api_key: API key forwarded to ``add_corpus_url``.

    Returns:
        None.
    """
    for name in doc_names:
        is_url = re.match("^(http|https)://", name) is not None
        if is_url:
            add_corpus_url(name, insigth_api_key, test_corpus)
            continue
        add_corpus_txt(name, test_corpus)

Crawling a website using InSight API and adding its text to test_corpus.

Always check the text added to the corpus via add_corpus_url, because websites can contain unexpected embedded text.

In [14]:
# Build the raw test corpus. add_corpus_docs receives the list and its return
# value is not used, so the loaders are expected to fill test_corpus in place.
test_corpus = []
insigth_api_key = "" #needs to be filled
add_corpus_docs(doc_names, test_corpus, insigth_api_key)

Content Not Found - https://sadasd
File not found - asdasd


In [15]:
# Preprocess every raw document with tp.clean_text, keeping the corpus order
test_corpusPP = list(map(tp.clean_text, test_corpus))

Fix bad wording:  0.005984067916870117 s
Tokenize:  0.006982564926147461 s
Remove stopwords and Lemmatize:  2.978036880493164 s

Fix bad wording:  0.0019948482513427734 s
Tokenize:  0.003989696502685547 s
Remove stopwords and Lemmatize:  0.009973764419555664 s

Fix bad wording:  0.0019948482513427734 s
Tokenize:  0.0029921531677246094 s
Remove stopwords and Lemmatize:  0.0069811344146728516 s

Fix bad wording:  0.0 s
Tokenize:  0.0 s
Remove stopwords and Lemmatize:  0.0 s

Fix bad wording:  0.0 s
Tokenize:  0.0 s
Remove stopwords and Lemmatize:  0.0 s



### Synonym Things

In [21]:
# Disabled synonym-expansion step, kept for reference.
# NOTE(review): get_pretrained_words is called below with word_count as the
# second positional argument, while the active call in this notebook passes
# normalized= and word_count= by keyword - confirm the parameter order before
# re-enabling this cell.
# word_count = 150
# synonym_count = 10
# similarity_span = 10

# df_words = get_pretrained_words(pre_trained_doc, word_count)
# cols = df_words.columns
# for c_id in range(1, len(cols), 2):
#     df_words.drop(columns=[cols[c_id]], inplace=True)
# synonyms_df = syn.important_train_synonyms(df_words, word_count, synonym_count, similarity_span)

In [22]:
# Disabled: unpickles the (NMF list, TF-IDF vectorizer) pair and looks up
# synonyms for the words of the cleaned test corpus.
# NOTE(review): before re-enabling, open the pickle inside a `with` block so the
# file handle is closed. Assuming pre_tfidf_vectorizer is a scikit-learn
# vectorizer, get_feature_names() is no longer available in scikit-learn >= 1.2;
# use get_feature_names_out() there.
# pre_nmf_list, pre_tfidf_vectorizer = pickle.load( open( pre_trained_doc, "rb" ) )
# trained_vocabulary = pre_tfidf_vectorizer.get_feature_names()
# word_replacements = syn.find_doc_word_synonyms(test_corpusPP, synonyms_df, trained_vocabulary)

Evaluate model for the test_corpus.

In [16]:
# Evaluate the cleaned test corpus against the pre-trained model; the four
# results are reused by the display and export cells.
# NOTE(review): the third argument is an empty list - presumably the
# word_replacements produced by the disabled synonym step; confirm.
W_test_high, W_test_norms, W_test_list, tfidf_test = evaluate_test_corpus(pre_trained_doc, test_corpusPP, [])

Fitting NMF for universalism
Fitting NMF for hedonism
Fitting NMF for achievement
Fitting NMF for power
Fitting NMF for self-direction
Fitting NMF for benevolence
Fitting NMF for conformity
Fitting NMF for tradition
Fitting NMF for stimulation
Fitting NMF for security


Results for test_corpus

(Word scores are calculated after the word-topic matrix is normalized on topics)

(Word scores are multiplied by 100000 to make values more readable)

When the "only_doc_words" parameter is set to True, the table will only show words from the documents.

In [17]:
# Show the per-document results for the test corpus, restricted to words that
# occur in the documents and limited to 10 words.
print_interactive_test_results(
    W_test_high,
    W_test_norms,
    W_test_list,
    tfidf_test,
    doc_names,
    pre_trained_doc,
    purity_score=False,
    word_count=10,
    only_doc_words=True,
)

In [77]:
# Export the W_test_high scores together with the document names and raw
# texts (target path: output.xlsx), then preview the returned table.
df = export_to_excel(W_test_high, test_corpus, doc_names, filepath = 'output.xlsx')
df.head()

Unnamed: 0,name,universalism,benevolence,conformity,tradition,security,power,achievement,hedonism,stimulation,self-direction,Text
0,pope.txt,4.145273,70.850015,69.740629,65.097958,12.640265,39.438785,12.978302,51.905642,42.852234,7.511727,"Good evening â€“ or, good morning, I am not su..."
1,dod.txt,79.515827,0.563155,41.912356,0.170443,89.551388,58.914988,45.434234,3.1e-05,52.781536,66.028535,\nOn behalf of the Secretary of Defense and De...
2,https://www.nationalgeographic.com/science/spa...,89.320613,0.024123,6.440324,0.259553,59.058197,62.087888,15.350474,9.623754,80.428628,0.037698,"Earth, our home planet, is the only planet in ..."
3,https://sadasd,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
4,asdasd,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


In [31]:
# Export scores together with the document names and raw texts
# (target path: output.csv), then preview the returned table.
# NOTE(review): W_test_norm is not assigned by any visible cell - the evaluation
# cell binds W_test_norms (plural). This cell may rely on stale kernel state and
# raise NameError on a fresh run; confirm which matrix is intended here.
df = export_to_csv(W_test_norm, test_corpus, doc_names, filepath = 'output.csv')
df.head()

Unnamed: 0,name,universalism,benevolence,conformity,tradition,security,power,achievement,hedonism,stimulation,self-direction,Text
0,pope.txt,27.988363,67.763382,61.13839,51.092589,32.598644,26.016194,28.216559,45.193225,52.891968,46.685189,"Good evening â€“ or, good morning, I am not su..."
1,dod.txt,63.816557,12.494627,16.752545,38.126135,81.431774,59.355575,21.740743,5.620166,30.827866,28.070313,\nOn behalf of the Secretary of Defense and De...
2,https://www.nationalgeographic.com/science/spa...,53.513475,32.123116,20.938325,3.036925,65.892862,55.517597,0.250744,44.063394,78.876438,13.837296,"Earth, our home planet, is the only planet in ..."
3,https://sadasd,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
4,asdasd,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


In [19]:
# Export the word scores to an Excel file.
#   word_count=-1        -> all the words are exported
#   only_doc_words=True  -> only the words used in the documents are exported
# For proper document names in the output file, change the 'doc_names' list.
export_word_scores_excel(
    W_test_norms,
    W_test_list,
    tfidf_test,
    doc_names,
    pre_trained_doc,
    filepath='ssnmf_words.xlsx',
    purity_score=False,
    word_count=-1,
    only_doc_words=True,
)