One vs All Method

Train NMF for each topic separately.

Use all Wiki articles as Background Corpus.

In [1]:
import pandas as pd
import numpy as np
from time import time

import nltk

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

import matplotlib.pyplot as plt
from math import pi

from omterms.interface import *

import pickle

from ipywidgets import interact, fixed
from IPython.display import display

import requests
import json

import re

import requests
from bs4 import BeautifulSoup

## Plots and Prints

In [2]:
# Order in which the per-value models / score columns are stored:
# position i of a score row belongs to categories[i].
categories = [
    'universalism',
    'hedonism',
    'achievement',
    'power',
    'self-direction',
    'benevolence',
    'conformity',
    'tradition',
    'stimulation',
    'security',
]

# Order in which the values are presented (radar-chart axes, export columns).
schwartz = [
    'universalism',
    'benevolence',
    'conformity',
    'tradition',
    'security',
    'power',
    'achievement',
    'hedonism',
    'stimulation',
    'self-direction',
]

def plot_radar_chart(doc_topic_cumul, doc, doc_names):
    """Draw, save and show a radar (spider) chart of one document's value scores.

    Parameters
    ----------
    doc_topic_cumul : indexable
        ``doc_topic_cumul[doc]`` is a score row whose positions follow the
        module-level ``categories`` order.
    doc : int
        Index of the document to plot.
    doc_names : indexable
        ``doc_names[doc]`` is appended to the chart title.

    The figure is written to ``"Schwartz_Chart_<doc>"`` in the working
    directory before being shown.
    """
    # Re-order the scores from storage order (categories) to display order (schwartz).
    doc_scores = doc_topic_cumul[doc]
    schwartz_dist = [doc_scores[categories.index(value_name)] for value_name in schwartz]

    # One evenly spaced axis per value; repeat the first angle to close the polygon.
    axis_count = len(schwartz)
    angles = [k / float(axis_count) * 2 * pi for k in range(axis_count)]
    angles += angles[:1]

    plt.figure(figsize=(8, 8))
    ax = plt.subplot(111, polar=True)

    # Put the first axis at the top and go clockwise.
    ax.set_theta_offset(pi / 2)
    ax.set_theta_direction(-1)

    # Axis labels and radial grid (scores are plotted on a 0-100 scale).
    plt.xticks(angles[:-1], schwartz)
    ax.set_rlabel_position(0)
    plt.yticks([25, 50, 75], ["25", "50", "75"], color="grey", size=7)
    plt.ylim(0, 100)

    # Close the polygon by repeating the first score as well.
    values = schwartz_dist + schwartz_dist[:1]
    ax.plot(angles, values, linewidth=1, linestyle='solid')
    ax.fill(angles, values, 'b', alpha=0.1)

    plt.title("Schwartz Chart - " + doc_names[doc])
    plt.savefig("Schwartz_Chart_" + str(doc))
    plt.show()
    
    
class color:
    """ANSI escape sequences used to style text printed by this notebook."""

    # Foreground colours
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Resets every colour/attribute set above
    END = '\033[0m'
    
    
def print_top_words(model, theme, tfidf_vectorizer, n_top_words, n_topics=3):
    """Print the highest-weighted words of the first ``n_topics`` topics of one model.

    Parameters
    ----------
    model : sequence
        Indexable collection of fitted models; ``model[theme].components_`` is
        iterated row by row, one row per topic.
    theme : int
        Index into both ``model`` and the module-level ``categories`` list
        (the category name is printed as a heading).
    tfidf_vectorizer : object
        Supplies the vocabulary through ``get_feature_names_out()`` when it
        exists, otherwise through the older ``get_feature_names()``.
    n_top_words : int
        Number of words printed per topic.
    n_topics : int, default 3
        Printing stops once this many topics have been shown.
    """
    # get_feature_names() was removed from recent scikit-learn releases;
    # prefer the replacement but keep working with older vectorizers.
    names_getter = getattr(tfidf_vectorizer, "get_feature_names_out", None)
    if names_getter is None:
        names_getter = tfidf_vectorizer.get_feature_names
    feature_names = names_getter()

    print(color.CYAN + color.BOLD + categories[theme] + color.END)
    for topic_idx, topic in enumerate(model[theme].components_):
        # Plain comparison instead of `topic_idx / n_topics == 1`, which
        # raised ZeroDivisionError for n_topics == 0.
        if topic_idx == n_topics:
            break
        # Indices of the n_top_words largest weights, largest first.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        message = color.BOLD + "Topic #%d: " % topic_idx + color.END
        message += " ".join([feature_names[i] for i in top_indices])
        print(message)
    print()
    
def print_cumulative_train_doc_topics(data, doc_topic, doc, n_best):
    """Print the ``n_best`` highest scores of a training document on one line.

    Parameters
    ----------
    data : pandas.DataFrame
        ``data.iloc[doc]['theme']`` is printed next to the document number.
    doc_topic : indexable of numpy arrays
        ``doc_topic[doc]`` is the score row of the document.
    doc : int
        Document index.
    n_best : int
        How many of the largest scores to print, largest first.

    Each entry is printed as ``(name, index, score)``. Indices that fall
    outside the module-level ``categories`` list are labelled "General".
    """
    test_theme = data.iloc[doc]['theme']
    print(color.BOLD + "Doc " + str(doc) + color.RED +  " (" + test_theme + ")\t: " + color.END, end='')
    dt = doc_topic[doc]
    for i in dt.argsort()[:-n_best - 1:-1]:
        # Only an out-of-range index means "not one of the named categories";
        # any other error should surface instead of being swallowed.
        try:
            label = categories[i]
        except IndexError:
            label = "General"
        print("(", end='')
        print(color.CYAN + color.BOLD + label + color.END, end='')
        print(", %d, %.2lf)  " %(i, dt[i]), end='')
    print()
    
def print_cumulative_test_doc_topics(doc_topic, doc, n_best):
    """Print the ``n_best`` highest scores of a test document on one line.

    Parameters
    ----------
    doc_topic : indexable of numpy arrays
        ``doc_topic[doc]`` is the score row of the document.
    doc : int
        Document index.
    n_best : int
        How many of the largest scores to print, largest first.

    Each entry is printed as ``(name, index, score)``. Indices that fall
    outside the module-level ``categories`` list are labelled "General".
    """
    print(color.BOLD + "Doc " + str(doc) + "\t: " + color.END, end='')
    dt = doc_topic[doc]
    for i in dt.argsort()[:-n_best - 1:-1]:
        # Only an out-of-range index means "not one of the named categories";
        # any other error should surface instead of being swallowed.
        try:
            label = categories[i]
        except IndexError:
            label = "General"
        print("(", end='')
        print(color.CYAN + color.BOLD + label + color.END, end='')
        print(", %d, %.2lf)  " %(i, dt[i]), end='')
    print()

def print_doc_topics(doc_topic, doc, n_best):
    """Print the ``n_best`` highest sub-topic scores of a document on one line.

    Parameters
    ----------
    doc_topic : indexable of numpy arrays
        ``doc_topic[doc]`` is the score row of the document.
    doc : int
        Document index.
    n_best : int
        How many of the largest scores to print, largest first.

    The label of sub-topic ``i`` is ``categories[i // 3]``
    (presumably three sub-topics per category - confirm against the model
    layout); indices beyond the named categories are labelled "General".
    """
    print(color.BOLD + "Doc " + str(doc) + "\t: " + color.END, end='')
    scores = doc_topic[doc]
    for i in scores.argsort()[:-n_best - 1:-1]:
        # Only an out-of-range index means "not one of the named categories";
        # any other error should surface instead of being swallowed.
        try:
            label = categories[i//3]
        except IndexError:
            label = "General"
        print("(", end='')
        print(color.CYAN + color.BOLD + label + color.END, end='')
        print(", %d, %.2lf)  " %(i, scores[i]), end='')
    print()

def print_train_results(doc_topic, doc, corpus, data, doc_names=None):
    """Print a text excerpt and the score distribution of a training document, then plot it.

    Parameters
    ----------
    doc_topic : indexable of numpy arrays
        ``doc_topic[doc]`` is the score row of the document.
    doc : int
        Document index.
    corpus : sequence of str
        Raw texts; a 500-character excerpt starting one third into
        ``corpus[doc]`` is printed.
    data : pandas.DataFrame
        Passed on to ``print_cumulative_train_doc_topics`` (which reads the
        ``'theme'`` column).
    doc_names : indexable, optional
        ``doc_names[doc]`` is used in the chart title. When omitted, the
        title falls back to ``"Doc <doc>"``.
    """
    # plot_radar_chart requires a name lookup; the original call omitted it
    # and failed with a TypeError.
    if doc_names is None:
        doc_names = {doc: "Doc " + str(doc)}

    print(color.BOLD + "Document " + str(doc) + color.END)
    print()
    print(color.BOLD + "Text: " + color.END)
    excerpt_start = len(corpus[doc])//3
    print("..." + corpus[doc][excerpt_start:excerpt_start+500] + "...")
    print()
    print()

    print(color.BOLD + "Topic Distribution: " + color.END)
    print_cumulative_train_doc_topics(data, doc_topic, doc, 11)
    print()

    plot_radar_chart(doc_topic, doc, doc_names)
    
    
def print_test_results(doc, doc_topic, test_corpusPP, pre_nmf_list, pre_tfidf_vectorizer, word_topic_scores, word_topic_sp,
                       corpus, doc_names, pre_trained_doc, purity_score, word_count, only_doc_words):
    """Show the score distribution, radar chart and top-word table of one test document.

    Parameters
    ----------
    doc : int
        Document index.
    doc_topic : indexable of numpy arrays
        Normalized per-value scores; ``doc_topic[doc]`` is the row of the document.
    test_corpusPP : sequence of str
        Preprocessed texts, forwarded to ``schwartz_word_scores``.
    pre_nmf_list, corpus, pre_trained_doc
        Accepted for interface compatibility; not used in the body.
    pre_tfidf_vectorizer, word_topic_scores, word_topic_sp, purity_score, word_count, only_doc_words
        Forwarded unchanged to ``schwartz_word_scores``.
    doc_names : indexable
        ``doc_names[doc]`` is printed in the heading and used in the chart title.
    """
    print(color.BOLD + "Document " + str(doc) + ": " + doc_names[doc] + color.END)
    print()
    print()

    print(color.BOLD + "Topic Distribution: " + color.END)
    print_cumulative_test_doc_topics(doc_topic, doc, 11)
    print()

    plot_radar_chart(doc_topic, doc, doc_names)
    print()

    # Use the scores that were passed in rather than the notebook-level
    # W_test_norm global, so the table always matches the printed distribution.
    df_scores = schwartz_word_scores(doc, doc_topic, test_corpusPP, word_topic_scores, word_topic_sp, pre_tfidf_vectorizer, purity_score, word_count, only_doc_words)

    display(df_scores)
    

## Helper Functions

In [3]:
def cumulate_W(W, n_topics):
    """Sum every run of ``n_topics`` consecutive columns of ``W``.

    Parameters
    ----------
    W : 2-D numpy array
        One row per item, one column per sub-topic.
    n_topics : int
        Width of each column group.

    Returns
    -------
    numpy.ndarray
        One row per row of ``W`` and ``W.shape[1] // n_topics`` columns;
        trailing columns that do not fill a whole group are ignored.
    """
    group_count = W.shape[1] // n_topics
    summed_rows = [
        [row[g * n_topics:(g + 1) * n_topics].sum() for g in range(group_count)]
        for row in W
    ]
    return np.asarray(summed_rows)

def normalize_W(W):
    """Rescale every row of ``W`` so that it sums to 100.

    A new array is returned; ``W`` itself is left unchanged. Rows whose sum
    is zero are divided by zero (callers in this notebook clean the result
    with ``np.nan_to_num``).
    """
    row_totals = W.sum(axis=1).reshape(W.shape[0], 1)
    percentages = W / row_totals
    percentages *= 100
    return percentages

def prepare_export(W, docs, doc_names, filepath):
    """Build the export table: one row per document, one column per value.

    Parameters
    ----------
    W : indexable
        ``W[doc]`` is a score row in the module-level ``categories`` order.
    docs : sequence
        Document texts; stored in the ``Text`` column.
    doc_names : sequence
        Document names; stored in the ``name`` column, which is placed first.
    filepath : str
        Not used here; kept so all export helpers share one signature.

    Returns
    -------
    pandas.DataFrame
        Columns: ``name``, the values in ``schwartz`` order, then ``Text``.
    """
    # Re-order each score row from storage order (categories) to display order (schwartz).
    score_rows = np.asarray([
        [W[doc_idx][categories.index(value_name)] for value_name in schwartz]
        for doc_idx in range(len(docs))
    ])

    table = pd.DataFrame(data=score_rows, index=range(len(score_rows)), columns=schwartz)
    table['Text'] = docs
    table["name"] = doc_names

    # Rotate the last column ("name") to the front.
    column_order = table.columns.tolist()
    column_order = column_order[-1:] + column_order[:-1]
    return table[column_order]
    
def export_to_excel(W, docs, doc_names, filepath):
    """Write the per-document value scores to an Excel file and return the table.

    ``W`` must be the cumulated score matrix. ``filepath`` should end in
    ``.xlsx``.
    """
    export_table = prepare_export(W, docs, doc_names, filepath)
    export_table.to_excel(filepath)
    return export_table

def export_to_csv(W, docs, doc_names, filepath):
    """Write the per-document value scores to a CSV file and return the table.

    ``W`` must be the cumulated score matrix. ``filepath`` should end in
    ``.csv``.
    """
    export_table = prepare_export(W, docs, doc_names, filepath)
    export_table.to_csv(filepath)
    return export_table

def export_word_scores_excel(W_test_norm, W_test_list, doc_names, pre_trained_doc, filepath, purity_score=False, word_count=10, only_doc_words=False, test_corpusPP=None):
    """Write one Excel sheet of top word scores per document.

    Parameters
    ----------
    W_test_norm : indexable
        Normalized per-value document scores (shown as the first table row).
    W_test_list : sequence
        Raw per-model document-topic matrices.
    doc_names : sequence of str
        One entry per document; used (sanitized and truncated) in sheet names.
    pre_trained_doc : str
        Path of the pickle holding ``(nmf_list, tfidf_vectorizer)``.
    filepath : str
        Destination ``.xlsx`` file.
    purity_score, word_count, only_doc_words
        Forwarded to ``schwartz_word_scores``.
    test_corpusPP : sequence of str, optional
        Preprocessed documents. When omitted, the notebook-level variable of
        the same name is used, which is what this function always did.

    Raises
    ------
    NameError
        If ``test_corpusPP`` is neither passed nor defined at notebook level.
    """
    if test_corpusPP is None:
        # Backward compatibility: earlier callers relied on a notebook global.
        try:
            test_corpusPP = globals()["test_corpusPP"]
        except KeyError:
            raise NameError("test_corpusPP must be passed in or defined at notebook level") from None

    # NOTE: pickle.load can execute arbitrary code - only load trusted model files.
    with open(pre_trained_doc, "rb") as model_file:
        pre_nmf_list, pre_tfidf_vectorizer = pickle.load(model_file)
    word_topic_scores, word_topic_sp = calculate_word_topic_scores(pre_nmf_list, W_test_list)

    # The context manager saves and closes the workbook, also on errors
    # (ExcelWriter.save() no longer exists in current pandas).
    with pd.ExcelWriter(filepath, engine='xlsxwriter') as writer:
        for i, dn in enumerate(doc_names):
            df = schwartz_word_scores(i, W_test_norm, test_corpusPP, word_topic_scores, word_topic_sp, pre_tfidf_vectorizer, purity_score, word_count, only_doc_words)
            # Drop characters Excel does not allow in sheet names: \ : / * ? [ ]
            dn = re.sub(r'[\\:/*?\[\]]', '', dn)
            # The index prefix keeps sheet names unique after truncation.
            df.to_excel(writer, sheet_name=str(i)+'-'+dn[:25])
    
def export_doc_tfidf_scores(tfidf_test, doc_names, pre_trained_doc, filepath):
    """Write one Excel sheet per document listing its words by descending tf-idf.

    Parameters
    ----------
    tfidf_test : sparse matrix
        ``tfidf_test[i].toarray()[0]`` is the tf-idf row of document ``i``.
    doc_names : sequence of str
        One entry per document; used (sanitized, truncated to 30 characters)
        as sheet names.
    pre_trained_doc : str
        Path of the pickle holding ``(nmf_list, tfidf_vectorizer)``; only the
        vectorizer's vocabulary is used.
    filepath : str
        Destination ``.xlsx`` file.

    Words with a tf-idf score below 0.0005 are left out.
    """
    # NOTE: pickle.load can execute arbitrary code - only load trusted model files.
    with open(pre_trained_doc, "rb") as model_file:
        _, pre_tfidf_vectorizer = pickle.load(model_file)

    # The vocabulary is the same for every document, so look it up once.
    # get_feature_names() was removed from recent scikit-learn releases.
    names_getter = getattr(pre_tfidf_vectorizer, "get_feature_names_out", None)
    if names_getter is None:
        names_getter = pre_tfidf_vectorizer.get_feature_names
    feature_names = names_getter()

    # The context manager saves and closes the workbook, also on errors
    # (ExcelWriter.save() no longer exists in current pandas).
    with pd.ExcelWriter(filepath, engine='xlsxwriter') as writer:
        for i, dn in enumerate(doc_names):
            word_list = []
            tfidf_doc = tfidf_test[i].toarray()[0]
            for idx in tfidf_doc.argsort()[::-1]:
                # Scores are visited in descending order, so everything after
                # the first score under the threshold is below it as well.
                if tfidf_doc[idx] < 0.0005:
                    break
                word_list.append((feature_names[idx], np.round(tfidf_doc[idx], 3)))

            # Drop characters Excel does not allow in sheet names: \ : / * ? [ ]
            dn = re.sub(r'[\\:/*?\[\]]', '', dn)
            pd.DataFrame(word_list, columns=["Word", "tf-idf"]).to_excel(writer, sheet_name=dn[:30])

In [4]:
def getLinksHTMLaref(page):
    """Extract the first double-quoted ``a href=`` value from ``page``.

    :param page: HTML text to scan
    :return: ``(url, end)`` where ``url`` is the text between the quotes and
             ``end`` is the index of the closing quote (so the caller can
             continue scanning from ``page[end:]``); ``(None, 0)`` when no
             complete quoted ``a href=`` value is found
    """
    start_link = page.find("a href=")
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    if start_quote == -1:
        # Unquoted attribute: a -1 here used to be treated as an index and
        # produced a garbage slice of the page.
        return None, 0
    end_quote = page.find('"', start_quote + 1)
    if end_quote == -1:
        return None, 0
    return page[start_quote + 1: end_quote], end_quote

def getLinksHTML(page):
    """Extract the first double-quoted value starting with ``htt`` that follows an ``href=``.

    :param page: HTML text to scan
    :return: ``(url, end)`` where ``url`` is the text between the quotes and
             ``end`` is the index of the closing quote (so the caller can
             continue scanning from ``page[end:]``); ``(None, 0)`` when no
             such value is found
    """
    start_link = page.find("href=")
    if start_link == -1:
        return None, 0
    start_quote = page.find('"htt', start_link)
    if start_quote == -1:
        # Only relative links remain: a -1 here used to be treated as an
        # index and produced a garbage slice of the page.
        return None, 0
    end_quote = page.find('"', start_quote + 1)
    if end_quote == -1:
        return None, 0
    return page[start_quote + 1: end_quote], end_quote

def getLinksXML(page):
    """Extract the first ``http...`` text that follows a ``<link/>`` marker.

    :param page: feed text to scan (as serialized by the caller's parser,
                 where the URL appears as text right after ``<link/>``)
    :return: ``(url, end)`` where ``url`` runs from ``http`` up to the next
             ``<`` (or to the end of ``page`` if there is none) and ``end``
             is the index where the URL stops; ``(None, 0)`` when no
             ``<link/>`` followed by ``http`` is found
    """
    start_link = page.find("<link/>")
    if start_link == -1:
        return None, 0
    url_start = page.find('http', start_link)
    if url_start == -1:
        # A -1 here used to be treated as an index into the page.
        return None, 0
    url_end = page.find('<', url_start)
    if url_end == -1:
        # No following tag: the URL extends to the end of the text.
        url_end = len(page)
    return page[url_start: url_end], url_end


def extractFromURL(surl):
    """Download ``surl`` and return the distinct links found in it, in order of appearance.

    For addresses ending in ``xml`` only ``getLinksXML`` is used. For other
    pages the absolute links found by ``getLinksHTML`` come first, followed
    by the ``a href`` values found by ``getLinksHTMLaref``, each resolved
    against ``surl``.

    :param surl: address of the page or feed to scan
    :return: list of URL strings without duplicates
    """
    from urllib.parse import urljoin

    response = requests.get(surl)
    # Parse once; both passes scan the same serialized markup.
    page = str(BeautifulSoup(response.content, "lxml"))
    is_XML = surl.endswith('xml')

    url_list = []
    seen = set()

    def _collect(extractor, resolve):
        # Repeatedly take the next link and continue after it until the
        # extractor reports that nothing (or an empty link) is left.
        remaining = page
        while True:
            url, n = extractor(remaining)
            remaining = remaining[n:]
            if not url:
                break
            url = resolve(url)
            # The previous check compared against set(url) - the set of the
            # URL's characters - so duplicates were never filtered out.
            if url not in seen:
                seen.add(url)
                url_list.append(url)

    if is_XML:
        _collect(getLinksXML, lambda url: url)
    else:
        _collect(getLinksHTML, lambda url: url)
        # urljoin leaves absolute links untouched and resolves relative ones;
        # plain concatenation with the site root mangled both absolute links
        # and relative links without a leading slash.
        _collect(getLinksHTMLaref, lambda url: urljoin(surl, url))

    return url_list

## Main Functions

In [5]:
def preprocess_corpus(corpus):
    """Turn every document into a space-separated string of its stemmed terms.

    Each stem returned by ``extract_terms`` is repeated ``TF`` times, so the
    result can be fed to a bag-of-words vectorizer. Empty documents stay
    empty strings.

    :param corpus: iterable of raw document strings
    :return: list of preprocessed strings, one per document
    """
    def _expand(doc):
        if doc == '':
            return ''
        # Extract once per document; the original ran the whole extraction
        # twice (once for the stems, once for the frequencies).
        terms = extract_terms(doc, extra_process = ['stem'])
        return ' '.join(list((terms['Stem']+' ')*terms['TF']))

    PPcorpus = [_expand(doc) for doc in corpus]
    return PPcorpus
    
def evaluate_docs(docs, nmf, tfidf_test, betaloss = 'kullback-leibler'):
    """Project the tf-idf rows onto an already fitted NMF model.

    No fitting happens here: the model's learned components stay fixed and
    only the document-topic weights are computed.

    :param docs: not used; kept for interface compatibility
    :param nmf: fitted model exposing ``transform``
    :param tfidf_test: tf-idf matrix of the documents to evaluate
    :param betaloss: not used; kept for interface compatibility
    :return: whatever ``nmf.transform(tfidf_test)`` returns
    """
    return nmf.transform(tfidf_test)

In [6]:
def print_training_topics(pretrained_filepath):
    """Print the top 5 words of the first 3 topics of every pretrained model.

    :param pretrained_filepath: path of the pickle holding
                                ``(nmf_list, tfidf_vectorizer)``
    """
    # NOTE: pickle.load can execute arbitrary code - only load trusted model files.
    # The with-block closes the file, which the bare open() never did.
    with open(pretrained_filepath, "rb") as model_file:
        nmf_list, tfidf_vectorizer = pickle.load(model_file)
    print("\nTopics in NMF model:")
    # One model per entry of the module-level categories list.
    for i in range(len(categories)):
        print_top_words(nmf_list, i, tfidf_vectorizer, n_top_words=5, n_topics=3)

def add_corpus_txt(filepath, test_corpus):
    """Append the content of a text file to ``test_corpus``.

    Exactly one entry is appended per call so the corpus stays aligned with
    the list of document names: the file's text on success, or an empty
    string (plus a printed message) when the file cannot be read.

    :param filepath: path of the text file
    :param test_corpus: list that receives the text
    """
    try:
        # The with-block closes the file even when reading fails.
        with open(filepath, "r") as f:
            txt = f.read()
    except FileNotFoundError:
        test_corpus.append("")
        print("File not found - " + filepath)
    except (OSError, ValueError) as err:
        # Other read problems (permissions, undecodable bytes, ...) are
        # reported as such instead of being labelled "not found".
        test_corpus.append("")
        print("Could not read file - " + filepath + " (" + str(err) + ")")
    else:
        test_corpus.append(txt)


def add_corpus_url(url, api_key, test_corpus):
    """Fetch the text of a web page through the InSight scraper API and append it.

    Exactly one entry is appended per call so the corpus stays aligned with
    the list of document names: the scraped text on success, or an empty
    string (plus a printed message) otherwise.

    :param url: address of the page to scrape
    :param api_key: InSight API key
    :param test_corpus: list that receives the text
    """
    insightIP = 'http://178.62.229.16'
    insightPort = '8484'
    insightVersion = 'v1.0'

    insightSetting = insightIP + ':' + insightPort + '/api/' + insightVersion
    endpoint = insightSetting + '/text_analytics/url_scraper'

    # Let requests build the query string so that characters such as '&',
    # '#' or '?' inside the target url are percent-encoded instead of being
    # read as part of the API request itself.
    res = requests.get(endpoint, params={'url': url, 'api_key': api_key})

    # Decode the body once; a non-JSON answer is reported like other failures.
    try:
        payload = res.json()
    except ValueError:
        test_corpus.append("")
        print("Invalid response - " + url)
        return

    if "Unauthorized Connection" in payload:
        test_corpus.append("")
        print(payload["Unauthorized Connection"] + " - " + url)
    elif "Error" in payload:
        test_corpus.append("")
        print(payload["Error"] + " - " + url)
    elif "text" in payload:
        test_corpus.append(payload['text'])
        if payload['text'] == "":
            print("Empty text - " + url)
    else:
        test_corpus.append("")
        print("Empty text - " + url)
    
def evaluate_test_corpus(pretrained_filepath, test_corpus):
    """Score a corpus against every pretrained NMF model.

    :param pretrained_filepath: path of the pickle holding
                                ``(nmf_list, tfidf_vectorizer)``
    :param test_corpus: list of raw document strings
    :return: tuple ``(W_test_norm, W_test_list, test_corpusPP, tfidf_test)``:
             the normalized scores (NaNs replaced by 0), the raw
             ``transform`` outputs of all models as one array, the
             preprocessed corpus and its tf-idf matrix
    """
    # NOTE: pickle.load can execute arbitrary code - only load trusted model files.
    # The with-block closes the file, which the bare open() never did.
    with open(pretrained_filepath, "rb") as model_file:
        nmf_list, tfidf_vectorizer = pickle.load(model_file)
    test_corpusPP = preprocess_corpus(test_corpus)
    print()
    print('-'*30)
    print()
    print("Extracting tf-idf features for NMF...")
    t0 = time()
    tfidf_test = tfidf_vectorizer.transform(test_corpusPP)
    print(tfidf_test.shape[1])
    print("done in %0.2fs." % (time() - t0))

    W_test_list = []
    for i, nmf in enumerate(nmf_list):
        print("Fitting NMF for " + str(categories[i]))
        W_test = evaluate_docs(test_corpusPP, nmf, tfidf_test, betaloss = 'kullback-leibler')
        W_test_list.append(W_test)

    # Sum up the sub-topics in groups of three and rescale each row to 100.
    W_test_norm_list = []
    for W in W_test_list:
        W_test_cumul = cumulate_W(W, n_topics=3)
        W_test_norm_list.append(normalize_W(W_test_cumul))

    # After the transpose, slice 0 holds the first cumulated group of every
    # model (presumably the Schwartz-value share, the rest being background -
    # confirm against the training notebook).
    W_test_norm = np.asarray(W_test_norm_list).T[0]
    # Rows that summed to zero were divided by zero in normalize_W.
    W_test_norm = np.nan_to_num(W_test_norm)

    # cumulated-normalized and raw
    return W_test_norm, np.asarray(W_test_list), test_corpusPP, tfidf_test

def print_interactive_test_results(W_test_norm, W_test_list, test_corpus, test_corpusPP, doc_names, pre_trained_doc, purity_score, word_count, only_doc_words):
    """Show ``print_test_results`` behind a slider that selects the document.

    The pretrained models are loaded once and the word scores are computed
    once; only the ``doc`` index changes when the slider moves.

    :param W_test_norm: normalized per-value document scores
    :param W_test_list: raw per-model document-topic matrices
    :param test_corpus: raw document strings
    :param test_corpusPP: preprocessed document strings
    :param doc_names: one display name per document
    :param pre_trained_doc: path of the pickle holding
                            ``(nmf_list, tfidf_vectorizer)``
    :param purity_score: forwarded to ``print_test_results``
    :param word_count: forwarded to ``print_test_results``
    :param only_doc_words: forwarded to ``print_test_results``
    """
    # NOTE: pickle.load can execute arbitrary code - only load trusted model files.
    # The with-block closes the file, which the bare open() never did.
    with open(pre_trained_doc, "rb") as model_file:
        pre_nmf_list, pre_tfidf_vectorizer = pickle.load(model_file)
    word_topic_scores, word_topic_sp = calculate_word_topic_scores(pre_nmf_list, W_test_list)

    interact(print_test_results,
             doc = (0, len(W_test_norm)-1, 1),
             doc_topic=fixed(W_test_norm),
             test_corpusPP=fixed(test_corpusPP),
             pre_nmf_list=fixed(pre_nmf_list),
             pre_tfidf_vectorizer=fixed(pre_tfidf_vectorizer),
             word_topic_scores=fixed(word_topic_scores),
             word_topic_sp=fixed(word_topic_sp),
             corpus=fixed(test_corpus),
             doc_names=fixed(doc_names),
             pre_trained_doc=fixed(pre_trained_doc),
             purity_score=fixed(purity_score),
             word_count=fixed(word_count),
             only_doc_words=fixed(only_doc_words))

### General Model

Nonnegative Matrix Factorization (NMF) was first proposed by Lee and Seung [1]. NMF decomposes a given nonnegative matrix *X* into two factors *W* and *H* that contain only nonnegative values, such that the product of the two factors is approximately equal to the decomposed matrix. In NMF, given a $V \times T$ nonnegative matrix $X = \left \{ x_{\nu, \tau} \right \}$ where $\nu = 1:V$, $\tau = 1:T$ and the latent index is $i = 1:I$, we seek nonnegative matrices *W* and *H* such that

\begin{align*}
x_{\nu, \tau} \approx \left [ WH \right ]_{\nu, \tau} = \sum_{i} w_{\nu,i}h_{i,\tau}
\end{align*}

In this paper, we will refer to the $V\times I$ matrix W as the *template matrix*, and $I\times T$ matrix *H* the *excitation matrix*.


$X = WH$

$X$: documents X vocabulary. tf-idf is used for vocabulary.

$W$: documents X topics. Calculate a separate W for each Schwartz Value using the corresponding H.

$H$: topics X vocabulary. Calculate a separate H for each Schwartz Value in the training process.


### Calculating Schwartz Word Scores

* We have a fixed (learned) H matrix for each Schwartz Value that holds word-topic distribution.
* We have W matrix for each document's Schwartz Values that holds topic-document distribution.
* H matrix gave us an idea about the important words for each Schwartz Value (by providing some kind of weights for each word), but actually the weights of those words can be different for each document.
* We propose two different methods to calculate those document-specific weighted word scores.
 * The summary of the approach is as follows: If a word (other than a stopword) appears frequently in a document, it can be considered an important word for this document. If this word only occurs in a specific document, then it is even more important. This is basically tf-idf, which is the essential feature of this model. Moreover, if this word's tf-idf score is obtained more from a specific topic than from the background, then we can accept it as an important indicator of this document and topic.
 * General equation: $X = WH$. Rather than directly using X or H, we factor W into the calculation.
 * Direct Schwartz: Multiply W and H only through the specific Schwartz Value topics, excluding the background.
 * Purity Schwartz: Find the Schwartz Value purity of each word by taking the proportion of the Direct Schwartz Score of this word to the sum of its Direct Schwartz Score and its Direct Background Score (which excludes the Schwartz Value topics and includes the background topics) for each Schwartz Value. Then multiply this purity score with the Direct Schwartz Score to obtain the Purity Schwartz Score.

**Schwartz Value WH carpimi:**

Her Schwartz Value icin hangi kelimelerin daha onemli oldugunu anlamak icin H matrisini inceleyebiliriz. Her H matrisi bir Schwartz Value ve background corpus icin birden cok sub-topic seviyesinde kelime dagilimlarini barindirmakta. Yani 3 sub-topic seviyesinde Universalism ornegi dusunursek, modelimiz tek bir cesit universalism degil de 3 farkli universalism cesidi ogrenmeye calisiyor. Bu da bize her universalism cesidi icin farkli kelime onemleri sunuyor. Fakat universalism'le alakali en onemli kelimeler ne dendigi zaman sub-topic lerden bahsetmek yerine tek bir cati altinda toplamak genel resmi anlamayi cok daha kolaylastirmakta. 

Fakat burda sadece H matrisi uzerinden bir toplam yaptigimiz zaman dokumanlarin hangi Universalism sub-topic iyle alakali oldugu bilgisini atmis olmaktayiz. Bu sebeple her dokumanin neden belirli bir Schwartz Value'ya yoneldigini gosteren kelimeleri highlight etmek icin dokumanlarin sub-topic seviyesinde yoneldikleri Schwartz Value degerleri (W) ile kelimelerin sub-topic seviyesinde gruplandigi Schwartz Value (H) degerlerini carpip topluyoruz. Sonuc olarak bir dokuman icin onu siniflandirmamizda en cok etkileyen kelimeleri Schwartz Value lar arasinda da karsilastirma yapabildigimiz bir skorlama vermis oluyor. 

**Schwartz Value Purity**

Yukarida bahsedilen yontem butun Schwartz Value lar ve kelimeler arasinda goreceli bir karsilastirma yontemi saglamakta. Fakat kelimeleri modellemekte kullandigimiz tf-idf ten gelen bir kelimenin bir dokumanda cokca gectigi icin onemi (skorunun) daha fazla gozukmekte. Bir yandan bunun etkisini azaltan ve ayni zamanda Schwartz Value purity konseptini uygulayan bir eklenti yapiyoruz. Kelimelerin her dokuman ve her Schwartz Value icin ne kadar saf oldugunu olcuyoruz. Ve bunu da buldugumuz skorla carpiyoruz. Boylece bu kelime sadece istedigimiz Schwartz Value da geciyorsa skoru goreceli olarak artmis oluyor. Eger bu kelime cogunlukla istedigimiz Schwartz Value da degil de background corpus ta geciyorsa goreceli olarak skoru azalmis oluyor. Bu yontem ile aslinda istedigimiz Schwartz Value ile cok ilgili olmasa da sadece belirli dokumanlarda diger dokumanlara gore daha fazla gectigi icin skoru yuksek olan kelimelerin etkisini azaltmis oluyor. 


### Schwartz Value Word Scores

Understanding the behavior of the model is important to make deductions from it. Our model uses words to match the Schwartz Values with documents. The training of the model forms the $H$ matrix, which holds the word-topic distributions for each Schwartz Value. If we had used a classic, simpler NMF model, we could find the importance order of the words for each Schwartz Value by directly taking the marginal of the $H$ matrix for each topic. However, our model offers much more information through its sub-topics for each Schwartz Value and its semi-supervised nature. 

#### Direct Word Scores

Direct word score exploits the sub-topic structure of the model to come up with different word importance scores and orders for each document. The $H$ matrix includes different word distributions for each sub-topic of both a Schwartz Value and the Background Corpus. In other words, if there are three sub-topics for the *Power* Schwartz Value in the $H$ matrix, then our model learns three different concepts for the Power Schwartz Value, each of which provides different word scores. However, it is more logical to present a single set of word scores for a Schwartz Value rather than three different word score sets obtained from the sub-topics.

We can sum up values under sub-topics of H matrix to come up with a single word distribution with the cost of losing valuable sub-topic information. Thus, rather than finding a unified word-topic distribution for all documents, we calculate separate word scores for each document to highlight the important words that lead a document to be soft-classified as a specific Schwartz Value by dot product of documents' sub-topic level Schwartz Value scores ($W$) and words sub-topic level Schwartz Value scores ($H$). As a result, we obtain scores for all words under each Schwartz Value for each document that can be comparable with each other.

\begin{align*}
DWS = \sum_{i = 1}^{I/2} w_{\nu,i}h_{i,\tau}
\end{align*}

#### Purity Word Scores

\begin{align*}
DWS &= \sum_{i = 1}^{I/2} w_{\nu,i}h_{i,\tau}\\
BWS &= \sum_{i = I/2+1}^{I} w_{\nu,i}h_{i,\tau}\\
Purity &= \frac{DWS}{DWS+BWS} \\
PWS &= DWS \times Purity
\end{align*}




In [7]:
# scores are multiplied by 100
def calculate_word_topic_scores(pre_nmf_list, W_test_list, n_topics=3):
    """Compute direct and purity-weighted word scores for every model and document.

    For each model, the first ``n_topics`` topics are treated as the
    Schwartz-value topics and the remaining ones as background:

    * direct score     = W[:, :n_topics] . H[:n_topics, :]
    * background score = W[:, n_topics:] . H[n_topics:, :]
    * purity           = direct / (direct + background), 0 where undefined
    * purity score     = direct * purity

    Parameters
    ----------
    pre_nmf_list : sequence
        Fitted models; ``components_`` of each is used as H.
    W_test_list : sequence of 2-D arrays
        Document-topic matrix of each model, in the same order.
    n_topics : int, default 3
        Number of leading topics that belong to the Schwartz value.

    Returns
    -------
    (word_topic_scores, word_topic_sp) : tuple of numpy.ndarray
        Both indexed as ``[value, doc, word]`` and multiplied by 100.
    """
    word_topic_scores = []
    word_background_scores = []

    # Pair every model with its own W instead of assuming exactly 10 models.
    for pnmf, W in zip(pre_nmf_list, W_test_list):
        H = pnmf.components_
        word_topic_scores.append(np.dot(W[:, :n_topics], H[:n_topics, :]))
        word_background_scores.append(np.dot(W[:, n_topics:], H[n_topics:, :]))

    word_topic_scores = np.asarray(word_topic_scores)
    word_background_scores = np.asarray(word_background_scores)

    # Words with no weight at all give 0/0; nan_to_num maps those to 0, so
    # the accompanying runtime warnings are just noise.
    with np.errstate(divide='ignore', invalid='ignore'):
        word_topic_purity = np.nan_to_num(
            np.divide(word_topic_scores, word_topic_scores + word_background_scores))
    word_topic_sp = word_topic_scores * word_topic_purity

    word_topic_scores *= 100
    word_topic_sp *= 100

    return word_topic_scores, word_topic_sp

def find_top_word_scores(pre_tfidf_vectorizer, word_topic, word_count, test_corpusPP, only_doc_words):
    """List the highest-scoring words of every value for one document.

    Parameters
    ----------
    pre_tfidf_vectorizer : object
        Supplies the vocabulary through ``get_feature_names_out()`` when it
        exists, otherwise through the older ``get_feature_names()``.
    word_topic : 2-D numpy array
        One row of word scores per value.
    word_count : int
        Maximum number of words per value; a negative number (e.g. -1)
        disables the limit.
    test_corpusPP : str
        Preprocessed text of the document (whitespace-separated terms).
    only_doc_words : bool
        When true, words that do not occur in ``test_corpusPP`` are skipped
        and do not count towards ``word_count``.

    Returns
    -------
    list of list of (word, score)
        One list per row of ``word_topic``, sorted by descending score, with
        scores rounded to 3 decimals.
    """
    # get_feature_names() was removed from recent scikit-learn releases.
    names_getter = getattr(pre_tfidf_vectorizer, "get_feature_names_out", None)
    if names_getter is None:
        names_getter = pre_tfidf_vectorizer.get_feature_names
    feature_names = names_getter()

    # A set makes the per-word membership test constant-time.
    doc_words = set(test_corpusPP.split())

    word_list = []
    # Iterate over the rows that are actually there instead of assuming 10.
    for theme_scores in word_topic:
        top_words = []
        for idx in theme_scores.argsort()[::-1]:
            if len(top_words) == word_count:
                break
            word = feature_names[idx]
            if only_doc_words and word not in doc_words:
                continue
            top_words.append((word, np.round(theme_scores[idx], 3)))
        word_list.append(top_words)
    return word_list

def schwartz_word_scores(doc, W_test_norm, test_corpusPP, word_topic_scores, word_topic_sp, pre_tfidf_vectorizer, purity_score, word_count, only_doc_words):
    """Build the top-word table of one document, one column pair per value.

    For every value (in ``schwartz`` order) the table has a ``"<value> - word"``
    and a ``"<value> - score"`` column. The first row holds the upper-cased
    value name and the document's rounded score for it; the following rows
    hold the top words returned by ``find_top_word_scores``.

    ``purity_score`` selects ``word_topic_sp`` instead of
    ``word_topic_scores`` as the source of the word scores.
    """
    score_source = word_topic_sp if purity_score else word_topic_scores
    top_scores = find_top_word_scores(pre_tfidf_vectorizer, score_source[:,doc,:], word_count, test_corpusPP[doc], only_doc_words)

    frames = []
    for value_name in schwartz:
        # Scores are stored in `categories` order but shown in `schwartz` order.
        position = categories.index(value_name)
        header_row = (value_name.upper(), np.round(W_test_norm[doc][position], 3))
        frames.append(pd.DataFrame([header_row] + top_scores[position],
                                   columns=[value_name+" - word", value_name+" - score"]))

    return pd.concat(frames, axis=1)

## Print Pretrained Model's Topics

**nmf2_pretrained.p** or **nmf2_pretrained_pruned.p** includes pretrained NMF model generated using **Semi-Supervised-NMF-train-v2.ipynb** notebook. It has the nmf model and tfidf_vectorizer.

For the details of the pruned version, see also **"OMTermz HZ.ipynb"**.

In [8]:
def get_pretrained_words(pre_trained_doc, word_count, anti=0):
    """Tabulate the highest-weighted words of every pretrained model.

    Parameters
    ----------
    pre_trained_doc : str
        Path of the pickle holding ``(nmf_list, tfidf_vectorizer)``.
    word_count : int
        Number of words per value; a negative number (e.g. -1) lists the
        whole vocabulary.
    anti : int, default 0
        Which group of three summed topics to rank by: 0 is the first group,
        1 the second (presumably Schwartz-value vs. background topics -
        confirm against the training notebook).

    Returns
    -------
    pandas.DataFrame
        A ``"<value> - word"`` / ``"<value> - score"`` column pair per value,
        in ``schwartz`` order, scores rounded to 3 decimals.
    """
    # NOTE: pickle.load can execute arbitrary code - only load trusted model files.
    # The with-block closes the file, which the bare open() never did.
    with open(pre_trained_doc, "rb") as model_file:
        pre_nmf_list, pre_tfidf_vectorizer = pickle.load(model_file)

    # get_feature_names() was removed from recent scikit-learn releases.
    names_getter = getattr(pre_tfidf_vectorizer, "get_feature_names_out", None)
    if names_getter is None:
        names_getter = pre_tfidf_vectorizer.get_feature_names
    feature_names = names_getter()

    word_list = []
    for theme in range(len(categories)):
        # Transpose so cumulate_W sums the topic axis in groups of three,
        # then pick the requested group as a per-word weight vector.
        word_topic = cumulate_W(pre_nmf_list[theme].components_.T,3).T[anti]
        tmp_list = []
        for i, idx in enumerate(word_topic.argsort()[::-1]):
            if i == word_count:
                break
            tmp_list.append((feature_names[idx], np.round(word_topic[idx], 3)))
        word_list.append(tmp_list)

    # Re-order from storage order (categories) to display order (schwartz).
    df_list = []
    for sch in schwartz:
        df_list.append(pd.DataFrame(word_list[categories.index(sch)],
                                    columns=[sch+" - word", sch+" - score"]))
    score_df = pd.concat(df_list, axis=1)

    return score_df

def export_pretrained_excel(pre_trained_doc, filepath, word_count=-1, anti=0):
    """Write the table produced by ``get_pretrained_words`` to an Excel file.

    With the default ``word_count=-1`` the word limit never triggers, so
    every vocabulary word is exported.
    """
    pretrained_table = get_pretrained_words(pre_trained_doc, word_count, anti)
    pretrained_table.to_excel(filepath)

In [16]:
# Pickle file holding the pretrained (nmf_list, tfidf_vectorizer) pair; every
# later cell loads the models from this path.
pre_trained_doc = "nmf2_pretrained_pruned.p"
print_training_topics(pre_trained_doc)


Topics in NMF model:
[96m[1muniversalism[0m
[1mTopic #0: [0mstate human analyt form topic
[1mTopic #1: [0mintern peopl creation develop grow
[1mTopic #2: [0mgroup first disarma help econom

[96m[1mhedonism[0m
[1mTopic #0: [0msee psycholog problem sometim western
[1mTopic #1: [0mtime reaction repres research simpli
[1mTopic #2: [0mshock studi import less schadenfreud

[96m[1machievement[0m
[1mTopic #0: [0msocial role motiv other tribe
[1mTopic #1: [0mpeopl theori scale merchant support
[1mTopic #2: [0mrelat primari owen top increasingli

[96m[1mpower[0m
[1mTopic #0: [0mmay lower compos specialti peopl
[1mTopic #1: [0marticl toxic environ belong idea
[1mTopic #2: [0mleadership tool partner bia law

[96m[1mself-direction[0m
[1mTopic #0: [0muse gener resourc carrol interperson
[1mTopic #1: [0mmade take known variou well
[1mTopic #2: [0mbenedek romantic liberti domin olivero

[96m[1mbenevolence[0m
[1mTopic #0: [0mtheori seem natur need thou

Scores are cumulated word-topic values directly obtained from the pretrained word-topic matrix (H). The reason of higher values in universalism or hedonism is probably unbalanced distribution of training documents. (Universalism and Hedonism have much more training documents than others)


In [17]:
# Top 10 words per value, ranked by the first cumulated topic group (anti=0).
get_pretrained_words(pre_trained_doc, word_count=10)

Unnamed: 0,universalism - word,universalism - score,benevolence - word,benevolence - score,conformity - word,conformity - score,tradition - word,tradition - score,security - word,security - score,power - word,power - score,achievement - word,achievement - score,hedonism - word,hedonism - score,stimulation - word,stimulation - score,self-direction - word,self-direction - score
0,group,1.968,one,0.693,show,0.859,particularli,0.567,signific,0.617,leadership,0.647,social,0.882,see,1.544,reason,0.576,use,0.899
1,state,1.458,sometim,0.683,use,0.846,passion,0.522,thu,0.561,articl,0.636,use,0.83,studi,1.253,risk,0.563,well,0.726
2,peopl,1.421,natur,0.668,thu,0.713,sinc,0.503,relat,0.536,use,0.628,peopl,0.799,time,1.246,indic,0.55,gener,0.674
3,human,1.277,theori,0.663,other,0.702,thing,0.499,two,0.531,may,0.567,theori,0.772,psycholog,1.106,land,0.507,take,0.662
4,term,1.235,view,0.654,time,0.658,antiquitatem,0.482,land,0.523,peopl,0.526,role,0.77,shock,1.1,fun,0.437,known,0.615
5,intern,1.216,two,0.648,note,0.565,law,0.464,play,0.522,power,0.496,lower,0.751,use,1.028,exo,0.394,made,0.586
6,unit,1.205,thought,0.635,sourc,0.56,preciou,0.452,evid,0.517,lower,0.467,term,0.686,self,0.996,miyazaki,0.275,resourc,0.542
7,form,1.147,need,0.612,refer,0.517,success,0.438,pollut,0.508,idea,0.462,relat,0.674,repres,0.993,declin,0.251,benedek,0.487
8,world,1.146,upon,0.605,great,0.515,help,0.393,someth,0.502,least,0.45,work,0.661,social,0.968,suffer,0.167,romantic,0.468
9,refer,1.101,seem,0.605,word,0.463,three,0.392,fals,0.485,compos,0.43,loan,0.644,seri,0.961,psycholog,0.164,liberti,0.465


Exports all word-score pairs in vocabulary (~33000 words)

In [11]:
# word_count is left at its default of -1, so the whole vocabulary is written.
export_pretrained_excel(pre_trained_doc, filepath='pretrained_words.xlsx')

KeyboardInterrupt: 

## Evaluating Different Documents

Adding two example documents to the test_corpus.

In [18]:
# Sources to evaluate. Entries starting with http:// or https:// are fetched
# as URLs; everything else is read as a local text file.
# Pope TED talk, https://www.ted.com/speakers/pope_francis
# US Department of Defense, https://www.defense.gov/About/
# The last two entries are deliberately invalid, to show how unreachable
# sources are reported.
doc_names = ["pope.txt", "dod.txt", "https://www.nationalgeographic.com/science/space/solar-system/earth/", "https://sadasd", "asdasd"]
# Alternative: local files only.
#doc_names = ["pope.txt", "dod.txt"]

In [19]:
def add_corpus_docs(doc_names, test_corpus, insigth_api_key):
    """Append the text of every named source to ``test_corpus``.

    Names starting with ``http://`` or ``https://`` are fetched with
    ``add_corpus_url``; all other names are read as local files with
    ``add_corpus_txt``.

    :param doc_names: iterable of file paths and/or URLs
    :param test_corpus: list that receives one text per source
    :param insigth_api_key: API key passed on to ``add_corpus_url``
    """
    for source in doc_names:
        is_url = re.match("^(http|https)://", source) is not None
        if is_url:
            add_corpus_url(source, insigth_api_key, test_corpus)
        else:
            add_corpus_txt(source, test_corpus)

Crawling a website using InSight API and adding its text to test_corpus.

Always check the text added to the corpus via add_corpus_url, because websites can contain unexpected embedded text.

In [20]:
# Start from an empty corpus; add_corpus_docs appends one text per entry of doc_names.
test_corpus = []
# NOTE(review): do not commit a real key in the notebook - consider reading it
# from an environment variable instead.
insigth_api_key = "" #needs to be filled
add_corpus_docs(doc_names, test_corpus, insigth_api_key)

Content Not Found - https://sadasd
File not found - asdasd


Evaluate model for the test_corpus.

In [21]:
# Returns the normalized per-value scores, the raw per-model W matrices,
# the preprocessed corpus and its tf-idf matrix.
W_test_norm, W_test_list, test_corpusPP, tfidf_test = evaluate_test_corpus(pre_trained_doc, test_corpus)

Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 1857
Cleaning process: Initial size of tokens = 1857
Reduction due to punctuations and stopwords = 1332.
Reduction due to all numeral terms = 0
Reduction due to short terms = 0
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 0
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 1332
Percentage = 72%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom sto

  


Results for test_corpus

(All values are multiplied by 100)

When "only_doc_words" parameter set to True, the table will only show words from the documents.

In [22]:
# Interactive view: a slider selects the document; direct (non-purity) scores,
# 10 words per value, restricted to words that occur in the document.
print_interactive_test_results(W_test_norm, W_test_list, test_corpus, test_corpusPP, doc_names, pre_trained_doc, purity_score = False, word_count = 10, only_doc_words=True)



In [17]:
# Write the per-document value scores (plus name and text) to an Excel file
# and preview the first rows.
df = export_to_excel(W_test_norm, test_corpus, doc_names, filepath = 'output.xlsx')
df.head()

Unnamed: 0,name,universalism,benevolence,conformity,tradition,security,power,achievement,hedonism,stimulation,self-direction,Text
0,pope.txt,19.643708,62.965842,77.557248,38.481883,17.2962,43.970087,44.737977,51.584388,41.782835,49.243369,"Good evening â€“ or, good morning, I am not su..."
1,dod.txt,88.126194,0.003423,10.455854,28.390529,78.130468,60.376097,32.600423,4.951889,32.506595,32.863906,\nOn behalf of the Secretary of Defense and De...
2,https://www.nationalgeographic.com/science/spa...,85.302025,11.082767,24.76866,5.908274,60.309391,64.362547,4.187538,31.662962,91.641003,52.770443,"Earth, our home planet, is the only planet in ..."
3,https://sadasd,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
4,asdasd,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


In [18]:
# Same table as the Excel export, written as CSV.
df = export_to_csv(W_test_norm, test_corpus, doc_names, filepath = 'output.csv')
df.head()

Unnamed: 0,name,universalism,benevolence,conformity,tradition,security,power,achievement,hedonism,stimulation,self-direction,Text
0,pope.txt,19.643708,62.965842,77.557248,38.481883,17.2962,43.970087,44.737977,51.584388,41.782835,49.243369,"Good evening â€“ or, good morning, I am not su..."
1,dod.txt,88.126194,0.003423,10.455854,28.390529,78.130468,60.376097,32.600423,4.951889,32.506595,32.863906,\nOn behalf of the Secretary of Defense and De...
2,https://www.nationalgeographic.com/science/spa...,85.302025,11.082767,24.76866,5.908274,60.309391,64.362547,4.187538,31.662962,91.641003,52.770443,"Earth, our home planet, is the only planet in ..."
3,https://sadasd,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
4,asdasd,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


In [36]:
# When word_count is -1, the word limit never triggers and all words are exported.
# When only_doc_words is set to True, only words used in the document are exported.

# Sheet names are built from the 'doc_names' entries (prefixed with the document
# index); change that list if you want more readable names in the output file.
export_word_scores_excel(W_test_norm, W_test_list, doc_names, pre_trained_doc, filepath = 'ssnmf_words.xlsx', purity_score=False, word_count=-1, only_doc_words=True)



In [22]:
# Exports the tf-idf scores of the words used in each document to a single xlsx
# file, one sheet per document. Words scoring below 0.0005 are left out.
export_doc_tfidf_scores(tfidf_test, doc_names, pre_trained_doc, filepath = 'tfidf_docs.xlsx')