In [10]:
import pandas as pd
import numpy as np
from time import time

import nltk

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

import matplotlib.pyplot as plt
from math import pi

from omterms.interface import *

from ipywidgets import interact, fixed

## Plots and Prints

In [2]:
# Schwartz value names. Position k in this list labels the k-th group of topics
# in the printing, plotting and export helpers of this notebook.
categories = [
    'universalism',
    'hedonism',
    'achievement',
    'power',
    'self-direction',
    'benevolence',
    'conformity',
    'tradition',
    'stimulation',
    'security',
]

def plot_radar_chart(doc_topic_cumul, doc):
    """Draw a radar (spider) chart of one document's scores over the Schwartz values.

    Parameters
    ----------
    doc_topic_cumul : indexable
        Per-document scores; ``doc_topic_cumul[doc][k]`` is read as the score
        belonging to ``categories[k]``.
    doc : int
        Index of the document to plot; it is also shown in the chart title.

    Notes
    -----
    The radial axis is fixed to 0..100 with grid labels at 25, 50 and 75.
    Only the ten named values are drawn; any extra columns are ignored.
    """
    # ------- PART 1: background (axes, labels, grid)

    # Order in which the values are placed around the chart.
    schwartz = ['universalism', 'benevolence', 'conformity', 'tradition',
                'security', 'power', 'achievement', 'hedonism', 'stimulation',
                'self-direction']

    # Re-order the document's scores from `categories` order to display order.
    doc_scores = doc_topic_cumul[doc]
    schwartz_dist = [doc_scores[categories.index(name)] for name in schwartz]

    n_axes = len(schwartz)

    # One evenly spaced angle per axis; the first angle is repeated so the
    # outline drawn below closes on itself.
    angles = [k / float(n_axes) * 2 * pi for k in range(n_axes)]
    angles.append(angles[0])

    plt.figure(figsize=(8, 8))
    ax = plt.subplot(111, polar=True)

    # Put the first axis at the top and go clockwise.
    ax.set_theta_offset(pi / 2)
    ax.set_theta_direction(-1)

    # One labelled spoke per value.
    plt.xticks(angles[:-1], schwartz)

    # Radial grid labels and range.
    ax.set_rlabel_position(0)
    plt.yticks([25, 50, 75], ["25", "50", "75"], color="grey", size=7)
    plt.ylim(0, 100)

    # ------- PART 2: the document's polygon

    values = schwartz_dist + schwartz_dist[:1]
    ax.plot(angles, values, linewidth=1, linestyle='solid')
    ax.fill(angles, values, 'b', alpha=0.1)

    plt.title("Schwartz Chart - Doc " + str(doc))
    plt.show()
    
    
class color:
    """ANSI escape sequences used to colour and emphasise printed text."""

    # Foreground colours
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Reset all colours / attributes
    END = '\033[0m'
    
    
def print_top_words(model, tfidf_vectorizer, n_top_words, n_topics=3):
    """Print the highest-weighted terms of every topic, grouped by category.

    Parameters
    ----------
    model : fitted model exposing ``components_``
        Each row of ``components_`` is treated as one topic's term weights.
    tfidf_vectorizer : fitted vectorizer
        Supplies the vocabulary (term for each column of ``components_``).
    n_top_words : int
        Number of terms printed per topic, highest weight first.
    n_topics : int, default 3
        Number of consecutive topics that form one category group. A header
        with the category name is printed before each group and a blank line
        after it. Groups beyond the named categories are labelled "General".
    """
    # get_feature_names() is not available in newer scikit-learn releases;
    # prefer get_feature_names_out() and fall back for older installations.
    if hasattr(tfidf_vectorizer, "get_feature_names_out"):
        feature_names = tfidf_vectorizer.get_feature_names_out()
    else:
        feature_names = tfidf_vectorizer.get_feature_names()

    for topic_idx, topic in enumerate(model.components_):
        if topic_idx % n_topics == 0:
            # The group index must follow n_topics (not a fixed 3), otherwise
            # the headers drift out of step when the group size changes.
            group = topic_idx // n_topics
            label = categories[group] if group < len(categories) else "General"
            print(color.CYAN + color.BOLD + label + color.END)

        # argsort is ascending, so walk it backwards to get the largest weights.
        top_terms = [feature_names[i]
                     for i in topic.argsort()[:-n_top_words - 1:-1]]
        print(color.BOLD + "Topic #%d: " % topic_idx + color.END + " ".join(top_terms))

        if (topic_idx + 1) % n_topics == 0:
            print()
    print()
    
def print_cumulative_train_doc_topics(data, doc_topic, doc, n_best):
    """Print the strongest category scores of one training document on one line.

    Parameters
    ----------
    data : pandas.DataFrame
        Training metadata; the ``'theme'`` value of row ``doc`` (by position)
        is printed next to the document number.
    doc_topic : array-like of 1-D arrays
        ``doc_topic[doc][k]`` is the score of category column ``k``.
    doc : int
        Position of the document.
    n_best : int
        Number of (category, column index, score) triples to print, highest
        score first.
    """
    test_theme = data.iloc[doc]['theme']
    print(color.BOLD + "Doc " + str(doc) + color.RED + " (" + test_theme + ")\t: " + color.END, end='')
    dt = doc_topic[doc]
    for i in dt.argsort()[:-n_best - 1:-1]:
        # Columns past the named categories are labelled "General". An explicit
        # bounds check replaces the former bare `except:`, which would also
        # have hidden unrelated errors.
        label = categories[i] if i < len(categories) else "General"
        print("(" + color.CYAN + color.BOLD + label + color.END, end='')
        print(", %d, %.2f)  " % (i, dt[i]), end='')
    print()
    
def print_cumulative_test_doc_topics(doc_topic, doc, n_best):
    """Print the strongest category scores of one evaluated document on one line.

    Parameters
    ----------
    doc_topic : array-like of 1-D arrays
        ``doc_topic[doc][k]`` is the score of category column ``k``.
    doc : int
        Position of the document.
    n_best : int
        Number of (category, column index, score) triples to print, highest
        score first.
    """
    print(color.BOLD + "Doc " + str(doc) + "\t: " + color.END, end='')
    dt = doc_topic[doc]
    for i in dt.argsort()[:-n_best - 1:-1]:
        # Columns past the named categories are labelled "General". An explicit
        # bounds check replaces the former bare `except:`, which would also
        # have hidden unrelated errors.
        label = categories[i] if i < len(categories) else "General"
        print("(" + color.CYAN + color.BOLD + label + color.END, end='')
        print(", %d, %.2f)  " % (i, dt[i]), end='')
    print()

def print_doc_topics(doc_topic, doc, n_best, n_topics=3):
    """Print the strongest individual (un-grouped) topics of one document.

    Parameters
    ----------
    doc_topic : array-like of 1-D arrays
        ``doc_topic[doc][t]`` is the weight of topic ``t``.
    doc : int
        Position of the document.
    n_best : int
        Number of (category, topic index, weight) triples to print, highest
        weight first.
    n_topics : int, default 3
        Number of consecutive topics that belong to one category; topic ``t``
        is labelled with ``categories[t // n_topics]``. Topics past the named
        categories are labelled "General".
    """
    print(color.BOLD + "Doc " + str(doc) + "\t: " + color.END, end='')
    weights = doc_topic[doc]
    for i in weights.argsort()[:-n_best - 1:-1]:
        group = i // n_topics
        # Explicit bounds check instead of a bare `except:` around the lookup.
        label = categories[group] if group < len(categories) else "General"
        print("(" + color.CYAN + color.BOLD + label + color.END, end='')
        print(", %d, %.2f)  " % (i, weights[i]), end='')
    print()

def prin_train_results(doc_topic, doc, corpus, data):
    """Show a text excerpt, the category scores and a radar chart for a training document.

    Parameters
    ----------
    doc_topic : array-like
        Per-document category scores, passed on to the printing and plotting helpers.
    doc : int
        Position of the document to show.
    corpus : sequence of str
        Document texts; a 500-character excerpt starting one third into
        ``corpus[doc]`` is printed.
    data : pandas.DataFrame
        Training metadata, used to print the document's theme.
    """
    print(color.BOLD + "Document " + str(doc) + color.END)
    print()
    print(color.BOLD + "Text: " + color.END)

    # Excerpt: 500 characters beginning at one third of the text length.
    text = corpus[doc]
    excerpt_start = len(text) // 3
    print("..." + text[excerpt_start:excerpt_start + 500] + "...")
    print()
    print()

    print(color.BOLD + "Topic Distribution: " + color.END)
    # Up to 11 entries are listed.
    print_cumulative_train_doc_topics(data, doc_topic, doc, 11)
    print()

    plot_radar_chart(doc_topic, doc)
    
def prin_test_results(doc_topic, doc, corpus):
    """Show a text excerpt, the category scores and a radar chart for an evaluated document.

    Parameters
    ----------
    doc_topic : array-like
        Per-document category scores, passed on to the printing and plotting helpers.
    doc : int
        Position of the document to show.
    corpus : sequence of str
        Document texts; a 500-character excerpt starting one third into
        ``corpus[doc]`` is printed.
    """
    print(color.BOLD + "Document " + str(doc) + color.END)
    print()
    print(color.BOLD + "Text: " + color.END)

    # Excerpt: 500 characters beginning at one third of the text length.
    text = corpus[doc]
    excerpt_start = len(text) // 3
    print("..." + text[excerpt_start:excerpt_start + 500] + "...")
    print()
    print()

    print(color.BOLD + "Topic Distribution: " + color.END)
    # Up to 11 entries are listed.
    print_cumulative_test_doc_topics(doc_topic, doc, 11)
    print()

    plot_radar_chart(doc_topic, doc)
    
    

## Helper Functions

In [3]:
def build_W(N, n_topics, n_themes, theme_counts):
    """Build a random, block-structured initial document-topic matrix.

    Each theme owns ``n_topics`` consecutive columns. The documents of a theme
    get random values only in their own theme's columns (zeros elsewhere), and
    every document additionally gets ``n_topics`` random trailing columns.

    Parameters
    ----------
    N : int
        Total number of documents (rows).
    n_topics : int
        Number of topic columns per theme, and number of trailing columns.
    n_themes : int
        Number of themes.
    theme_counts : mapping or pandas.Series
        ``theme_counts.items()`` must yield ``(theme, doc_count)`` pairs. The
        rows are assumed to be grouped by theme in that same order — confirm
        that the caller sorts the documents accordingly.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(N, n_topics * (n_themes + 1))``.
    """
    # One draw of the same size as before, consumed front-to-back for the theme
    # blocks and from the tail for the trailing columns, so the result under a
    # fixed NumPy seed is unchanged.
    rands = np.random.random(N * n_topics * (n_themes + 1))
    W = np.zeros((N, n_topics * n_themes))

    row = 0    # first row of the current theme's documents
    used = 0   # number of random values consumed so far
    for idx, (theme, doc_count) in enumerate(theme_counts.items()):
        n_vals = doc_count * n_topics
        # Advance an offset instead of rebuilding the buffer through a Python
        # list on every theme.
        block = rands[used:used + n_vals].reshape((doc_count, n_topics))
        W[row:row + doc_count, idx * n_topics:(idx + 1) * n_topics] = block
        used += n_vals
        row += doc_count

    last_column = rands[-N * n_topics:].reshape((N, n_topics))

    return np.column_stack((W, last_column))

def cumulate_W(W, n_topics):
    """Sum every run of ``n_topics`` consecutive columns of ``W``.

    Parameters
    ----------
    W : numpy.ndarray
        2-D array with one row per document.
    n_topics : int
        Width of each column group.

    Returns
    -------
    numpy.ndarray
        One row per document and one column per complete group; trailing
        columns that do not fill a whole group are ignored.
    """
    n_groups = W.shape[1] // n_topics
    grouped = [
        [row[g * n_topics:(g + 1) * n_topics].sum() for g in range(n_groups)]
        for row in W
    ]
    return np.asarray(grouped)

def normalize_W(W):
    """Scale every row of ``W`` so that it sums to 100.

    Parameters
    ----------
    W : numpy.ndarray
        2-D array of scores, one row per document.

    Returns
    -------
    numpy.ndarray
        New float array of the same shape holding row-wise percentages. A row
        whose sum is zero is returned as all zeros instead of NaN.
    """
    W = np.asarray(W, dtype=float)
    row_totals = W.sum(axis=1, keepdims=True)

    # Divide only where the row total is non-zero; the pre-filled zeros remain
    # for empty rows, avoiding a 0/0 division and NaN output.
    W_cumul_norm = np.divide(W, row_totals, out=np.zeros_like(W), where=row_totals != 0)
    W_cumul_norm *= 100

    return W_cumul_norm

def export_to_excel(W, docs, filepath):
    """Write per-document category scores, with the document text, to an Excel file.

    Parameters
    ----------
    W : array-like
        Cumulated scores with one column per entry of ``categories`` plus a
        final ``'general'`` column.
    docs : sequence of str
        Document texts, one per row of ``W``; stored in a leading ``'Text'`` column.
    filepath : str
        Destination path; it should end in ``.xlsx``.

    Returns
    -------
    pandas.DataFrame
        The exported table (``'Text'`` first, then the score columns).
    """
    score_columns = categories + ['general']
    df = pd.DataFrame(data=W, index=range(len(W)), columns=score_columns)
    df['Text'] = docs
    # Move the text to the front so it is the first column of the sheet.
    df = df[['Text'] + score_columns]
    df.to_excel(filepath)
    return df

def export_to_csv(W, docs, filepath):
    """Write per-document category scores, with the document text, to a CSV file.

    Parameters
    ----------
    W : array-like
        Cumulated scores with one column per entry of ``categories`` plus a
        final ``'general'`` column.
    docs : sequence of str
        Document texts, one per row of ``W``; stored in a leading ``'Text'`` column.
    filepath : str
        Destination path; it should end in ``.csv``.

    Returns
    -------
    pandas.DataFrame
        The exported table (``'Text'`` first, then the score columns).
    """
    score_columns = categories + ['general']
    df = pd.DataFrame(data=W, index=range(len(W)), columns=score_columns)
    df['Text'] = docs
    # Move the text to the front so it is the first column of the file.
    df = df[['Text'] + score_columns]
    df.to_csv(filepath)
    return df

## Main Functions

In [4]:
def read_data(filepath):
    """Load the corpus from JSON, drop empty texts and order the rows by theme id.

    Parameters
    ----------
    filepath : str or file-like
        Anything accepted by ``pandas.read_json``; the data must provide
        ``'text'`` and ``'theme.id'`` columns.

    Returns
    -------
    pandas.DataFrame
        Rows whose ``'text'`` is not the empty string, sorted by ``'theme.id'``.
        The original index labels are kept.
    """
    raw = pd.read_json(filepath)
    has_text = raw['text'] != ""
    return raw[has_text].sort_values('theme.id')
    
def extract_corpus(data):
    """Return the values of ``data['text']`` as a plain Python list."""
    return list(data['text'])

def preprocess_corpus(corpus):
    """Turn every document into a string of stemmed terms repeated by frequency.

    For each document, ``extract_terms(doc, extra_process=['stem'])`` is called
    once and its ``'Stem'`` and ``'TF'`` columns are combined: every stem
    (followed by a space) is repeated ``TF`` times, and the per-stem strings
    are joined with single spaces.

    Parameters
    ----------
    corpus : iterable of str
        Raw document texts.

    Returns
    -------
    list of str
        One pre-processed string per input document.

    Notes
    -----
    ``extract_terms`` comes from ``omterms.interface``; its result is presumably
    a DataFrame with string ``'Stem'`` and integer ``'TF'`` columns — confirm
    against the omterms documentation.
    """
    PPcorpus = []
    for doc in corpus:
        # Extract once per document; the previous one-liner ran the (costly)
        # term extraction twice for every document.
        terms = extract_terms(doc, extra_process=['stem'])
        repeated_stems = (terms['Stem'] + ' ') * terms['TF']
        PPcorpus.append(' '.join(list(repeated_stems)))
    return PPcorpus

def train_corpus(corpus, data, n_topics=3, betaloss = 'kullback-leibler'):
    """Fit a theme-structured NMF model on tf-idf features of the training corpus.

    Parameters
    ----------
    corpus : list of str
        Pre-processed training documents, in the same row order as ``data``.
    data : pandas.DataFrame
        Training metadata with ``'theme.id'`` and ``'theme'`` columns. The rows
        are assumed to be sorted by ``'theme.id'`` so that the block structure
        of the initial W lines up with the documents — confirm for new callers.
    n_topics : int, default 3
        Number of topics per theme; the same number of extra topics is added
        that is not tied to any theme.
    betaloss : str, default 'kullback-leibler'
        Passed to ``NMF(beta_loss=...)`` and shown in the progress message.

    Returns
    -------
    nmf : sklearn.decomposition.NMF
        The fitted model.
    W : numpy.ndarray
        Document-topic matrix returned by ``fit_transform``.
    tfidf : sparse matrix
        tf-idf features of the training corpus.
    tfidf_vectorizer : TfidfVectorizer
        The fitted vectorizer (needed to transform new documents).
    """
    N = len(data)

    # Documents per theme. size() counts rows, so missing values in unrelated
    # columns cannot shrink a theme's block.
    theme_counts = data.groupby(['theme.id', 'theme']).size()
    n_themes = len(theme_counts)
    n_components = n_topics * n_themes

    print("Extracting tf-idf features for NMF...")
    tfidf_vectorizer = TfidfVectorizer() # optionally add maxfeatures = n_features to enforce number of features
    t0 = time()
    tfidf = tfidf_vectorizer.fit_transform(corpus)
    n_features = tfidf.shape[1]
    print("done in %0.2fs." % (time() - t0))

    # Custom initialisation: W is zero outside each document's own theme block
    # (plus the trailing shared columns); H is fully random.
    W = build_W(N, n_topics, n_themes, theme_counts)
    H = np.random.rand(n_components + n_topics, n_features)

    # Fit the NMF model
    print("Fitting the NMF model (" + betaloss + ") with tf-idf features, "
          "n_samples=%d and n_features=%d..."
          % (N, n_features))
    t0 = time()

    # NOTE(review): the `alpha` argument is not accepted by newer scikit-learn
    # releases (split into alpha_W / alpha_H) — confirm the pinned version.
    nmf = NMF(n_components=n_components + n_topics, solver='mu', beta_loss=betaloss,
              alpha=.1, l1_ratio=.5, init='custom')

    # Keep the matrix that fit_transform returns rather than relying on the
    # initial W having been updated in place.
    W = nmf.fit_transform(X=tfidf, W=W, H=H)
    print("done in %0.2fs." % (time() - t0))

    return nmf, W, tfidf, tfidf_vectorizer
    
def evaluate_docs(docs, nmf, tfidf_vectorizer, betaloss = 'kullback-leibler'):
    """Project new documents onto the topics of an already fitted NMF model.

    Parameters
    ----------
    docs : list of str
        Pre-processed documents to evaluate.
    nmf : fitted model exposing ``transform``
        The model returned by training.
    tfidf_vectorizer : fitted vectorizer exposing ``transform``
        The vectorizer fitted on the training corpus.
    betaloss : str, default 'kullback-leibler'
        Only used in the progress message; the model is not re-fitted here.

    Returns
    -------
    W_test
        Result of ``nmf.transform`` on the tf-idf features (document-topic weights).
    tfidf_test
        tf-idf features of ``docs``.
    """
    print("Extracting tf-idf features for NMF...")
    t0 = time()
    tfidf_test = tfidf_vectorizer.transform(docs)
    print("done in %0.2fs." % (time() - t0))

    # Only the document-topic weights are estimated; the topics stay fixed.
    print("Fitting the NMF model (" + betaloss + ") with tf-idf features, ")
    t0 = time()
    W_test = nmf.transform(tfidf_test)
    print("done in %0.2fs." % (time() - t0))

    return W_test, tfidf_test

## Training Model

In [None]:
# Training corpus; a copy is available at
# https://github.com/bulentozel/OpenMaker/blob/master/Semantics/data/corpuses/schwartz.json
filepath = 'schwartz.json'

# Load the metadata, pull out the raw texts, then build the pre-processed
# version of each text that is fed to the vectorizer.
data = read_data(filepath)
corpus = extract_corpus(data)
corpusPP = preprocess_corpus(corpus)

In [6]:
# Fit the NMF model on the pre-processed corpus, with 3 topics per theme.
nmf, W_train, tfidf_train, tfidf_vectorizer = train_corpus(corpusPP, data, n_topics=3, betaloss = 'kullback-leibler')

Extracting tf-idf features for NMF...
done in 0.77s.
Fitting the NMF model (kullback-leibler) with tf-idf features, n_samples=494 and n_features=36464...
done in 22.85s.


In [7]:
# List the 5 highest-weighted terms of every topic, in groups of 3 topics.
print("\nTopics in NMF model:")
print_top_words(nmf, tfidf_vectorizer, n_top_words=5, n_topics=3)


Topics in NMF model:
[96m[1muniversalism[0m
[1mTopic #0: [0morgan form specif mandatori crew
[1mTopic #1: [0mphilosophi good new two impact
[1mTopic #2: [0mdisarma manifest law explor relat

[96m[1mhedonism[0m
[1mTopic #3: [0mtime surpris refer sever thing
[1mTopic #4: [0muse self see pleasur piti
[1mTopic #5: [0moutrag shown peopl standard philosophi

[96m[1machievement[0m
[1mTopic #6: [0mthree mean recent offer report
[1mTopic #7: [0minterest use greater platform someon
[1mTopic #8: [0mplace properti intergener return work

[96m[1mpower[0m
[1mTopic #9: [0mbia sometim liu trivial throughout
[1mTopic #10: [0marticl wangchuck compos highli idea
[1mTopic #11: [0mmoham troubl guid use fusion

[96m[1mself-direction[0m
[1mTopic #12: [0msecess non domin serbia muslim
[1mTopic #13: [0mproject gener liberti carrol two
[1mTopic #14: [0mbenedek train photographi burkina right

[96m[1mbenevolence[0m
[1mTopic #15: [0midea valu polici shown taken


In [8]:
# Sum up sub topics
W_train_cumul = cumulate_W(W_train, n_topics=3)
W_train_norm = normalize_W(W_train_cumul)

In [12]:
# Interactive browser over the training documents: the slider picks `doc`,
# the remaining arguments are held fixed.
interact(prin_train_results, doc_topic=fixed(W_train_norm), doc = (0, len(W_train_norm)-1, 1), corpus=fixed(corpus), data=fixed(data))

<function __main__.prin_train_results>

In [13]:
# Write the normalised training scores, with the raw texts, to output.xlsx
# and preview the first rows.
df = export_to_excel(W_train_norm, corpus, filepath = 'output.xlsx')
df.head()

Unnamed: 0,Text,universalism,hedonism,achievement,power,self-direction,benevolence,conformity,tradition,stimulation,security,general
0,Critical thinking \n Sculpture of Socrates \n ...,0.060927,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.939073
1,Environmental justice \n This article has mult...,77.209201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.790799
2,"Natural resource \n ""Primary resource"" redirec...",13.842011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,86.157989
3,"Ceasefire \n ""Truce"" redirects here For other ...",0.516674,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.483326
4,International community \n The \n internationa...,0.002972,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.997028


In [14]:
# Write the same table to output.csv and preview the first rows.
df = export_to_csv(W_train_norm, corpus, filepath = 'output.csv')
df.head()

Unnamed: 0,Text,universalism,hedonism,achievement,power,self-direction,benevolence,conformity,tradition,stimulation,security,general
0,Critical thinking \n Sculpture of Socrates \n ...,0.060927,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.939073
1,Environmental justice \n This article has mult...,77.209201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.790799
2,"Natural resource \n ""Primary resource"" redirec...",13.842011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,86.157989
3,"Ceasefire \n ""Truce"" redirects here For other ...",0.516674,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.483326
4,International community \n The \n internationa...,0.002972,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.997028


## Evaluating Different Documents

To evaluate your own documents, append each one to the `test_corpus` list as a single string.

Two example documents.

In [15]:
test_corpus = []

# Each file is read inside a `with` block so the handle is closed even if the
# read fails. The encoding is given explicitly instead of relying on the
# platform default, which can garble non-ASCII punctuation.
# NOTE(review): assumes both files are UTF-8 encoded — confirm.

# Pope Francis TED talk, https://www.ted.com/speakers/pope_francis
with open("pope.txt", "r", encoding="utf-8") as f:
    pope = f.read()
test_corpus.append(pope)

# US Department of Defense, https://www.defense.gov/About/
with open("dod.txt", "r", encoding="utf-8") as f:
    dod = f.read()
test_corpus.append(dod)

In [None]:
# Apply the same pre-processing to the test documents as to the training corpus.
test_corpusPP = preprocess_corpus(test_corpus)

In [17]:
# Project the test documents onto the topics of the fitted model.
W_test, tfidf_test = evaluate_docs(test_corpusPP, nmf, tfidf_vectorizer, betaloss = 'kullback-leibler')

Extracting tf-idf features for NMF...
done in 0.00s.
Fitting the NMF model (kullback-leibler) with tf-idf features, 
done in 0.33s.


In [18]:
# Sum each group of 3 sub-topic columns into one score per group, then rescale
# every document's scores so that they add up to 100.
W_test_cumul = cumulate_W(W_test, n_topics=3)
W_test_norm = normalize_W(W_test_cumul)

In [19]:
# Interactive browser over the test documents: the slider picks `doc`,
# the remaining arguments are held fixed.
interact(prin_test_results, doc_topic=fixed(W_test_norm), doc = (0, len(W_test_norm)-1, 1), corpus=fixed(test_corpus))

<function __main__.prin_test_results>

In [20]:
# Write the normalised test scores to output.xlsx and preview the first rows.
# NOTE: this is the same path the training export uses, so that file is
# overwritten here.
df = export_to_excel(W_test_norm, test_corpus, filepath = 'output.xlsx')
df.head()

Unnamed: 0,Text,universalism,hedonism,achievement,power,self-direction,benevolence,conformity,tradition,stimulation,security,general
0,"Good evening â€“ or, good morning, I am not su...",3.210163,7.430994,4.855774,6.453324,0.137821,20.459419,27.277332,9.677749,3.74642,5.519147,11.231858
1,\nOn behalf of the Secretary of Defense and De...,19.748271,0.946942,8.978493,16.955744,13.825183,0.004017,2.1e-05,1.3e-05,12.855169,26.148252,0.537895


In [21]:
# Write the normalised test scores to output.csv and preview the first rows.
# NOTE: this is the same path the training export uses, so that file is
# overwritten here.
df = export_to_csv(W_test_norm, test_corpus, filepath = 'output.csv')
df.head()

Unnamed: 0,Text,universalism,hedonism,achievement,power,self-direction,benevolence,conformity,tradition,stimulation,security,general
0,"Good evening â€“ or, good morning, I am not su...",3.210163,7.430994,4.855774,6.453324,0.137821,20.459419,27.277332,9.677749,3.74642,5.519147,11.231858
1,\nOn behalf of the Secretary of Defense and De...,19.748271,0.946942,8.978493,16.955744,13.825183,0.004017,2.1e-05,1.3e-05,12.855169,26.148252,0.537895
