# Theme Supervised Nonnegative Matrix Factorization for Topic Modeling

Import Libraries

In [1]:
import pandas as pd
import numpy as np
from time import time
import random

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.decomposition import LatentDirichletAllocation as LDA

import matplotlib.pylab as plt

import libs.text_preprocess as tp
import libs.genetic_algorithm as ga
from libs.TSNMF_Class import TSNMF



In [2]:
from nltk.corpus import reuters
from nltk.corpus import brown

### Read Data

Run one of the cells below

* index
* theme
* text

### Schwart Dataset

In [3]:
def read_data(filepath):
    data = pd.read_json(filepath)
    data = data[data['text']!=""]
    data['theme'] = data['theme'].apply(lambda x: [x])
    data = data.sort_values('theme')
    data = data[['title', 'theme', 'text']]
    data = data.rename({'title': 'id'}, axis=1)
    
    return data.reset_index(drop=True)

#https://github.com/bulentozel/OpenMaker/blob/master/Semantics/data/corpuses/schwartz.json
# schwartz.json or pruned_schwartz.json
filepath = 'pruned_schwartz.json'

data = read_data(filepath)

dataset = 'schwartz'

### Reuters Dataset

In [19]:
# Extract fileids from the reuters corpus
fileids = reuters.fileids()

# Initialize empty lists to store categories and raw text
categories = []
text = []

# Loop through each file id and collect each files categories and raw text
for file in fileids:
    categories.append(reuters.categories(file))
    text.append(' '.join(reuters.words(file)))

# Combine lists into pandas dataframe. reutersDf is the final dataframe. 
data = pd.DataFrame({'id':fileids, 'theme':categories, 'text':text}).sort_values('theme').reset_index(drop=True)

dataset = 'reuters'

### Brown Dataset

In [4]:
# Extract fileids from the reuters corpus
fileids = brown.fileids()

# Initialize empty lists to store categories and raw text
categories = []
text = []

# Loop through each file id and collect each files categories and raw text
for file in fileids:
    categories.append(brown.categories(file))
    text.append(' '.join(brown.words(file)))

# Combine lists into pandas dataframe. reutersDf is the final dataframe. 
data = pd.DataFrame({'id':fileids, 'theme':categories, 'text':text}).sort_values('theme').reset_index(drop=True)

dataset = 'brown'

### Clean Texts
1. Fix bad wording: isn't -> is not
2. Clean and Tokenize -> min word len = 3, tokenize
3. Stopword Removal -> nltk.stopwords
4. Lemmatization -> WordNet Lemmatizer

In [4]:
data['text'] = data.text.apply(tp.clean_text)

In [5]:
data = data[data['text'].apply(lambda x: x.count(' ') > 25)].reset_index(drop=True).copy(deep=True)

## Training TSNMF Model

In [8]:
tsnmf_model = TSNMF(data = data, supervision = 'supervised', separate_models = True, bCool_init = True, train_test_split = [0.4, 0.6],
                    n_topics = 3, n_terms = 10000, background_for_theme = True, background_scoring = True,
                    beta_loss = 'kullback-leibler', term_vectorizer = 'tf', random_state = 6)

t0 = time()

tsnmf_model = tsnmf_model.split_train_test()

print("Training.. ", end='')
t1 = time()
tsnmf_context = tsnmf_model.fit()
print("%0.2fs - " % (time() - t1), end='')

print("Testing.. ", end='')
t1 = time()
tsnmf_context = tsnmf_model.evaluate_test_corpus(tsnmf_context)
print("%0.2fs - " % (time() - t1), end='')

tsnmf_context['tsnmf_model'] = tsnmf_model
print("Done:  %0.2fs" % (time() - t0))

Training.. 11.95s - Testing.. 18.19s - Done:  30.17s


### Scoring

In [13]:
def calculate_test_score(tsnmf_context):
    themes = tsnmf_context['tsnmf_model'].themes
    
    temp_pred_scores = []
    for ind, doc_wth in enumerate(tsnmf_context['W_test_high']):
        temp_pred_score = []
        for theme in tsnmf_context['tsnmf_model'].test_data.iloc[ind]['theme']:
            theme_id = themes.index(theme)
            temp_pred_score.append(np.log(len(themes))-np.log(np.argwhere(doc_wth.argsort()[::-1]==theme_id)[0][0] + 1))
        temp_pred_scores.append(temp_pred_score)

    return np.array([sum(tps) for tps in temp_pred_scores])


In [32]:
result_dict_keys = ['dataset','theme_count','train_set_size','test_set_size','train_split','supervision','separate_models','bCool_init',
                    'beta_loss','term_vectorizer','max_score_one_theme','max_score','prediction_score','prediction_score_perc','prediction_score_mean',
                    'prediction_score_std']

result_df = pd.DataFrame(columns=result_dict_keys)

In [33]:
prediction_scores = calculate_test_score(tsnmf_context)

n_themes = len(tsnmf_context['tsnmf_model'].themes)

result_dict = {
    'dataset': dataset,
    'theme_count': n_themes,
    'train_set_size': len(tsnmf_context['tsnmf_model'].train_data),
    'test_set_size': len(tsnmf_context['tsnmf_model'].test_data),
    'train_split': tsnmf_context['tsnmf_model'].train_test_split[0],
    'supervision': tsnmf_context['tsnmf_model'].supervision,
    'separate_models': 'separate' if tsnmf_context['tsnmf_model'].separate_models else 'combined',
    'bCool_init': 'bCool' if tsnmf_context['tsnmf_model'].bCool_init else 'random',
    'beta_loss': tsnmf_context['tsnmf_model'].beta_loss,
    'term_vectorizer': tsnmf_context['tsnmf_model'].term_vectorizer,
    'max_score_one_theme': np.log(n_themes),
    'max_score': 0,
    'prediction_score': 0,
    'prediction_score_perc': 0,
    'prediction_score_mean': 0,
    'prediction_score_std': 0
}

max_score = sum([sum(np.log(n_themes)-np.log(range(1, len(aps)+1))) for aps in tsnmf_context['tsnmf_model'].test_data['theme']])
result_dict['max_score'] = max_score
result_dict['prediction_score'] = prediction_scores.sum()
result_dict['prediction_score_perc'] = 100*prediction_scores.sum()/max_score
result_dict['prediction_score_mean'] = prediction_scores.mean()
result_dict['prediction_score_std'] = prediction_scores.std()

result_df = result_df.append(result_dict, ignore_index=True)

In [34]:
result_df

Unnamed: 0,dataset,theme_count,train_set_size,test_set_size,train_split,supervision,separate_models,bCool_init,beta_loss,term_vectorizer,max_score_one_theme,max_score,prediction_score,prediction_score_perc,prediction_score_mean,prediction_score_std
0,schwartz,10,173,260,0.4,supervised,separate,bCool,kullback-leibler,tf,2.302585,598.672124,480.543134,80.268166,1.848243,0.645351


## Classical NMF-LDA

In [46]:
t0 = time()

tsnmf_model = TSNMF(data = data, supervision = 'semi_supervised', train_test_split = [0.4, 0.6], random_state = 6)

if tsnmf_model.train_test_split[0] > 0.25 and tsnmf_model.train_test_split[0] < 0.75:
    tsnmf_model = tsnmf_model.split_train_test()
else:
    tsnmf_model = tsnmf_model.split_train_test_forced()

train_data = tsnmf_model.train_data.copy(deep=True)
test_data = tsnmf_model.test_data.copy(deep=True)

print("Training.. ", end='')
t1 = time()

corpus = list(train_data.text)
term_vectorizer = CountVectorizer(min_df=1, ngram_range=(1,3), max_features=10000)
tf = term_vectorizer.fit_transform(corpus)

model = NMF(n_components = len(tsnmf_model.themes), solver='mu', beta_loss='kullback-leibler', alpha=.1, l1_ratio=.5)
W = model.fit_transform(X=tf)
model_type = 'NMF'

#model = LDA(n_components=len(tsnmf_model.themes))
#W = model.fit_transform(tf)
#model_type = 'LDA'

print("%0.2fs - " % (time() - t1), end='')

print("Genetic Algorithm.. ", end='')
t1 = time()

population, solution, solution_ind, solution_obj = ga.run_ga(train_data[train_data.labeled == 1], W, tsnmf_model.themes, stopGeneration=100)

print("%0.2f, %0.2fs - " % (solution_obj, time() - t1), end='')

nmflda_context = {
    'type': model_type,
    'model': model,
    'W': W,
    'solution': solution,
    'solution_obj': solution_obj,
    'data': train_data,
    'beta_loss': 'none',
    'term_vectorizer': 'tf',
    'train_perc': tsnmf_model.train_test_split[0],
    'random_state': tsnmf_model.random_state
}

print("Done:  %0.2fs" % (time() - t0))

Training.. 20.20s - Genetic Algorithm.. 209.57, 11.20s - Done:  31.42s


### Scoring

In [53]:
result_dict_keys = ['type', 'dataset','theme_count','train_set_size','test_set_size','train_split', 'term_vectorizer','max_score_one_theme',
                    'max_score','prediction_score','prediction_score_perc','prediction_score_mean', 'prediction_score_std']

result_df = pd.DataFrame(columns=result_dict_keys)

In [54]:
themes = sorted(list(set(nmflda_context['data']['theme'].sum())))
n_themes = len(themes)
result_dict = {
    'type': nmflda_context['type'],
    'dataset': dataset,
    'theme_count': n_themes,
    'train_set_size': len(nmflda_context['data'][nmflda_context['data'].labeled == 1]),
    'test_set_size': len(nmflda_context['data'][nmflda_context['data'].labeled == 0]),
    'train_split': nmflda_context['train_perc'],
    'term_vectorizer': nmflda_context['term_vectorizer'],
    'max_score_one_theme': np.log(n_themes),
    'max_score': 0,
    'prediction_score': 0,
    'prediction_score_perc': 0,
    'prediction_score_mean': 0,
    'prediction_score_std': 0
}

prediction_scores, _ = ga.calculateTestScore(nmflda_context['solution'], nmflda_context['data'][nmflda_context['data'].labeled == 0], nmflda_context['W'], themes)
prediction_scores = np.array(prediction_scores)

max_score = sum([sum(np.log(n_themes)-np.log(range(1, len(aps)+1))) for aps in nmflda_context['data'][nmflda_context['data'].labeled == 0]['theme']])
result_dict['max_score'] = max_score
result_dict['prediction_score'] = prediction_scores[i].sum()
result_dict['prediction_score_perc'] = 100*prediction_scores[i].sum()/max_score
result_dict['prediction_score_mean'] = prediction_scores[i].mean()
result_dict['prediction_score_std'] = prediction_scores[i].std()

result_df = result_df.append(result_dict, ignore_index=True)

In [55]:
result_df

Unnamed: 0,type,dataset,theme_count,train_set_size,test_set_size,train_split,term_vectorizer,max_score_one_theme,max_score,prediction_score,prediction_score_perc,prediction_score_mean,prediction_score_std
0,NMF,schwartz,10,173,260,0.4,tf,2.302585,598.672124,1.609438,0.268835,1.609438,0.0


## Term Scoring

In [56]:
def _row_norm(dists):
    # row normalization function required
    # for doc_topic_dists and topic_term_dists
    return dists / dists.sum(axis=1)[:, None]


def _direct_term_scores(W, H, X):
    '''
    W: doc_topic_dists (n_doc x n_topic)
    H: topic_term_dists (n_topic x n_term)
    H: doc_topic_dists (n_doc x n_term)
    '''
    
    direct_term_scores = []
    
    for i, W_row in enumerate(W):
        direct_term_scores.append((X[i].toarray()>0) * (H * W_row[:, None]))
    
    return np.array(direct_term_scores)

def _purity_term_scores(direct_term_scores):
    purity_term_scores = []
    
    for i in range(len(direct_term_scores)):
        purity_term_scores.append(direct_term_scores[i] * (direct_term_scores[i] / (direct_term_scores[i]+direct_term_scores[-1])))
        
    return np.nan_to_num(np.array(purity_term_scores))


def calculate_term_scores(H, purity_ratio):
    direct_term_scores = _row_norm(H)
    #direct_term_scores = H
    purity_term_scores = _purity_term_scores(direct_term_scores)
    
    term_scores = (1-purity_ratio) * direct_term_scores + purity_ratio * purity_term_scores
    
    return np.array(term_scores)


def calculate_doc_term_scores(W, H, X, purity_ratio):
    direct_term_scores = _direct_term_scores(_row_norm(W), _row_norm(H), X)
    purity_term_scores = _purity_term_scores(direct_term_scores)
    
    docs_term_scores = (1-purity_ratio) * direct_term_scores + purity_ratio * purity_term_scores
    
    return np.array(docs_term_scores)

In [59]:
def print_top_words(topic_term_dists, feature_names, n_top_words):
    for topic_idx, topic in enumerate(topic_term_dists):
        print("Topic #%d:" % topic_idx)
        print(" - ".join([feature_names[i] + '(' + '%.4f'%topic_term_dists[topic_idx][i] + ')' for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()
    
def return_top_words(topic_term_dists, feature_names, n_top_words):
    terms_list = []
    for topic_idx, topic in enumerate(topic_term_dists):
        terms_list.append([feature_names[i] + '(' + '%.4f'%topic_term_dists[topic_idx][i] + ')' for i in topic.argsort()[:-n_top_words - 1:-1]])
    return terms_list

In [63]:
def get_info(model):
    if 'tsnmf_model' in model:
        info_dict = {
            'dataset': dataset,
            'type': 'TSNMF',
            'train_split': model['tsnmf_model'].train_test_split[0],
            'supervision': model['tsnmf_model'].supervision,
            'separate_models': 'separate' if model['tsnmf_model'].separate_models else 'combined',
            'subtopic_count': model['W_list'][0].shape[1]-1,
            'theme_count': len(model['tsnmf_model'].themes),
            'train_set_size': len(model['tsnmf_model'].train_data),
            'test_set_size': len(model['tsnmf_model'].test_data),
            'bCool_init': 'bCool' if model['tsnmf_model'].bCool_init else 'random',
            'beta_loss': model['tsnmf_model'].beta_loss,
            'term_vectorizer': model['tsnmf_model'].term_vectorizer,
            'solution': '-'
        }
    else:
        model['tf'] = tf
        model['tf_vectorizer'] = term_vectorizer
        info_dict = {
            'dataset': dataset,
            'type': model['type'],
            'train_split': model['train_perc'],
            'supervision': '-',
            'separate_models': '-',
            'subtopic_count': 1,
            'theme_count': model['W'].shape[1],
            'train_set_size': len(model['data']),
            'test_set_size': len(model['data'][model['data']['labeled'] == 0]),
            'bCool_init': '-',
            'beta_loss': model['beta_loss'],
            'term_vectorizer': model['term_vectorizer'],
            'solution': model['solution'] 
        }
    return info_dict

In [61]:
def get_term_scores(model, info_dict, purity_ratio, theme_id=0):
    if 'tsnmf_model' in model:
        if info_dict['separate_models'] == 'combined':
            term_scores = calculate_term_scores(model['tsnmf'].components_, purity_ratio=purity_ratio)
            doc_term_scores = calculate_doc_term_scores(model['W'], model['tsnmf'].components_, model['tf'], purity_ratio=purity_ratio)
        else:
            term_scores = calculate_term_scores(model['tsnmf_list'][theme_id].components_, purity_ratio=purity_ratio)
            doc_term_scores = calculate_doc_term_scores(model['W_list'][theme_id], model['tsnmf_list'][theme_id].components_, model['tf'], purity_ratio=purity_ratio)
    else:
        term_scores = calculate_term_scores(model['model'].components_, purity_ratio=0)
        doc_term_scores = calculate_doc_term_scores(model['W'], model['model'].components_, model['tf'], purity_ratio=0)
        
    return term_scores, doc_term_scores

In [73]:
purity_ratio = 1
term_count = 10
model = tsnmf_context
result_df = pd.DataFrame()
info_dict = get_info(model)

if 'tsnmf_model' in model and info_dict['separate_models'] == 'separate':
    for theme_id, theme in enumerate(themes):
        term_scores, _ = get_term_scores(model, info_dict, purity_ratio, theme_id)

        terms_list = return_top_words(term_scores, model['tf_vectorizer'].get_feature_names(), term_count)
        terms_list_sum = return_top_words(np.sum(term_scores[:-1],0)[None, :], model['tf_vectorizer'].get_feature_names(), term_count)[0]
        terms_list_max = return_top_words(np.max(term_scores[:-1],0)[None, :], model['tf_vectorizer'].get_feature_names(), term_count)[0]

        for i, terms in enumerate(terms_list):
            result_df.insert(len(result_df.columns), theme+'_'+str(i+1), terms)
        result_df.insert(len(result_df.columns), theme+'_sum', terms_list_sum)
        result_df.insert(len(result_df.columns), theme+'_max', terms_list_max)

else:
    term_scores, _ = get_term_scores(model, info_dict, 0)
    terms_list = return_top_words(term_scores, model['tf_vectorizer'].get_feature_names(), term_count)
    for subtopic_id, terms in enumerate(terms_list):
        result_df.insert(len(result_df.columns), themes[model['solution'].index(subtopic_id)], terms)





In [75]:
result_df

Unnamed: 0,achievement_1,achievement_2,achievement_3,achievement_4,achievement_sum,achievement_max,benevolence_1,benevolence_2,benevolence_3,benevolence_4,...,tradition_3,tradition_4,tradition_sum,tradition_max,universalism_1,universalism_2,universalism_3,universalism_4,universalism_sum,universalism_max
0,capital(0.0207),bystander(0.0100),status(0.0253),one(0.0026),social(0.0365),status(0.0253),law(0.0241),truth(0.0250),pardon(0.0141),social(0.0028),...,humility(0.0495),social(0.0027),virtue(0.0507),humility(0.0495),environmental(0.0102),energy(0.0149),green(0.0105),one(0.0031),environmental(0.0184),energy(0.0149)
1,social(0.0141),individual(0.0082),social(0.0144),social(0.0022),status(0.0296),capital(0.0207),ethic(0.0166),theory(0.0106),rand(0.0096),one(0.0025),...,christian(0.0446),one(0.0026),tradition(0.0498),virtue(0.0454),movement(0.0065),peace(0.0087),environmental(0.0082),social(0.0029),energy(0.0158),green(0.0105)
2,stratification(0.0081),impression(0.0082),influence(0.0093),use(0.0021),capital(0.0213),social(0.0144),natural(0.0122),altruism(0.0088),loyalty(0.0096),use(0.0021),...,view(0.0156),use(0.0021),humility(0.0495),christian(0.0446),socialism(0.0055),displaystyle(0.0062),party(0.0065),may(0.0025),green(0.0118),environmental(0.0102)
3,society(0.0079),social(0.0079),attainment(0.0083),state(0.0020),individual(0.0158),bystander(0.0100),natural law(0.0111),true(0.0073),responsibility(0.0061),may(0.0020),...,god(0.0143),may(0.0020),christian(0.0456),tradition(0.0432),socialist(0.0055),abduction(0.0047),think(0.0057),people(0.0021),party(0.0104),peace(0.0087)
4,accumulation(0.0077),group(0.0068),ascribe(0.0073),may(0.0020),society(0.0152),influence(0.0093),good(0.0070),help(0.0065),grant(0.0055),state(0.0020),...,jesus(0.0121),state(0.0020),ethic(0.0299),ascetic(0.0231),woman(0.0044),advaita(0.0041),convention(0.0051),also(0.0021),movement(0.0096),party(0.0065)
5,production(0.0065),management(0.0065),society(0.0072),also(0.0019),influence(0.0114),attainment(0.0083),moral(0.0060),one(0.0053),one(0.0051),also(0.0019),...,christianity(0.0113),also(0.0019),practice(0.0251),moral(0.0219),right(0.0043),peirce(0.0038),conservation(0.0050),individual(0.0020),peace(0.0090),movement(0.0065)
6,class(0.0060),male(0.0060),individual(0.0062),people(0.0017),people(0.0102),individual(0.0082),nature(0.0057),theory truth(0.0048),sentence(0.0041),environmental(0.0018),...,ethic(0.0106),people(0.0018),ascetic(0.0231),practice(0.0205),economic(0.0041),nobel(0.0036),green party(0.0050),theory(0.0020),world(0.0075),displaystyle(0.0062)
7,value(0.0054),impression management(0.0058),ascribe status(0.0061),environmental(0.0017),group(0.0100),impression(0.0082),philosophy(0.0048),say(0.0035),diffusion(0.0040),people(0.0017),...,love(0.0098),theory(0.0016),moral(0.0227),asceticism(0.0199),sustainable(0.0040),form(0.0032),environment(0.0049),use(0.0020),resource(0.0073),think(0.0057)
8,economic(0.0053),dominance(0.0056),foucault(0.0057),theory(0.0016),bystander(0.0100),stratification(0.0081),right(0.0045),love(0.0034),diffusion responsibility(0.0037),group(0.0016),...,life(0.0094),group(0.0016),asceticism(0.0199),ethic(0.0193),feminist(0.0039),reality(0.0031),natural(0.0049),group(0.0019),think(0.0071),socialism(0.0055)
9,business(0.0050),help(0.0052),status attainment(0.0056),include(0.0015),high(0.0100),society(0.0079),sin(0.0038),people(0.0034),objectivism(0.0036),individual(0.0016),...,one(0.0077),individual(0.0016),one(0.0197),character(0.0167),party(0.0039),hypothesis(0.0031),resource(0.0047),state(0.0018),use(0.0069),socialist(0.0055)
