# Thesis

In [1]:
import pandas as pd
import numpy as np
from time import time
import random

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.decomposition import LatentDirichletAllocation as LDA

import matplotlib.pylab as plt

import pickle

import libs.text_preprocess as tp
import libs.genetic_algorithm as ga
from libs.TSNMF_Class import TSNMF

# https://github.com/bmabey/pyLDAvis/blob/master/pyLDAvis/_prepare.py
import pyLDAvis.gensim
import pyLDAvis.sklearn
import pyLDAvis

import os

pyLDAvis.enable_notebook()



In [2]:
from nltk.corpus import reuters
from nltk.corpus import brown

## Read Data

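Run exactly one of the dataset cells below (Schwartz, Reuters or Brown). Each of them assigns the corpus to `data`, so a cell run later overwrites the result of an earlier one. The resulting DataFrame provides: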

* index
* theme
* text

### Schwartz Dataset

In [3]:
def read_data(filepath):
    """Load a JSON corpus and shape it into the layout used by this notebook.

    Rows whose ``text`` is the empty string are dropped, every ``theme`` value
    is wrapped in a one-element list, rows are ordered by ``theme`` and the
    ``title`` column is exposed under the name ``id``.

    Parameters
    ----------
    filepath : str
        Location handed to ``pd.read_json``.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``theme`` and ``text`` with a fresh 0..n-1 index.
    """
    raw = pd.read_json(filepath)
    shaped = (
        raw[raw['text'] != ""]
        # themes are stored as lists so that every dataset shares one format
        .assign(theme=lambda frame: frame['theme'].apply(lambda value: [value]))
        .sort_values('theme')
        .loc[:, ['title', 'theme', 'text']]
        .rename(columns={'title': 'id'})
        .reset_index(drop=True)
    )
    return shaped

# Source of the corpus:
# https://github.com/bulentozel/OpenMaker/blob/master/Semantics/data/corpuses/schwartz.json
# Either schwartz.json or pruned_schwartz.json can be loaded here; the path is
# relative to the working directory.
filepath = 'pruned_schwartz.json'

# `data` is the working corpus that the cleaning cells further down operate on.
data = read_data(filepath)

### Reuters Dataset

In [100]:
# Document identifiers of the NLTK Reuters corpus
fileids = reuters.fileids()

# One entry per document: its category labels and its tokens joined by spaces
categories = [reuters.categories(fileid) for fileid in fileids]
text = [' '.join(reuters.words(fileid)) for fileid in fileids]

# Assemble the working corpus `data`, ordered by theme
data = (
    pd.DataFrame({'id': fileids, 'theme': categories, 'text': text})
    .sort_values('theme')
    .reset_index(drop=True)
)

### Brown Dataset

In [6]:
# Document identifiers of the NLTK Brown corpus
fileids = brown.fileids()

# One entry per document: its category labels and its tokens joined by spaces
categories = [brown.categories(fileid) for fileid in fileids]
text = [' '.join(brown.words(fileid)) for fileid in fileids]

# Assemble the working corpus `data`, ordered by theme
data = (
    pd.DataFrame({'id': fileids, 'theme': categories, 'text': text})
    .sort_values('theme')
    .reset_index(drop=True)
)

## Clean Texts
1. Expand contractions: isn't -> is not
2. Clean and Tokenize -> min word len = 3, tokenize
3. Stopword Removal -> nltk.stopwords
4. Lemmatization -> WordNet Lemmatizer

In [4]:
# Run libs.text_preprocess.clean_text over every document. The column is
# overwritten in place, so re-running this cell cleans already-cleaned text.
data['text'] = data.text.apply(tp.clean_text)

In [5]:
# Preview the first rows of the cleaned corpus.
data.head()

Unnamed: 0,id,theme,text
0,Ascribed status,[achievement],ascribe status part series political legal ant...
1,Relational capital,[achievement],relational capital redirect relational capital...
2,Human resources,[achievement],human resource us see human resource disambigu...
3,Intellectual capital,[achievement],intellectual capital intellectual capital inta...
4,Expenses versus Capital Expenditures,[achievement],expense versus capital expenditure redirect ca...


In [6]:
# Independent copy of the documents whose cleaned text contains more than 25
# space characters; shorter documents are left out.
dataX = data[data['text'].apply(lambda x: x.count(' ') > 25)].copy(deep=True)

In [7]:
# (all documents, documents kept in dataX, distinct themes among the kept documents)
len(data), len(dataX), len(sorted(list(set(dataX['theme'].sum()))))

(434, 433, 10)

In [8]:
# Number of documents in dataX that carry each theme label
labels = {}
for i, row in dataX.iterrows():
    for t in row['theme']:
        # a theme seen for the first time starts from zero
        labels[t] = labels.get(t, 0) + 1

#labels

In [9]:
# Apply the same length filter (more than 25 space characters) to `data`
# itself and renumber the index from 0.
data = data[data['text'].apply(lambda x: x.count(' ') > 25)].reset_index(drop=True).copy(deep=True)

## Analysis Tables

### NMF-LDA

In [34]:
# NOTE: unpickling can execute arbitrary code - only load pickle files that
# were produced by this project.
# NOTE(review): `result_dict_keys` is not defined in the visible part of the
# notebook; it has to exist before this cell runs.
result_df = pd.DataFrame(columns=result_dict_keys)
nmflda_pickle_dir = "all_pickles/pickles_schwartz_nmflda"
# os.listdir returns names in arbitrary order; sorting makes the file that is
# inspected (only the first one, see `break`) the same on every run.
for idx, pickle_name in enumerate(sorted(os.listdir(nmflda_pickle_dir))):
    print(idx, pickle_name)
    # The with-block closes the file even if unpickling raises
    with open(os.path.join(nmflda_pickle_dir, pickle_name), "rb") as pickle_in:
        nmflda_context_list = pickle.load(pickle_in)

    # Distinct theme labels found in the data stored with the first context
    themes = sorted(list(set(nmflda_context_list[0]['data']['theme'].sum())))
    n_themes = len(themes)

    # Only the first pickle is looked at
    break

0 reuters_lda_tf_10.pickle
1 reuters_lda_tf_20.pickle
2 reuters_lda_tf_30.pickle
3 reuters_lda_tf_40.pickle
4 reuters_lda_tf_50.pickle
5 reuters_lda_tf_60.pickle
6 reuters_lda_tf_70.pickle
7 reuters_lda_tf_80.pickle
8 reuters_lda_tf_90.pickle
9 reuters_nmf_kl_tf_10.pickle
10 reuters_nmf_kl_tf_20.pickle
11 reuters_nmf_kl_tf_30.pickle
12 reuters_nmf_kl_tf_40.pickle
13 reuters_nmf_kl_tf_50.pickle
14 reuters_nmf_kl_tf_60.pickle
15 reuters_nmf_kl_tf_70.pickle
16 reuters_nmf_kl_tf_80.pickle
17 reuters_nmf_kl_tf_90.pickle


### TSNMF Models

In [3]:
# NOTE: unpickling can execute arbitrary code - only load pickle files that
# were produced by this project.
tsnmf_pickle_dir = "all_pickles/pickles_schwartz"
# os.listdir returns names in arbitrary order; sorting makes the file that is
# loaded (only the first one, see `break`) the same on every run.
for idx, pickle_name in enumerate(sorted(os.listdir(tsnmf_pickle_dir))):
    print(idx, pickle_name)
    # The with-block closes the file even if unpickling raises
    with open(os.path.join(tsnmf_pickle_dir, pickle_name), "rb") as pickle_in:
        tsnmf_context_list = pickle.load(pickle_in)

    # Only the first pickle is loaded
    break

0 schwartz_semi_supervised_combined_bCool_kullback-leibler_tf_10.pickle


## Term Scoring

In [10]:
# Suffix of the all_pickles/pickles_<dataset_name> directory scanned by the
# loading cell below; get_info also reports it under the 'dataset' key.
dataset_name = 'schwartz'

In [61]:
def calculate_test_score(tsnmf_context_list):
    """Score how highly each test document's true themes are ranked.

    For every context, each row of ``context['W_test_high']`` is paired by
    position with the row of ``context['tsnmf_model'].test_data`` and its
    entries are ranked from largest to smallest. A true theme found at
    (1-based) rank ``r`` contributes ``log(n_themes) - log(r)``; the
    contributions of all true themes of a document are added up.

    Parameters
    ----------
    tsnmf_context_list : sequence of dict
        Each dict holds ``'tsnmf_model'`` (with ``themes`` and ``test_data``)
        and ``'W_test_high'``. The theme order is taken from the first entry.

    Returns
    -------
    numpy.ndarray
        One array of per-document scores for every context.
    """
    themes = tsnmf_context_list[0]['tsnmf_model'].themes
    # Contribution of a theme ranked first; lower ranks subtract log(rank)
    top_score = np.log(len(themes))

    def _document_score(theme_weights, true_themes):
        # Theme ids ordered from the largest weight to the smallest
        ranking = theme_weights.argsort()[::-1]
        return sum(
            top_score - np.log(np.argwhere(ranking == themes.index(name))[0][0] + 1)
            for name in true_themes
        )

    per_context = []
    for context in tsnmf_context_list:
        test_docs = context['tsnmf_model'].test_data
        per_context.append(np.array([
            _document_score(weights, test_docs.iloc[pos]['theme'])
            for pos, weights in enumerate(context['W_test_high'])
        ]))

    return np.array(per_context)

In [None]:
# NOTE: unpickling can execute arbitrary code - only load pickle files that
# were produced by this project.
all_models = []
pickle_dir = "all_pickles/pickles_" + dataset_name
# os.listdir returns names in arbitrary order; sorting keeps the positions in
# `all_models` identical from run to run.
for idx, pickle_name in enumerate(sorted(os.listdir(pickle_dir))):
    print(idx, pickle_name)
    # The with-block closes the file even if unpickling raises
    with open(os.path.join(pickle_dir, pickle_name), "rb") as pickle_in:
        tsnmf_context_list = pickle.load(pickle_in)

#     if tsnmf_context_list[0]['tsnmf_model'].bCool_init  and tsnmf_context_list[0]['tsnmf_model'].supervision == 'supervised' and tsnmf_context_list[0]['tsnmf_model'].separate_models:
    # Of all contexts stored in the pickle, keep the one with the highest
    # summed test score
    prediction_scores = calculate_test_score(tsnmf_context_list)
    best_run = np.array([np.sum(ps) for ps in prediction_scores]).argmax()
    all_models.append(tsnmf_context_list[best_run])

In [12]:
# Switch the loading cell below to the all_pickles/pickles_schwartz_terms directory.
dataset_name = 'schwartz_terms'

In [13]:
# NOTE: unpickling can execute arbitrary code - only load pickle files that
# were produced by this project.
all_models = []
pickle_dir = "all_pickles/pickles_" + dataset_name
# os.listdir returns names in arbitrary order. Later cells address the loaded
# models by position (index 0 and 1), so the listing is sorted to keep those
# positions stable.
for idx, pickle_name in enumerate(sorted(os.listdir(pickle_dir))):
    print(idx, pickle_name)
    # The with-block closes the file even if unpickling raises
    with open(os.path.join(pickle_dir, pickle_name), "rb") as pickle_in:
        tsnmf_context_list = pickle.load(pickle_in)
    all_models.append(tsnmf_context_list)

0 schwartz_nmf_kl_tf_100.pickle
1 schwartz_topic1_supervised_separate_bCool_kullback-leibler_tf_100.pickle
2 schwartz_topic_supervised_separate_bCool_kullback-leibler_tf_100.pickle


In [14]:
# random_states = []
# for i in range(27, 36):
#     tsnmf_context_list = all_models[i]
#     prediction_scores = calculate_test_score(tsnmf_context_list)
#     random_states.append(np.array([np.sum(ps) for ps in prediction_scores]).argmax())

In [15]:
# Earlier variant, kept for reference: pick one entry per pickle through the
# `random_states` list.
# all_models_terms = []
# for idx, model in enumerate(all_models):
#     all_models_terms.append(model[random_states[idx%9]])
# Current variant: keep the first entry of every loaded pickle.
all_models_terms = [am[0] for am in all_models]

In [16]:
def _row_norm(dists):
    # row normalization function required
    # for doc_topic_dists and topic_term_dists
    return dists / dists.sum(axis=1)[:, None]


def _direct_term_scores(W, H, X):
    '''
    W: doc_topic_dists (n_doc x n_topic)
    H: topic_term_dists (n_topic x n_term)
    H: doc_topic_dists (n_doc x n_term)
    '''
    
    direct_term_scores = []
    
    for i, W_row in enumerate(W):
        direct_term_scores.append((X[i].toarray()>0) * (H * W_row[:, None]))
    
    return np.array(direct_term_scores)

def _purity_term_scores(direct_term_scores):
    purity_term_scores = []
    
    for i in range(len(direct_term_scores)):
        purity_term_scores.append(direct_term_scores[i] * (direct_term_scores[i] / (direct_term_scores[i]+direct_term_scores[-1])))
        
    return np.nan_to_num(np.array(purity_term_scores))


def calculate_term_scores(H, purity_ratio):
    """Blend row-normalised topic-term scores with their purity scores.

    Parameters
    ----------
    H : numpy.ndarray
        topic_term_dists (n_topic x n_term); normalised with ``_row_norm``
        before use (the raw matrix is not scored directly).
    purity_ratio : float
        Weight of the purity scores; the direct scores get ``1 - purity_ratio``.

    Returns
    -------
    numpy.ndarray
        ``(1 - purity_ratio) * direct + purity_ratio * purity``.
    """
    direct = _row_norm(H)
    purity = _purity_term_scores(direct)

    blended = purity_ratio * purity + (1 - purity_ratio) * direct
    return np.array(blended)


def calculate_doc_term_scores(W, H, X, purity_ratio):
    """Blend per-document direct term scores with their purity scores.

    ``W`` and ``H`` are row-normalised with ``_row_norm`` and combined with
    ``X`` by ``_direct_term_scores``; the purity part comes from
    ``_purity_term_scores`` applied to that result.

    Parameters
    ----------
    W : numpy.ndarray
        doc_topic_dists (n_doc x n_topic).
    H : numpy.ndarray
        topic_term_dists (n_topic x n_term).
    X : matrix
        Document-term matrix passed through to ``_direct_term_scores``.
    purity_ratio : float
        Weight of the purity scores; the direct scores get ``1 - purity_ratio``.

    Returns
    -------
    numpy.ndarray
        ``(1 - purity_ratio) * direct + purity_ratio * purity``.
    """
    direct = _direct_term_scores(_row_norm(W), _row_norm(H), X)
    purity = _purity_term_scores(direct)

    blended = purity_ratio * purity + (1 - purity_ratio) * direct
    return np.array(blended)

In [17]:
def print_top_words(topic_term_dists, feature_names, n_top_words):
    """Print the ``n_top_words`` highest-scoring terms of every topic.

    Each topic is written as a ``Topic #k:`` header followed by one line of
    ``term(score)`` entries joined by `` - ``, best term first and scores
    shown with four decimals. A blank line closes the listing.

    Parameters
    ----------
    topic_term_dists : sequence of numpy.ndarray
        One score vector per topic.
    feature_names : sequence of str
        Term for every column position.
    n_top_words : int
        Number of terms shown per topic.
    """
    for topic_idx, topic in enumerate(topic_term_dists):
        print("Topic #%d:" % topic_idx)
        # argsort is ascending, so its tail read backwards gives the best terms
        best_first = topic.argsort()[:-n_top_words - 1:-1]
        entries = []
        for term_id in best_first:
            entries.append(feature_names[term_id] + '(' + '%.4f' % topic[term_id] + ')')
        print(" - ".join(entries))
    print()
    
def return_top_words(topic_term_dists, feature_names, n_top_words):
    """Return the ``n_top_words`` highest-scoring terms of every topic.

    Parameters
    ----------
    topic_term_dists : sequence of numpy.ndarray
        One score vector per topic.
    feature_names : sequence of str
        Term for every column position.
    n_top_words : int
        Number of terms kept per topic.

    Returns
    -------
    list of list of str
        For each topic, ``term(score)`` strings ordered from the best term
        down, with scores formatted to four decimals.
    """
    def _label(topic, term_id):
        return feature_names[term_id] + '(' + '%.4f' % topic[term_id] + ')'

    return [
        # argsort is ascending, so its tail read backwards gives the best terms
        [_label(topic, term_id) for term_id in topic.argsort()[:-n_top_words - 1:-1]]
        for topic in topic_term_dists
    ]

In [18]:
# Texts stored with the first loaded model form the corpus for the count matrix.
corpus = list(all_models_terms[0]['data'].text)
# Counts of 1- to 3-grams, limited to 10000 features
term_vectorizer = CountVectorizer(min_df=1, ngram_range=(1,3), max_features=10000)
# `tf` and `term_vectorizer` are attached by get_info to models without a 'tsnmf_model' entry.
tf = term_vectorizer.fit_transform(corpus)
# Theme names come from the second loaded model, which must hold a 'tsnmf_model' entry.
themes = all_models_terms[1]['tsnmf_model'].themes

In [19]:
def get_info(model):
    """Summarise a loaded model context as a flat dictionary of run settings.

    A context holding a ``'tsnmf_model'`` entry is described from that
    object. Any other context is described from its own keys; as a side
    effect it receives the notebook-level ``tf`` matrix and
    ``term_vectorizer`` under ``'tf'`` and ``'tf_vectorizer'``.

    Reads the notebook globals ``dataset_name``, ``tf`` and ``term_vectorizer``.

    Parameters
    ----------
    model : dict
        One loaded model context.

    Returns
    -------
    dict
        Keys ``dataset``, ``type``, ``train_split``, ``supervision``,
        ``separate_models``, ``subtopic_count``, ``theme_count``,
        ``train_set_size``, ``test_set_size``, ``bCool_init``, ``beta_loss``,
        ``term_vectorizer`` and ``solution``; ``'-'`` marks a setting that
        does not apply.
    """
    if 'tsnmf_model' not in model:
        # attach the shared count matrix and vectorizer to the context
        model['tf'] = tf
        model['tf_vectorizer'] = term_vectorizer
        documents = model['data']
        return {
            'dataset': dataset_name,
            'type': model['type'],
            'train_split': model['train_perc'],
            'supervision': '-',
            'separate_models': '-',
            'subtopic_count': 1,
            'theme_count': model['W'].shape[1],
            'train_set_size': len(documents),
            # rows flagged labeled == 0 are counted as the test set
            'test_set_size': len(documents[documents['labeled'] == 0]),
            'bCool_init': '-',
            'beta_loss': model['beta_loss'],
            'term_vectorizer': model['term_vectorizer'],
            'solution': model['solution'] 
        }

    tsnmf = model['tsnmf_model']
    return {
        'dataset': dataset_name,
        'type': 'TSNMF',
        'train_split': tsnmf.train_test_split[0],
        'supervision': tsnmf.supervision,
        'separate_models': 'separate' if tsnmf.separate_models else 'combined',
        # one column of the first W matrix is not counted as a subtopic
        'subtopic_count': model['W_list'][0].shape[1]-1,
        'theme_count': len(tsnmf.themes),
        'train_set_size': len(tsnmf.train_data),
        'test_set_size': len(tsnmf.test_data),
        'bCool_init': 'bCool' if tsnmf.bCool_init else 'random',
        'beta_loss': tsnmf.beta_loss,
        'term_vectorizer': tsnmf.term_vectorizer,
        'solution': '-'
    }

In [20]:
def get_term_scores(model, info_dict, purity_ratio, theme_id=0):
    """Compute topic-level and document-level term scores for one model context.

    The factor matrices are chosen by the kind of context:

    * no ``'tsnmf_model'`` entry: ``model['model'].components_`` and
      ``model['W']``; ``purity_ratio`` is ignored and 0 is used instead;
    * TSNMF with ``info_dict['separate_models'] == 'combined'``:
      ``model['tsnmf'].components_`` and ``model['W']``;
    * any other TSNMF context: entry ``theme_id`` of ``model['tsnmf_list']``
      and ``model['W_list']``.

    Parameters
    ----------
    model : dict
        Loaded model context; ``model['tf']`` supplies the document-term matrix.
    info_dict : dict
        Result of ``get_info`` for the same context.
    purity_ratio : float
        Weight of the purity scores (TSNMF contexts only).
    theme_id : int, optional
        Index of the per-theme model, used for separate TSNMF models.

    Returns
    -------
    tuple
        ``(term_scores, doc_term_scores)``.
    """
    if 'tsnmf_model' not in model:
        topic_terms, doc_topics, ratio = model['model'].components_, model['W'], 0
    elif info_dict['separate_models'] == 'combined':
        topic_terms, doc_topics, ratio = model['tsnmf'].components_, model['W'], purity_ratio
    else:
        topic_terms = model['tsnmf_list'][theme_id].components_
        doc_topics = model['W_list'][theme_id]
        ratio = purity_ratio

    term_scores = calculate_term_scores(topic_terms, purity_ratio=ratio)
    doc_term_scores = calculate_doc_term_scores(doc_topics, topic_terms, model['tf'], purity_ratio=ratio)

    return term_scores, doc_term_scores

In [83]:
# Write one Excel sheet per loaded model with its top terms.
purity_ratio = 0
term_count = 20
with pd.ExcelWriter('schwartz_terms_purity'+str(purity_ratio)+'.xlsx') as writer:
    for model in all_models_terms:
        result_df = pd.DataFrame()
        info_dict = get_info(model)

        # get_feature_names() was removed in scikit-learn 1.2 in favour of
        # get_feature_names_out(); use whichever the installed release offers.
        # The lookup happens once per model instead of once per call below.
        vectorizer = model['tf_vectorizer']
        if hasattr(vectorizer, 'get_feature_names_out'):
            feature_names = vectorizer.get_feature_names_out()
        else:
            feature_names = vectorizer.get_feature_names()

        if 'tsnmf_model' in model and info_dict['separate_models'] == 'separate':
            # Separate TSNMF models: one group of columns per theme
            for theme_id, theme in enumerate(themes):
                term_scores, _ = get_term_scores(model, info_dict, purity_ratio, theme_id)

                terms_list = return_top_words(term_scores, feature_names, term_count)
                # Sum / maximum over all topics of the theme except the last one
                terms_list_sum = return_top_words(np.sum(term_scores[:-1],0)[None, :], feature_names, term_count)[0]
                terms_list_max = return_top_words(np.max(term_scores[:-1],0)[None, :], feature_names, term_count)[0]

                for i, terms in enumerate(terms_list):
                    result_df.insert(len(result_df.columns), theme+'_'+str(i+1), terms)
                result_df.insert(len(result_df.columns), theme+'_sum', terms_list_sum)
                result_df.insert(len(result_df.columns), theme+'_max', terms_list_max)

        else:
            # Every other model: one column per topic, named after the theme
            # found at the same position in `themes`
            term_scores, _ = get_term_scores(model, info_dict, 0)
            terms_list = return_top_words(term_scores, feature_names, term_count)
            for subtopic_id, terms in enumerate(terms_list):
                result_df.insert(len(result_df.columns), themes[subtopic_id], terms)
        sheet_name = info_dict['type'] + '_' + str(info_dict['supervision']) + '_' + str(info_dict['separate_models']) + '_' + str(info_dict['train_split']) + '_' + str(info_dict['subtopic_count'])
        result_df.to_excel(writer, sheet_name=sheet_name, index=False)



In [46]:
# Build the top-term table of the first loaded model for display.
# `purity_ratio` and `themes` are taken from earlier cells.
term_count = 10
model = all_models_terms[0]
result_df = pd.DataFrame()
info_dict = get_info(model)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever the installed release offers.
vectorizer = model['tf_vectorizer']
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

if 'tsnmf_model' in model and info_dict['separate_models'] == 'separate':
    # Separate TSNMF models: one group of columns per theme
    for theme_id, theme in enumerate(themes):
        term_scores, _ = get_term_scores(model, info_dict, purity_ratio, theme_id)

        terms_list = return_top_words(term_scores, feature_names, term_count)
        # Sum / maximum over all topics of the theme except the last one
        terms_list_sum = return_top_words(np.sum(term_scores[:-1],0)[None, :], feature_names, term_count)[0]
        terms_list_max = return_top_words(np.max(term_scores[:-1],0)[None, :], feature_names, term_count)[0]

        for i, terms in enumerate(terms_list):
            result_df.insert(len(result_df.columns), theme+'_'+str(i+1), terms)
        result_df.insert(len(result_df.columns), theme+'_sum', terms_list_sum)
        result_df.insert(len(result_df.columns), theme+'_max', terms_list_max)

else:
    # Every other model: one column per topic, named through the model's
    # 'solution' list (position of the topic id inside it selects the theme)
    term_scores, _ = get_term_scores(model, info_dict, 0)
    terms_list = return_top_words(term_scores, feature_names, term_count)
    for subtopic_id, terms in enumerate(terms_list):
        result_df.insert(len(result_df.columns), themes[model['solution'].index(subtopic_id)], terms)





In [47]:
# Display the top-term table built in the previous cell.
result_df

Unnamed: 0,hedonism,self-direction,power,stimulation,security,conformity,benevolence,universalism,achievement,tradition
0,may(0.0074),marriage(0.0335),miss(0.0729),social(0.0307),environmental(0.0239),god(0.0149),theory(0.0182),peace(0.0255),work(0.0160),also(0.0083)
1,social(0.0074),samesex(0.0283),little(0.0706),party(0.0163),resource(0.0107),good(0.0115),creativity(0.0173),right(0.0212),status(0.0095),one(0.0079)
2,one(0.0071),state(0.0204),little miss(0.0670),political(0.0125),environment(0.0093),moral(0.0106),think(0.0173),security(0.0179),time(0.0081),world(0.0077)
3,people(0.0067),samesex marriage(0.0183),shyness(0.0125),socialist(0.0121),ecology(0.0089),one(0.0101),capital(0.0141),international(0.0142),energy(0.0081),use(0.0058)
4,individual(0.0065),law(0.0155),series(0.0114),socialism(0.0120),human(0.0082),law(0.0092),process(0.0125),state(0.0141),use(0.0076),include(0.0058)
5,also(0.0060),court(0.0128),book(0.0110),economic(0.0113),development(0.0082),virtue(0.0078),idea(0.0103),war(0.0138),income(0.0073),love(0.0056)
6,use(0.0051),yes(0.0116),shy(0.0106),society(0.0109),specie(0.0081),ethic(0.0078),truth(0.0094),human(0.0133),hour(0.0069),new(0.0055)
7,study(0.0048),independence(0.0110),title(0.0095),labour(0.0106),natural(0.0081),human(0.0059),knowledge(0.0093),unite(0.0114),system(0.0064),tradition(0.0051)
8,need(0.0046),union(0.0110),madame(0.0087),state(0.0104),use(0.0072),philosophy(0.0059),intelligence(0.0090),woman(0.0100),orgasm(0.0058),day(0.0047)
9,group(0.0045),unite(0.0105),french(0.0087),anarchist(0.0097),water(0.0065),justice(0.0055),reason(0.0085),nation(0.0099),class(0.0057),century(0.0047)
