# Thesis

In [1]:
import pandas as pd
import numpy as np
from time import time
import random

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.decomposition import LatentDirichletAllocation as LDA

import matplotlib.pylab as plt

import pickle

import libs.text_preprocess as tp
import libs.genetic_algorithm as ga
from libs.TSNMF_Class import TSNMF

# https://github.com/bmabey/pyLDAvis/blob/master/pyLDAvis/_prepare.py
import pyLDAvis.gensim
import pyLDAvis.sklearn
import pyLDAvis

import os

pyLDAvis.enable_notebook()



In [2]:
from nltk.corpus import reuters
from nltk.corpus import brown

## Read Data

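Run exactly one of the dataset cells below (Schwartz, Reuters or Brown). Each one builds a DataFrame named `data` with the columns:

* id
* theme
* text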

### Schwartz Dataset

In [3]:
def read_data(filepath):
    """Load a corpus JSON file into the notebook's common document frame.

    Parameters
    ----------
    filepath : str
        Path of a JSON file readable by ``pd.read_json`` that provides the
        columns ``title``, ``theme`` and ``text``.

    Returns
    -------
    pandas.DataFrame
        Columns ``id`` (the original ``title``), ``theme`` (each value wrapped
        in a one-element list) and ``text``. Rows whose ``text`` is the empty
        string are dropped; the remaining rows are sorted by ``theme`` and
        re-indexed from 0.
    """
    raw = pd.read_json(filepath)
    # .assign() returns a new frame, so the wrapped theme column is never
    # written into a filtered slice of `raw` (avoids SettingWithCopyWarning).
    documents = (
        raw[raw['text'] != ""]
        .assign(theme=lambda frame: frame['theme'].apply(lambda x: [x]))
        .sort_values('theme')
        .loc[:, ['title', 'theme', 'text']]
        .rename(columns={'title': 'id'})
        .reset_index(drop=True)
    )
    return documents

# Corpus source:
# https://github.com/bulentozel/OpenMaker/blob/master/Semantics/data/corpuses/schwartz.json
# Use either 'schwartz.json' or 'pruned_schwartz.json'; the path is relative,
# so the file must be reachable from the notebook's working directory.
filepath = 'pruned_schwartz.json'

# Binds the shared name `data` that the cells below operate on.
data = read_data(filepath)

### Reuters Dataset

In [100]:
# File ids of every document in the NLTK Reuters corpus
fileids = reuters.fileids()

# Per document: its list of categories and its words joined into one string
categories = [reuters.categories(file_id) for file_id in fileids]
text = [' '.join(reuters.words(file_id)) for file_id in fileids]

# Assemble the shared `data` frame (id, theme, text), ordered by theme
reuters_frame = pd.DataFrame({'id': fileids, 'theme': categories, 'text': text})
data = reuters_frame.sort_values('theme').reset_index(drop=True)

### Brown Dataset

In [14]:
# File ids of every document in the NLTK Brown corpus
fileids = brown.fileids()

# Per document: its list of categories and its words joined into one string
categories = [brown.categories(file_id) for file_id in fileids]
text = [' '.join(brown.words(file_id)) for file_id in fileids]

# Assemble the shared `data` frame (id, theme, text), ordered by theme
brown_frame = pd.DataFrame({'id': fileids, 'theme': categories, 'text': text})
data = brown_frame.sort_values('theme').reset_index(drop=True)

## Clean Texts
1. Fix bad wording: isn't -> is not
2. Clean and Tokenize -> min word len = 3, tokenize
3. Stopword Removal -> nltk.stopwords
4. Lemmatization -> WordNet Lemmatizer

In [15]:
# Replace every document with the result of the project helper tp.clean_text
# (libs.text_preprocess). The column is overwritten in place, so re-running
# this cell feeds already-cleaned text through the cleaner again.
data['text'] = data.text.apply(tp.clean_text)

In [16]:
# Preview the first rows of the cleaned corpus.
data.head()

Unnamed: 0,id,theme,text
0,cn05,[adventure],carry quirt start raise let fall dangle wrist ...
1,cn17,[adventure],guide divan turn face sit quietly star wide ey...
2,cn16,[adventure],rattle fender hum tire chatter gear charm melo...
3,cn14,[adventure],large dutch spring mine supply town appearance...
4,cn13,[adventure],shoulder could see max loose grin burnside glo...


In [17]:
# Keep only documents whose cleaned text contains more than 25 spaces
# (i.e. more than 26 space-separated tokens); work on an independent copy.
long_enough = data['text'].apply(lambda doc: doc.count(' ') > 25)
dataX = data.loc[long_enough].copy(deep=True)

In [18]:
# (documents before filtering, documents kept, distinct themes among the kept documents)
len(data), len(dataX), len(set(dataX['theme'].sum()))

(500, 500, 15)

In [19]:
# Count how many kept documents carry each theme label.
labels = {}
for doc_themes in dataX['theme']:
    for theme_name in doc_themes:
        labels[theme_name] = labels.get(theme_name, 0) + 1

#labels

In [20]:
# Apply the same "more than 25 spaces" length filter to `data` itself and
# renumber the surviving rows from 0. This rebinds `data`.
keep_mask = data['text'].apply(lambda doc: doc.count(' ') > 25)
data = data.loc[keep_mask].reset_index(drop=True).copy(deep=True)

## Analysis Tables

### NMF-LDA

In [32]:
import libs.genetic_algorithm as ga

In [33]:
# Column order of the NMF/LDA scoring table.
result_dict_keys = [
    # run configuration
    'type',
    'dataset',
    'batch_size',
    'theme_count',
    'train_set_size',
    'test_set_size',
    'train_split',
    'term_vectorizer',
    # score bounds and aggregated prediction scores
    'max_score_one_theme',
    'max_score',
    'prediction_score',
    'prediction_score_perc',
    'prediction_score_mean',
    'prediction_score_std',
]

In [34]:
# Build one summary row per pickled NMF/LDA batch and export the table to Excel.
# Each pickle is read as a list of run contexts (dicts) for one configuration;
# the per-run scores are averaged over that list.
pickle_dir = "all_pickles/pickles_reuters_nmflda"

# Rows are collected in a list and turned into a frame once at the end:
# DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
result_rows = []

# os.listdir gives no ordering guarantee; sort so the row order is reproducible.
for idx, pickle_name in enumerate(sorted(os.listdir(pickle_dir))):
    print(idx, pickle_name)
    # NOTE(review): pickle.load can execute arbitrary code -- only point this
    # at pickles produced by this project.
    with open(os.path.join(pickle_dir, pickle_name), "rb") as pickle_in:
        nmflda_context_list = pickle.load(pickle_in)

    n_runs = len(nmflda_context_list)
    # Configuration columns are taken from the first run of the batch.
    first_context = nmflda_context_list[0]
    first_data = first_context['data']

    themes = sorted(set(first_data['theme'].sum()))
    n_themes = len(themes)
    result_dict = {
        'type': first_context['type'],
        # Dataset name = file name up to the first underscore.
        'dataset': pickle_name.split('_', 1)[0],
        'batch_size': n_runs,
        'theme_count': n_themes,
        'train_set_size': len(first_data[first_data.labeled == 1]),
        'test_set_size': len(first_data[first_data.labeled == 0]),
        'train_split': first_context['train_perc'],
        'term_vectorizer': first_context['term_vectorizer'],
        'max_score_one_theme': np.log(n_themes),
        'max_score': 0,
        'prediction_score': 0,
        'prediction_score_perc': 0,
        'prediction_score_mean': 0,
        'prediction_score_std': 0
    }

    for nmflda_context in nmflda_context_list:
        run_data = nmflda_context['data']
        test_docs = run_data[run_data.labeled == 0]

        run_scores, _ = ga.calculateTestScore(nmflda_context['solution'], test_docs, nmflda_context['W'], themes)
        run_scores = np.asarray(run_scores)

        # Upper bound for this run: a document with k themes contributes
        # sum over rank = 1..k of log(n_themes) - log(rank).
        max_score = sum(sum(np.log(n_themes) - np.log(range(1, len(aps) + 1))) for aps in test_docs['theme'])

        # Each run adds 1/n_runs of its value, so the totals are batch averages.
        result_dict['max_score'] += max_score / n_runs
        result_dict['prediction_score'] += run_scores.sum() / n_runs
        result_dict['prediction_score_perc'] += 100 * run_scores.sum() / max_score / n_runs
        result_dict['prediction_score_mean'] += run_scores.mean() / n_runs
        result_dict['prediction_score_std'] += run_scores.std() / n_runs

    result_rows.append(result_dict)

if result_rows:
    result_df = pd.DataFrame(result_rows)
    # The output file is named after the dataset of the last processed pickle.
    result_df.to_excel(result_rows[-1]['dataset'] + '_nmflda_scoring.xlsx')
else:
    # Empty directory: keep an empty table with the expected columns and skip
    # the export instead of failing on an undefined loop variable.
    result_df = pd.DataFrame(columns=result_dict_keys)
    print('No pickles found in ' + pickle_dir + '; nothing exported.')

0 reuters_lda_tf_10.pickle
1 reuters_lda_tf_20.pickle
2 reuters_lda_tf_30.pickle
3 reuters_lda_tf_40.pickle
4 reuters_lda_tf_50.pickle
5 reuters_lda_tf_60.pickle
6 reuters_lda_tf_70.pickle
7 reuters_lda_tf_80.pickle
8 reuters_lda_tf_90.pickle
9 reuters_nmf_kl_tf_10.pickle
10 reuters_nmf_kl_tf_20.pickle
11 reuters_nmf_kl_tf_30.pickle
12 reuters_nmf_kl_tf_40.pickle
13 reuters_nmf_kl_tf_50.pickle
14 reuters_nmf_kl_tf_60.pickle
15 reuters_nmf_kl_tf_70.pickle
16 reuters_nmf_kl_tf_80.pickle
17 reuters_nmf_kl_tf_90.pickle


### TSNMF Models

In [21]:
def calculate_test_score(tsnmf_context_list):
    """Score how each test document's true themes rank in its weight vector.

    For every run context, row ``ind`` of ``W_test_high`` is paired with row
    ``ind`` of ``tsnmf_model.test_data``. The row is argsorted and, for each
    theme listed in the document's ``theme`` entry, the position of that
    theme's index in the ordering is turned into three scores, which are
    summed per document.

    Parameters
    ----------
    tsnmf_context_list : list of dict
        One dict per run with the keys ``'tsnmf_model'`` (an object exposing
        ``themes`` and ``test_data``) and ``'W_test_high'``. The ``themes``
        list of the first run is used to index the weights of every run.
        Assumes each row of ``W_test_high`` is a 1-D array -- TODO confirm.

    Returns
    -------
    tuple of numpy.ndarray
        ``(scores, log_scores, rev_log_scores)``; each is ``np.array`` applied
        to a list holding one per-document vector per run. Per document:

        * ``scores``: sum of the 1-based ascending ranks of its themes
          (the largest weight has the highest rank);
        * ``log_scores``: sum of the logs of those ranks;
        * ``rev_log_scores``: sum of ``log(len(themes)) - log(descending rank)``
          (the largest weight has descending rank 1).

    Raises
    ------
    ValueError
        If a document theme is not in ``themes`` (from ``themes.index``).
    IndexError
        If a theme's index does not occur in the argsort of a weight row.
    """
    themes = tsnmf_context_list[0]['tsnmf_model'].themes
    log_theme_count = np.log(len(themes))

    prediction_scores = []
    prediction_scores_log = []
    prediction_scores_rev_log = []
    for tsnmf_context in tsnmf_context_list:
        test_data = tsnmf_context['tsnmf_model'].test_data

        doc_scores = []
        doc_scores_log = []
        doc_scores_rev_log = []
        for ind, doc_wth in enumerate(tsnmf_context['W_test_high']):
            # The ordering depends only on the document, so sort once per row
            # rather than once per theme and per score.
            order = doc_wth.argsort()
            n_weights = len(order)

            score = 0
            score_log = 0
            score_rev_log = 0
            for theme in test_data.iloc[ind]['theme']:
                theme_id = themes.index(theme)
                asc_rank = np.argwhere(order == theme_id)[0][0] + 1
                # Position in the reversed ordering, 1-based: the entry at
                # ascending position p sits at n - 1 - p when reversed.
                desc_rank = n_weights - asc_rank + 1

                score += asc_rank
                score_log += np.log(asc_rank)
                score_rev_log += log_theme_count - np.log(desc_rank)

            doc_scores.append(score)
            doc_scores_log.append(score_log)
            doc_scores_rev_log.append(score_rev_log)

        prediction_scores.append(np.array(doc_scores))
        prediction_scores_log.append(np.array(doc_scores_log))
        prediction_scores_rev_log.append(np.array(doc_scores_rev_log))

    return np.array(prediction_scores), np.array(prediction_scores_log), np.array(prediction_scores_rev_log)

In [31]:
# Column order of the TSNMF scoring table. The list covers every key that the
# per-batch result dict in the scoring cell fills in, including the five
# reverse-log columns at the end.
result_dict_keys = ['dataset','batch_size','theme_count','train_set_size','test_set_size','train_split','supervision','separate_models','bCool_init',
                    'beta_loss','term_vectorizer','max_score_one_theme','max_score','prediction_score','prediction_score_perc','prediction_score_mean',
                    'prediction_score_std','max_log_score_one_theme','max_log_score','prediction_log_score','prediction_log_score_perc',
                    'prediction_log_score_mean','prediction_log_score_std','max_rev_log_score','prediction_rev_log_score',
                    'prediction_rev_log_score_perc','prediction_rev_log_score_mean','prediction_rev_log_score_std']

In [32]:
# Build one summary row per pickled TSNMF batch and export the table to Excel.
# Each pickle is read as a list of run contexts (dicts) for one configuration;
# the per-run scores are averaged over that list.
pickle_dir = "all_pickles/pickles_schwartz_topic1"

# Rows are collected in a list and turned into a frame once at the end:
# DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
result_rows = []

# os.listdir gives no ordering guarantee; sort so the row order is reproducible.
for idx, pickle_name in enumerate(sorted(os.listdir(pickle_dir))):
    print(idx, pickle_name)
    # NOTE(review): pickle.load can execute arbitrary code -- only point this
    # at pickles produced by this project.
    with open(os.path.join(pickle_dir, pickle_name), "rb") as pickle_in:
        tsnmf_context_list = pickle.load(pickle_in)

    prediction_scores, prediction_scores_log, prediction_scores_rev_log = calculate_test_score(tsnmf_context_list)

    n_runs = len(tsnmf_context_list)
    # Configuration columns are taken from the model of the first run.
    first_model = tsnmf_context_list[0]['tsnmf_model']
    n_themes = len(first_model.themes)

    result_dict = {
        # Dataset name = file name up to the first underscore.
        'dataset': pickle_name.split('_', 1)[0],
        'batch_size': n_runs,
        'theme_count': n_themes,
        'train_set_size': len(first_model.train_data),
        'test_set_size': len(first_model.test_data),
        'train_split': first_model.train_test_split[0],
        'supervision': first_model.supervision,
        'separate_models': 'separate' if first_model.separate_models else 'combined',
        'bCool_init': 'bCool' if first_model.bCool_init else 'random',
        'beta_loss': first_model.beta_loss,
        'term_vectorizer': first_model.term_vectorizer,
        'max_score_one_theme': n_themes,
        'max_score': 0,
        'prediction_score': 0,
        'prediction_score_perc': 0,
        'prediction_score_mean': 0,
        'prediction_score_std': 0,
        'max_log_score_one_theme': np.log(n_themes),
        'max_log_score': 0,
        'prediction_log_score': 0,
        'prediction_log_score_perc': 0,
        'prediction_log_score_mean': 0,
        'prediction_log_score_std': 0,
        'max_rev_log_score': 0,
        'prediction_rev_log_score': 0,
        'prediction_rev_log_score_perc': 0,
        'prediction_rev_log_score_mean': 0,
        'prediction_rev_log_score_std': 0
    }

    for i in range(len(prediction_scores)):
        test_themes = tsnmf_context_list[i]['tsnmf_model'].test_data['theme']

        # Upper bounds for this run. A document with k themes scores best when
        # they take the k top ranks: n_themes-k+1 .. n_themes for the ascending
        # rank scores, 1 .. k for the reverse-log score.
        max_score = sum(sum(range(n_themes - len(aps) + 1, n_themes + 1)) for aps in test_themes)
        max_log_score = sum(sum(np.log(range(n_themes - len(aps) + 1, n_themes + 1))) for aps in test_themes)
        max_rev_log_score = sum(sum(np.log(n_themes) - np.log(range(1, len(aps) + 1))) for aps in test_themes)

        # The three score families share one accumulation pattern; `infix`
        # selects the column group ('', 'log_' or 'rev_log_').
        score_families = (
            ('', prediction_scores[i], max_score),
            ('log_', prediction_scores_log[i], max_log_score),
            ('rev_log_', prediction_scores_rev_log[i], max_rev_log_score),
        )
        for infix, run_scores, run_max in score_families:
            score_key = 'prediction_' + infix + 'score'
            # Each run adds 1/n_runs of its value, so totals are batch averages.
            result_dict['max_' + infix + 'score'] += run_max / n_runs
            result_dict[score_key] += run_scores.sum() / n_runs
            result_dict[score_key + '_perc'] += 100 * run_scores.sum() / run_max / n_runs
            result_dict[score_key + '_mean'] += run_scores.mean() / n_runs
            result_dict[score_key + '_std'] += run_scores.std() / n_runs

    result_rows.append(result_dict)

if result_rows:
    result_df = pd.DataFrame(result_rows)
    # The output file is named after the dataset of the last processed pickle.
    result_df.to_excel(result_rows[-1]['dataset'] + '_scoring.xlsx')
else:
    # Empty directory: keep an empty table with the expected columns and skip
    # the export instead of failing on an undefined loop variable.
    result_df = pd.DataFrame(columns=result_dict_keys)
    print('No pickles found in ' + pickle_dir + '; nothing exported.')

0 schwartz_topic1_semi_supervised_combined_bCool_kullback-leibler_tf_10.pickle
1 schwartz_topic1_semi_supervised_combined_bCool_kullback-leibler_tf_20.pickle
2 schwartz_topic1_semi_supervised_combined_bCool_kullback-leibler_tf_30.pickle
3 schwartz_topic1_semi_supervised_combined_bCool_kullback-leibler_tf_40.pickle
4 schwartz_topic1_semi_supervised_combined_bCool_kullback-leibler_tf_50.pickle
5 schwartz_topic1_semi_supervised_combined_bCool_kullback-leibler_tf_60.pickle
6 schwartz_topic1_semi_supervised_combined_bCool_kullback-leibler_tf_70.pickle
7 schwartz_topic1_semi_supervised_combined_bCool_kullback-leibler_tf_80.pickle
8 schwartz_topic1_semi_supervised_combined_bCool_kullback-leibler_tf_90.pickle
9 schwartz_topic1_semi_supervised_combined_random_kullback-leibler_tf_10.pickle
10 schwartz_topic1_semi_supervised_combined_random_kullback-leibler_tf_20.pickle
11 schwartz_topic1_semi_supervised_combined_random_kullback-leibler_tf_30.pickle
12 schwartz_topic1_semi_supervised_combined_ran