# Thesis

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from time import time
import random

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.decomposition import LatentDirichletAllocation as LDA

import matplotlib.pylab as plt

import libs.text_preprocess as tp
import libs.genetic_algorithm as ga
from libs.TSNMF_Class import TSNMF

import pickle

# https://github.com/bmabey/pyLDAvis/blob/master/pyLDAvis/_prepare.py
import pyLDAvis.gensim
import pyLDAvis.sklearn
import pyLDAvis

pyLDAvis.enable_notebook()



In [3]:
from nltk.corpus import reuters
from nltk.corpus import brown

## Read Data

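Run one of the dataset cells below (Schwartz, Reuters or Brown); each cell overwrites the `data` DataFrame used by the rest of the notebook. The resulting frame has the following fields: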

* index
* theme
* text

### Schwartz Dataset

In [4]:
def read_data(filepath):
    """Load a themed text corpus from a JSON file into the notebook's frame layout.

    Rows whose ``text`` is the empty string are dropped, every ``theme`` value
    is wrapped in a one-element list, and the rows are sorted by ``theme``.

    Parameters
    ----------
    filepath : str or path-like
        Passed straight to ``pandas.read_json``. The file must provide the
        ``title``, ``theme`` and ``text`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``id`` (the former ``title``), ``theme`` and ``text``, with a
        fresh 0..n-1 index.
    """
    raw = pd.read_json(filepath)

    # Work on an explicit copy of the filtered rows: assigning a column on a
    # boolean-indexed slice raises pandas' SettingWithCopyWarning.
    docs = raw[raw['text'] != ""].copy()
    docs['theme'] = docs['theme'].apply(lambda x: [x])

    docs = docs.sort_values('theme')
    docs = docs[['title', 'theme', 'text']].rename(columns={'title': 'id'})

    return docs.reset_index(drop=True)

# Corpus source: https://github.com/bulentozel/OpenMaker/blob/master/Semantics/data/corpuses/schwartz.json
# Point `filepath` at either schwartz.json or pruned_schwartz.json (path is relative to the working directory).
filepath = 'pruned_schwartz.json'

# Load the corpus into the notebook-wide `data` frame (columns: id, theme, text).
data = read_data(filepath)

### Reuters Dataset

In [19]:
# Every document id of the NLTK Reuters corpus
fileids = reuters.fileids()

# Per-document category lists and space-joined token text, aligned with `fileids`
categories = [reuters.categories(doc_id) for doc_id in fileids]
text = [' '.join(reuters.words(doc_id)) for doc_id in fileids]

# Assemble the notebook-wide `data` frame, ordered by theme with a fresh index
data = (
    pd.DataFrame({'id': fileids, 'theme': categories, 'text': text})
    .sort_values('theme')
    .reset_index(drop=True)
)

### Brown Dataset

In [4]:
# Every document id of the NLTK Brown corpus
fileids = brown.fileids()

# Per-document category lists and space-joined token text, aligned with `fileids`
categories = [brown.categories(doc_id) for doc_id in fileids]
text = [' '.join(brown.words(doc_id)) for doc_id in fileids]

# Assemble the notebook-wide `data` frame, ordered by theme with a fresh index
data = (
    pd.DataFrame({'id': fileids, 'theme': categories, 'text': text})
    .sort_values('theme')
    .reset_index(drop=True)
)

## Clean Texts
1. Fix bad wording: isn't -> is not
2. Clean and Tokenize -> min word len = 3, tokenize
3. Stopword Removal -> nltk.stopwords
4. Lemmatization -> WordNet Lemmatizer

In [5]:
# Run every document through the project's tp.clean_text helper
# (presumably the cleaning steps listed in the markdown above - confirm in libs/text_preprocess).
data['text'] = data['text'].apply(tp.clean_text)

In [6]:
# Preview the first rows of `data` after cleaning (shown as the cell's output).
data.head()

Unnamed: 0,id,theme,text
0,Ascribed status,[achievement],ascribe status part series political legal ant...
1,Relational capital,[achievement],relational capital redirect relational capital...
2,Human resources,[achievement],human resource us see human resource disambigu...
3,Intellectual capital,[achievement],intellectual capital intellectual capital inta...
4,Expenses versus Capital Expenditures,[achievement],expense versus capital expenditure redirect ca...


In [7]:
# Keep only documents whose text contains more than 25 spaces, renumber the
# rows and detach the result from the unfiltered frame.
data = (
    data[data['text'].apply(lambda doc: doc.count(' ') > 25)]
    .reset_index(drop=True)
    .copy(deep=True)
)

In [8]:
# Number of documents currently held in `data`.
len(data)

433

## Model Generator

In [10]:
def get_pickle_name(tsnmf_model, dataset_name):
    """Build the underscore-separated base name of the pickle for a model setup.

    Layout::

        <dataset>_<supervision>_<separate|combined>_<bCool|random>_<beta_loss>_<term_vectorizer>_<train %>

    Parameters
    ----------
    tsnmf_model : object
        Must expose ``supervision``, ``separate_models``, ``bCool_init``,
        ``beta_loss``, ``term_vectorizer`` and ``train_test_split``. The first
        entry of ``train_test_split`` is the training fraction.
    dataset_name : str
        Prefix identifying the dataset or experiment.

    Returns
    -------
    str
        The file name without directory or extension.
    """
    parts = [
        dataset_name,
        tsnmf_model.supervision,
        'separate' if tsnmf_model.separate_models else 'combined',
        'bCool' if tsnmf_model.bCool_init else 'random',
        tsnmf_model.beta_loss,
        tsnmf_model.term_vectorizer,
        # Round away binary floating-point noise before truncating: 0.29*100 is
        # 28.999999999999996, which a bare int() would label as 28.
        str(int(round(tsnmf_model.train_test_split[0] * 100, 6))),
    ]
    return '_'.join(parts)

def save_to_pickle(tsnmf_context_list, pickle_name, pickle_dir="all_pickles/pickles_schwartz_terms/"):
    """Pickle ``tsnmf_context_list`` to ``<pickle_dir>/<pickle_name>.pickle``.

    Parameters
    ----------
    tsnmf_context_list : object
        The object to serialise (the notebook passes a list of context dicts).
    pickle_name : str
        File name without the ``.pickle`` extension.
    pickle_dir : str, optional
        Target directory; defaults to the location this notebook has always
        written to. The directory must already exist.

    Raises
    ------
    OSError
        If the target file cannot be opened for writing.
    """
    import os  # local import keeps this cell self-contained

    pickle_path = os.path.join(pickle_dir, pickle_name + ".pickle")
    # The context manager closes the handle even if pickle.dump raises.
    with open(pickle_path, "wb") as pickle_out:
        pickle.dump(tsnmf_context_list, pickle_out)

In [15]:
# Full experiment grid: the model-generator loop trains every combination of the
# options below, once per seed in `random_state_list`.
supervision_list = ['semi_supervised', 'supervised']
separate_models_list = [False, True]
bCool_init_list = [False, True]
beta_loss_list = ['kullback-leibler']
term_vectorizer_list = ['tf']
# Training fractions; the loop passes [fraction, 1 - fraction] as the train/test split.
train_test_split_list = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
random_state_list = [6]

In [11]:
# Single-configuration run: this cell rebinds the same names as the full grid,
# so whichever of the two cells was executed last decides what the loop trains.
supervision_list = ['supervised']
separate_models_list = [True]
bCool_init_list = [True]
beta_loss_list = ['kullback-leibler']
term_vectorizer_list = ['tf']
# A fraction of 1 makes the loop pass [1, 0] as the train/test split.
train_test_split_list = [1]
random_state_list = [1]

In [18]:
from itertools import product

# Train one TSNMF model per hyper-parameter combination and random seed.
# All seeds of a combination share one pickle name (the name does not contain
# the seed), so their contexts are collected in a list and saved together.
# product() iterates in the same order as the equivalent nested for-loops.
for supervision, separate_models, bCool_init, beta_loss, term_vectorizer, train_perc in product(
        supervision_list, separate_models_list, bCool_init_list,
        beta_loss_list, term_vectorizer_list, train_test_split_list):
    tsnmf_context_list = []
    for random_state in random_state_list:
        tsnmf_model = TSNMF(data = data, supervision = supervision, separate_models = separate_models, bCool_init = bCool_init,
                            train_test_split = [train_perc, 1-train_perc], n_topics = 3, n_terms = 10000,background_for_theme = True,
                            background_scoring = True, beta_loss = beta_loss, term_vectorizer = term_vectorizer, random_state = random_state)

        pickle_name = get_pickle_name(tsnmf_model, 'schwartz_topic')
        t0 = time()
        print(pickle_name, random_state)

        # The tfidf + frobenius combination is deliberately not trained
        # (the notebook does not record the reason).
        if term_vectorizer == 'tfidf' and beta_loss == 'frobenius':
            print("continue")
            continue

        tsnmf_model = tsnmf_model.split_train_test()

        # Alternative split that was used for Reuters:
        # if train_perc > 0.25 and train_perc < 0.75:
        #     tsnmf_model = tsnmf_model.split_train_test()
        # else:
        #     tsnmf_model = tsnmf_model.split_train_test_forced()

        print("Training.. ", end='')
        t1 = time()
        tsnmf_context = tsnmf_model.fit()
        print("%0.2fs - " % (time() - t1), end='')

        # The evaluation call is currently disabled, so the "Testing" timing
        # printed below measures no work.
        print("Testing.. ", end='')
        t1 = time()
        # tsnmf_context = tsnmf_model.evaluate_test_corpus(tsnmf_context)
        print("%0.2fs - " % (time() - t1), end='')

        # Keep the fitted model next to its results so the pickle is self-contained.
        tsnmf_context['tsnmf_model'] = tsnmf_model
        tsnmf_context_list.append(tsnmf_context)
        print("Done:  %0.2fs" % (time() - t0))

    # Save only when at least one seed was trained. This covers the skipped
    # tfidf/frobenius combination and also an empty `random_state_list`, where
    # `pickle_name` would otherwise be unbound or left over from a previous
    # combination.
    if tsnmf_context_list:
        save_to_pickle(tsnmf_context_list, pickle_name)

schwartz_topic_supervised_separate_bCool_kullback-leibler_tf_100 1
Training.. 51.84s - Testing.. 0.00s - Done:  51.86s


## Classic NMF-LDA

* Train NMF and LDA using all the data points, then evaluate the scores on the test data.
* An alternative would be to train the model on the training data only and use `transform` to obtain the W matrix for the test data.

In [54]:
import libs.genetic_algorithm as ga

In [55]:
# Classic LDA baseline on term counts, saved as 'reuters_lda_tf_<train %>' pickles.
# NOTE: this cell rebinds the notebook-wide `train_test_split_list` and
# `random_state_list`, which the TSNMF grid cell also reads.

# Sorted set of all theme labels; a set comprehension avoids the quadratic list
# concatenation of `data['theme'].sum()`.
themes = sorted({theme for doc_themes in data['theme'] for theme in doc_themes})
train_test_split_list = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
#random_state_list = [1,2,3,4,5]
random_state_list = [5,6]

for train_perc in train_test_split_list:
    pickle_context_list = []
    # Round away float noise before truncating so that e.g. 0.29 is labelled 29, not 28.
    train_percent_label = str(int(round(train_perc*100, 6)))
    #pickle_name = 'reuters_nmf_kl_tf_'+train_percent_label
    pickle_name = 'reuters_lda_tf_'+train_percent_label
    for random_state in random_state_list:
        print(pickle_name, random_state,  end=' - ')
        t0 = time()

        # TSNMF is used here only for its train/test splitting.
        tsnmf_model = TSNMF(data = data, supervision = 'semi_supervised', train_test_split = [train_perc, 1-train_perc], random_state = random_state)

        # Mid-range fractions use the regular splitter, extreme ones the forced variant.
        if 0.25 < train_perc < 0.75:
            tsnmf_model = tsnmf_model.split_train_test()
        else:
            tsnmf_model = tsnmf_model.split_train_test_forced()

        train_data = tsnmf_model.train_data.copy(deep=True)
        test_data = tsnmf_model.test_data.copy(deep=True)  # not used below; kept as a notebook variable

        corpus = list(train_data.text)
        term_vectorizer = CountVectorizer(min_df=1, ngram_range=(1,3), max_features=10000)
        #term_vectorizer = TfidfVectorizer(min_df=1, ngram_range=(1,3), max_features=10000)
        tf = term_vectorizer.fit_transform(corpus)

        #model = NMF(n_components = len(themes), solver='mu', beta_loss='kullback-leibler', alpha=.1, l1_ratio=.5)
        #W = model.fit_transform(X=tf)

        # Seed the model as well: previously `random_state` only drove the split,
        # so the run stored under a given seed was not reproducible.
        model = LDA(n_components=len(themes), random_state=random_state)
        W = model.fit_transform(tf)

        # ga.run_ga is a project helper; it receives the labeled training rows,
        # the document-topic matrix W and the theme list.
        population, solution, solution_ind, solution_obj = ga.run_ga(train_data[train_data.labeled == 1], W, themes, stopGeneration=100)
        print(solution_obj, time()-t0)

        pickle_context = {
            'type': 'LDA',
            'model': model,
            'W': W,
            'solution': solution,
            'solution_obj': solution_obj,
            'data': train_data,
            'beta_loss': 'none',
            'term_vectorizer': 'tf',
            'train_perc': train_perc,
            'random_state': random_state
        }
        pickle_context_list.append(pickle_context)

    # The context manager closes the file even if pickling fails.
    with open("all_pickles/pickles_reuters_nmflda2/"+pickle_name+".pickle","wb") as pickle_out:
        pickle.dump(pickle_context_list, pickle_out)

reuters_lda_tf_10 5 - 90
2022.2671098588698 274.8381748199463
reuters_lda_tf_10 6 - 90
1611.6321233406484 282.2373912334442
reuters_lda_tf_20 5 - 90
3816.056392922948 363.3355665206909
reuters_lda_tf_20 6 - 90
2868.3257246428034 373.8933324813843
reuters_lda_tf_30 5 - 4547.532682205725 452.85126280784607
reuters_lda_tf_30 6 - 4989.437844178909 459.78307580947876
reuters_lda_tf_40 5 - 7462.52380374062 564.7877328395844
reuters_lda_tf_40 6 - 5339.851872636042 551.7591743469238
reuters_lda_tf_50 5 - 7498.295282242676 632.3538200855255
reuters_lda_tf_50 6 - 7528.2774544532185 630.1147961616516
reuters_lda_tf_60 5 - 8959.569442807733 717.9123907089233
reuters_lda_tf_60 6 - 9130.884343570327 745.151282787323
reuters_lda_tf_70 5 - 12171.159545015074 832.2895488739014
reuters_lda_tf_70 6 - 10777.684544381093 846.8187634944916
reuters_lda_tf_80 5 - 90
14706.876212814364 888.6425547599792
reuters_lda_tf_80 6 - 90
12900.537466963138 876.054221868515
reuters_lda_tf_90 5 - 90
16357.570602643982 944

In [19]:
# Classic NMF (KL loss) baseline on term counts, saved as 'schwartz_nmf_kl_tf_<train %>' pickles.
# NOTE: this cell rebinds the notebook-wide `train_test_split_list` and
# `random_state_list`, which the TSNMF grid cell also reads.

# Sorted set of all theme labels; a set comprehension avoids the quadratic list
# concatenation of `data['theme'].sum()`.
themes = sorted({theme for doc_themes in data['theme'] for theme in doc_themes})
train_test_split_list = [1]
#random_state_list = [1,2,3,4,5]
random_state_list = [1]

for train_perc in train_test_split_list:
    pickle_context_list = []
    # Round away float noise before truncating so that e.g. 0.29 is labelled 29, not 28.
    train_percent_label = str(int(round(train_perc*100, 6)))
    pickle_name = 'schwartz_nmf_kl_tf_'+train_percent_label
#     pickle_name = 'schwartz_lda_tf_'+train_percent_label
    for random_state in random_state_list:
        print(pickle_name, random_state,  end=' - ')
        t0 = time()

        # TSNMF is used here only for its train/test splitting.
        tsnmf_model = TSNMF(data = data, supervision = 'semi_supervised', train_test_split = [train_perc, 1-train_perc], random_state = random_state)

        # Mid-range fractions use the regular splitter, extreme ones the forced variant.
        if 0.25 < train_perc < 0.75:
            tsnmf_model = tsnmf_model.split_train_test()
        else:
            tsnmf_model = tsnmf_model.split_train_test_forced()

        train_data = tsnmf_model.train_data.copy(deep=True)
        test_data = tsnmf_model.test_data.copy(deep=True)  # not used below; kept as a notebook variable

        corpus = list(train_data.text)
        term_vectorizer = CountVectorizer(min_df=1, ngram_range=(1,3), max_features=10000)
        #term_vectorizer = TfidfVectorizer(min_df=1, ngram_range=(1,3), max_features=10000)
        tf = term_vectorizer.fit_transform(corpus)

        # Seed the model as well: previously `random_state` only drove the split,
        # so the run stored under a given seed was not reproducible.
        # NOTE(review): recent scikit-learn releases replaced NMF's `alpha` with
        # `alpha_W`/`alpha_H` - confirm against the pinned version.
        model = NMF(n_components = len(themes), solver='mu', beta_loss='kullback-leibler', alpha=.1, l1_ratio=.5,
                    random_state=random_state)
        W = model.fit_transform(X=tf)

#         model = LDA(n_components=len(themes))
#         W = model.fit_transform(tf)

        # ga.run_ga is a project helper; it receives the labeled training rows,
        # the document-topic matrix W and the theme list.
        population, solution, solution_ind, solution_obj = ga.run_ga(train_data[train_data.labeled == 1], W, themes, stopGeneration=100)
        print(solution_obj, time()-t0)

        pickle_context = {
            'type': 'NMF',
            'model': model,
            'W': W,
            'solution': solution,
            'solution_obj': solution_obj,
            'data': train_data,
            # NOTE(review): stored as 'none' although the model above is fitted with
            # beta_loss='kullback-leibler'; looks copied from the LDA cell. Left
            # unchanged in case downstream readers key on this value.
            'beta_loss': 'none',
            'term_vectorizer': 'tf',
            'train_perc': train_perc,
            'random_state': random_state
        }
        pickle_context_list.append(pickle_context)

    # The context manager closes the file even if pickling fails.
    with open("all_pickles/pickles_schwartz_terms/"+pickle_name+".pickle","wb") as pickle_out:
        pickle.dump(pickle_context_list, pickle_out)

schwartz_nmf_kl_tf_100 1 - 529.226235753119 43.60194253921509
