In [1]:
import numpy as np
import json
import glob
import pandas

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Spacy
import spacy
from nltk.corpus import stopwords

# Vis
import pyLDAvis
import pyLDAvis.gensim_models
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


In [2]:
def load_data(file):
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)
    return (data);

def write_data(file, data):
    with open(file, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4);


In [3]:
# Stop words are basically words that are not really useful in the topic model
# i.e. words that you want to eliminate from consideration
stopwords = stopwords.words("english")
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [4]:
data = load_data("ushmm_dn.json")["texts"]
print(data[0][0:90]) # Prints out first text, char 0-89

 My name David Kochalski. I was born in a small town called , and I was born May 5, 1928. 


In [6]:
# Reducing words to their root form (lemma) using spacy or nltk to grab roots
# Specify allowed_postags for spacy

# Takes a list of texts i.e. [text1, text2...] and returns lemmatised [text1..]
# Allowed_post_tages -> Basically default arguments in python 
# This is a list initalised that is used to filter tokens 
# pos_ tag marks category of word token belongs to i.e. only append nouns,adj,verbs,and adverbs 
# e.g. conjunctions (but, and, because), and pronouns (he/she) is ignored
def lemmatization(texts, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]):

    # Spacy load creates an an nlp parser i.e. function pointer 
    spacy.load
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    texts_out = []

    for text in texts:
        doc = nlp(text) # Create document from the raw text/string
        new_text = []
        for token in doc:
            if token.pos_ in allowed_postags and token.pos_ not in stopwords:
               new_text.append(token.lemma_) # Appends only lemma to new_text
        final = " ".join(new_text)
        texts_out.append(final)
    return (texts_out);

lemmatized_texts = lemmatization(data)
print(lemmatized_texts[0][0:90])

name bear small town call bear very hard work child father mother small mill flour buckwhe


In [7]:
# Simple preprocess converts documents to list of lower case tokens ignoring ones
# which are too short or too long.
def gen_words(texts):
    final = []
    for text in texts:
        new = gensim.utils.simple_preprocess(text, deacc=True);
        final.append(new)
    return (final)

data_words = gen_words(lemmatized_texts)

print(data_words[0][0:20])


['name', 'bear', 'small', 'town', 'call', 'bear', 'very', 'hard', 'work', 'child', 'father', 'mother', 'small', 'mill', 'flour', 'buckwheat', 'prosperous', 'comfortable', 'go', 'school']


In [8]:
# BIGRAMS AND TRIGRAMS

# min_count - ignore all words/bigrams with total collected count lower than this value.
# threshold - score for forming phrases play around with this -> related to scoring function
bigrams_phrases = gensim.models.Phrases(data_words, min_count=5, threshold=10) 

# First arg checks ensures no overlap between bigraph phrases and trigram phrases.
trigram_phrases = gensim.models.Phrases(bigrams_phrases[data_words], threshold=10)

# Create function pointer i.e. phraser that converts document to list of bigram
bigram = gensim.models.phrases.Phraser(bigrams_phrases)
trigram = gensim.models.phrases.Phraser(trigram_phrases)

def make_bigrams(texts):
    return([bigram[doc] for doc in texts])

def make_trigrams(texts):
    return ([trigram[bigram[doc]] for doc in texts])

data_bigrams = make_bigrams(data_words)
data_bigrams_trigrams = make_trigrams(data_bigrams)

# print(data_bigrams_trigrams[0])

new_words = []
for word in data_bigrams_trigrams[0]:
    if '_' in word:
        new_words.append(word)
print(new_words)


['name_bear', 'small_town', 'very_hard', 'public_school', 'morning_afternoon', 'late_night', 'small_town', 'small_town', 'other_word', 'non_jewish', 'very_much', 'public_school', 'more_less', 'other_word', 'other_word', 'other_word', 'polish_government', 'soon_later', 'few_day', 'about_week', 'very_anxious', 'other_word', 'very_friendly', 'so_kind_of', 'more_less', 'german_occupation', 'concentration_camp_so', 'little_town', 'little_town', 'young_man', 'sudden_hear', 'very_anxious', 'tough_tough', 'very_heavy', 'kind_of', 'other_word', 'walk_street', 'kind_of', 'turn_around', 'very_hard', 'very_hard', 'brother_sister', 'all_over', 'year_old', 'little_boy', 'dig_ditch', 'able_bodied', 'stand_line', 'all_over', 'funny_thing', 'jewish_holiday', 'no_matter', 'down_there', 'water_drink', 'very_dangerous', 'run_away', 'jewish_quarter', 'very_much', 'very_much', 'make_difference', 'stand_line', 'later_on', 'as_soon', 'middle_night', 'german_soldier', 'little_bit', 'few_day_later', 'year_later

In [9]:
# TF-IDF Removal - to remove words that are not important to the topic
from gensim.models import TfidfModel

id2word = corpora.Dictionary(data_bigrams_trigrams)

texts = data_bigrams_trigrams

corpus = [id2word.doc2bow(text) for text in texts]
print(corpus[0][0:20])

tfidf = TfidfModel(corpus, id2word=id2word)

# Goes through words and looks at words that occur too frequentely and removes them
# Corpus becomes new corpus where those irrelevant words are removed
low_value = 0.03
words = []
words_missing_in_tfidf = []
for i in range(0, len(corpus)):
    bow = corpus[i]
    low_value_words = []
    tfidf_ids = [id for id, value in tfidf[bow]]
    bow_ids = [id for id, value in bow]
    low_value_words = [id for id, value in tfidf[bow] if value < low_value]
    drops = low_value_words + words_missing_in_tfidf
    for item in drops:
        words.append(id2word[item])
    words_missing_in_tfidf = [id for id in bow_ids if id not in tfidf_ids]

    new_bow = [b for b in bow if b[0] not in low_value_words and b[0] not in words_missing_in_tfidf]
    corpus[i] = new_bow




[(0, 1), (1, 1), (2, 5), (3, 2), (4, 2), (5, 1), (6, 1), (7, 2), (8, 1), (9, 2), (10, 3), (11, 12), (12, 8), (13, 1), (14, 1), (15, 1), (16, 3), (17, 2), (18, 1), (19, 2)]


In [12]:
# id2word = corpora.Dictionary(data_words)
# corpus = []
# for text in data_words:
#     new = id2word.doc2bow(text) # Create a bag of words all (word: f pairs) for 1 doc
#     corpus.append(new)

# print(corpus[0][0:20])
# #(0,2) - 0 = index of word in the dictionary, 2 = frequency of word in that text

# word = id2word[[0][:1][0]] # i.e. text 1 -> first tuple -> first value in tuple
# print(word) # i.e. able is the first element in teh dictionary 

[(0, 2), (1, 10), (2, 1), (3, 2), (4, 1), (5, 1), (6, 2), (7, 3), (8, 1), (9, 12), (10, 1), (11, 8), (12, 1), (13, 2), (14, 1), (15, 3), (16, 2), (17, 1), (18, 2), (19, 1)]
able


In [10]:
lda_model = gensim.models.LdaModel(corpus=corpus
                                    ,id2word=id2word
                                    ,num_topics=5
                                    ,random_state=100
                                    ,update_every=1
                                    ,chunksize=100
                                    ,passes=10
                                    ,alpha="auto")

### Idea
I think I can separate the review data set into one star vs 5 star reviews and look at if there are any differences in the topics that are mentioned.

In [12]:
# Visualising the data
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
vis

  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


In [None]:
# Save the model into memory
lda_model.save("models/tutorial_mode.model");