In [1]:
import nltk; nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/theo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Gensim

In [2]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.test.utils import datapath

pprint for easier logging

In [3]:
from pprint import pprint

spacy for lemmatization

In [4]:
import spacy

Enable logging for gensim

In [5]:
import logging

In [6]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

In [7]:
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
from nltk.corpus import stopwords

Import LDA model

In [8]:
file_dir = "LDA_Model/LDA.model"
lda_model = gensim.models.ldamodel.LdaModel.load(file_dir)
# Define functions for stopwords, bigrams, trigrams and lemmatization
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [9]:
def sent_to_words(sentence):
    yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

In [10]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

In [11]:
def make_bigrams(texts):
    bigram = gensim.models.Phrases(texts, min_count=5, threshold=100)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    return [bigram_mod[doc] for doc in texts]

In [12]:
def make_trigrams(texts):
    bigram = gensim.models.Phrases(texts, min_count=5, threshold=100)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram = gensim.models.Phrases(bigram[texts], threshold=100)  
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

In [13]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    nlp = spacy.load("en_core_web_sm")
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

Format the new text/document

In [14]:
def formatUnseen(unseen_document):
    data_words = list(sent_to_words(unseen_document))
    # Remove Stop Words
    data_words_nostops = remove_stopwords(sent_to_words(data_words))
    # Form Bigrams
    data_words_bigrams = make_bigrams(data_words_nostops)
    data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
    return data_lemmatized

In [17]:
from operator import itemgetter
def getTopTopic(unseen_document):
    data_lemmatized = formatUnseen(unseen_document)
    # Create Dictionary
    id2word = corpora.Dictionary(data_lemmatized)
    # Create Corpus
    other_texts = data_lemmatized
    other_corpus = [id2word.doc2bow(text) for text in other_texts]
    unseen_doc = other_corpus[0]
    print(unseen_doc)
    vector = lda_model[unseen_doc]
# get the element that has the highest score
# this will then be the topic that fits the unseen_document the best
    index_max = max(vector, key=itemgetter(1))
    return index_max
    

In [35]:
# Selected sentneces to test by uncommenting
# unseen_document = "Beetlejuice is a movie I consider to be one of Tim Burton's best movies. I also consider it to be one of those kind of movies that could have come out only in the 80s (much like Labyrinth starring David Bowie and Jennifer Connelly).Beetlejuice deals with a recently deceased married couple, the Maitlands, who finds themselves essentially "'trapped'" in their former house for the next century or so. Unfortunately, this means living with the house's new owners, the Deetz. The Maitlands don't mind Deetz daughter Lydia played by Winona Ryder so much but her much more obnoxious parents and want to scare them away from the house. Their case worker tells them only one thing"
# unseen_document = "I was looking for some blue wax because I noticed that in European Wax Center usethat kind of wax and it is so less painfull. I really think that this product has the same effect. The kit is perfectly designed to make your waxing easier and with minimal pain as possible."
unseen_document = "I have been having eye irritation and issues for a while now. Every eye makeup remover burns and irritates my eyes. However, when I use jojoba oil, my make up comes off easily and there is no irritation. It is the best thing ever!!!! You can read the other reviews to see the other AMAZING benefits of it :)"



# proccess unseen_document and get the top topic and its words
hotTopic = getTopTopic(unseen_document)
pprint("Score: {} Topic Index: {} Topic: {}".format(hotTopic[1], hotTopic[0], lda_model.print_topic(hotTopic[0], 30)))

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 3), (6, 1), (7, 1), (8, 2), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1)]
('Score: 0.3061200678348541 Topic Index: 10 Topic: 0.049*"order" + '
 '0.035*"product" + 0.025*"package" + 0.023*"item" + 0.020*"arrive" + '
 '0.017*"receive" + 0.017*"send" + 0.015*"dryer" + 0.015*"buy" + 0.014*"good" '
 '+ 0.014*"ship" + 0.013*"find" + 0.013*"purchase" + 0.012*"company" + '
 '0.011*"get" + 0.010*"toilet" + 0.010*"packaging" + 0.010*"love" + '
 '0.009*"online" + 0.008*"compliment" + 0.008*"really" + 0.008*"fruit" + '
 '0.008*"quickly" + 0.007*"pocket" + 0.007*"candy" + 0.007*"pack" + '
 '0.007*"always" + 0.007*"please" + 0.007*"go" + 0.006*"salon"')
