## Load raw dataset

In [258]:
import pandas as pd

data = pd.read_csv('data/olist_order_reviews_dataset.csv')

## Get only focus data

In [259]:
data = data[data['review_comment_message'].notnull()].reset_index(drop=True)

In [260]:
data.head()

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
0,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21 00:00:00,2017-04-21 22:02:06
1,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01 00:00:00,2018-03-02 10:26:53
2,8670d52e15e00043ae7de4c01cc2fe06,b9bf720beb4ab3728760088589c62129,4,recomendo,aparelho eficiente. no site a marca do aparelh...,2018-05-22 00:00:00,2018-05-23 16:45:47
3,4b49719c8a200003f700d3d986ea1a19,9d6f15f95d01e79bd1349cc208361f09,4,,"Mas um pouco ,travando...pelo valor ta Boa.\r\n",2018-02-16 00:00:00,2018-02-20 10:52:22
4,3948b09f7c818e2d86c9a546758b2335,e51478e7e277a83743b6f9991dbfa3fb,5,Super recomendo,"Vendedor confiável, produto ok e entrega antes...",2018-05-23 00:00:00,2018-05-24 03:00:01


## Text Normalization

In [261]:
import re
import unicodedata

In [262]:
def remove_emoji(sentence):
    
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)
    
    return emoji_pattern.sub(r'', sentence)

def remove_url(sentence):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'', sentence)

def strip_accents(sentence):
    try:
        sentence = unicode(sentence, 'utf-8')
    except:
        pass
    
    sentence = unicodedata.normalize('NFD', sentence).encode('ascii', 'ignore').decode('utf-8')
    return str(sentence)

In [263]:
data['review_comment_message'] = data['review_comment_message'].str.lower()

In [264]:
data['review_comment_message'] = list(map(remove_emoji, data['review_comment_message']))

In [265]:
data['review_comment_message'] = list(map(remove_url, data['review_comment_message']))

In [266]:
data['review_comment_message'] = data['review_comment_message'].apply(lambda x: strip_accents(x))

In [267]:
data['review_comment_message'] = data['review_comment_message'].apply(lambda x: re.sub(r'[^a-zA-Z0-9]', ' ', x)).str.strip()

## Analysis Data

In [268]:
data_text = data[['review_id', 'review_comment_message']]
data_text.columns = ['review_id', 'headline_text']
data_text.reset_index(inplace=True, drop=True)
data_text['index'] = data_text.index

documents = data_text

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_text['index'] = data_text.index


## Data Preprocessing

In [269]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

In [270]:
from nltk.stem import WordNetLemmatizer, SnowballStemmer, RSLPStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(400)

In [271]:
import nltk
nltk.download('omw-1.4')
nltk.download('wordnet')
nltk.download('rslp')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\thale\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\thale\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package rslp to
[nltk_data]     C:\Users\thale\AppData\Roaming\nltk_data...
[nltk_data]   Package rslp is already up-to-date!


True

In [272]:
import spacy
nlp = spacy.load("pt_core_news_lg")

In [273]:
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('portuguese')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\thale\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [274]:
def lemmatize_stemming(text):
    tokens = ''    
    
    doc = nlp(text)

    for token in doc:
        
        if token.pos_ == 'NOUN' or token.pos_ == 'VERB':
            tokens = tokens.join('').join(token.lemma_)
        else:
            tokens = tokens.join('').join(token.text)
    
    return tokens

In [275]:
def preprocess(text):
    result=[]
    for token in gensim.utils.simple_preprocess(text) :
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:

            result.append(lemmatize_stemming(token))
              
    return result

In [276]:
document_num = 4310
doc_sample = documents[documents['index'] == document_num].values[0][1]

print("Original: ")
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print("\n\nTokenized and lemmatized: ")
print(preprocess(doc_sample))

Original: 
['entrega', 'super', 'rapida', 'e', 'em', 'otimas', 'condicoes']


Tokenized and lemmatized: 
['entregar', 'super', 'rapida', 'otimas', 'condicoes']


In [278]:
documents_bkp = documents.copy()

In [279]:
documents = documents_bkp[:10000]

In [280]:
processed_docs = documents['headline_text'].map(preprocess)

In [281]:
processed_docs[:10]

0                   [receber, antes, prazo, estipular]
1    [parabens, loja, lannister, adorar, comprar, p...
2    [aparelhar, eficiente, site, marcar, aparelhar...
3                         [pouco, travar, pelo, valor]
4    [vendedor, confiavel, produto, entregar, antes...
5    [gostar, saber, haver, sempre, receber, essa, ...
6                                            [pessimo]
7                                         [loja, noto]
8            [obrigar, pela, atencao, amim, dispensar]
9    [comprar, realizar, facilmente, entregar, efet...
Name: headline_text, dtype: object

## Bag of words on the dataset

Now let's create a dictionary from 'processed_docs' containing the number of times a word appears in the training set. To do that, let's pass `processed_docs` to [`gensim.corpora.Dictionary()`](https://radimrehurek.com/gensim/corpora/dictionary.html) and call it '`dictionary`'.

In [290]:
'''
Create a dictionary from 'processed_docs' containing the number of times a word appears 
in the training set using gensim.corpora.Dictionary and call it 'dictionary'
'''
dictionary = gensim.corpora.Dictionary(processed_docs)

In [291]:
'''
Checking dictionary created
'''
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 antes
1 estipular
2 prazo
3 receber
4 adorar
5 comprar
6 feliz
7 internet
8 lannister
9 loja
10 parabens


** Gensim filter_extremes **

[`filter_extremes(no_below=5, no_above=0.5, keep_n=100000)`](https://radimrehurek.com/gensim/corpora/dictionary.html#gensim.corpora.dictionary.Dictionary.filter_extremes)

Filter out tokens that appear in

* less than no_below documents (absolute number) or
* more than no_above documents (fraction of total corpus size, not absolute number).
* after (1) and (2), keep only the first keep_n most frequent tokens (or keep all if None).

In [293]:
'''
OPTIONAL STEP
Remove very rare and very common words:

- words appearing less than 15 times
- words appearing in more than 10% of all documents
'''
# TODO: apply dictionary.filter_extremes() with the parameters mentioned above
dictionary.filter_extremes(no_below=15, no_above=0.1)

** Gensim doc2bow **

[`doc2bow(document)`](https://radimrehurek.com/gensim/corpora/dictionary.html#gensim.corpora.dictionary.Dictionary.doc2bow)

* Convert document (a list of words) into the bag-of-words format = list of (token_id, token_count) 2-tuples. Each word is assumed to be a tokenized and normalized string (either unicode or utf8-encoded). No further preprocessing is done on the words in document; apply tokenization, stemming etc. before calling this method.

In [294]:
'''
Create the Bag-of-words model for each document i.e for each document we create a dictionary reporting how many
words and how many times those words appear. Save this to 'bow_corpus'
'''
# TODO
bow_corpus = []
for i in range(len(processed_docs)):
    bow_corpus.append(dictionary.doc2bow(processed_docs[i]))

In [295]:
bow_corpus

[[(0, 1)],
 [(1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 2),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1)],
 [(11, 3),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 2),
  (16, 2),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1)],
 [(21, 1), (22, 1), (23, 1)],
 [(24, 1), (25, 1)],
 [(26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1)],
 [(32, 1)],
 [(5, 1), (33, 1)],
 [(7, 1), (34, 1), (35, 1)],
 [(36, 1), (37, 1), (38, 1), (39, 1), (40, 1)],
 [(41, 1), (42, 1), (43, 1)],
 [(28, 1)],
 [(3, 1),
  (7, 1),
  (31, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 2),
  (48, 1),
  (49, 1)],
 [(15, 1), (25, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1)],
 [],
 [],
 [(5, 1), (24, 1), (56, 1)],
 [(57, 1), (58, 1)],
 [],
 [(1, 1), (25, 1), (59, 1)],
 [(14, 1), (60, 1)],
 [(61, 1), (62, 1)],
 [(63, 2), (64, 1), (65, 1), (66, 1)],
 [],
 [(67, 1)],
 [(12, 1), (39, 1), (48, 1), (68, 1), (69, 1), (70, 1), (71, 1)],
 [(1, 1), (67, 1), (72, 1), (73, 1)],
 [(5, 1), (65, 1), (74, 1), (75, 1), (76, 1), (7

In [296]:
'''
Checking Bag of Words corpus for our sample document --> (token_id, token_count)
'''
bow_corpus[document_num]

[(56, 1), (113, 1), (372, 1), (397, 1)]

In [298]:
'''
Preview BOW for our sample preprocessed document
'''
# Here document_num is document number 4310 which we have checked in Step 2
bow_doc_4310 = bow_corpus[document_num]

for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                                     dictionary[bow_doc_4310[i][0]], 
                                                     bow_doc_4310[i][1]))

Word 56 ("super") appears 1 time.
Word 113 ("rapida") appears 1 time.
Word 372 ("condicoes") appears 1 time.
Word 397 ("otimas") appears 1 time.


# TF-IDF on our document set

While performing TF-IDF on the corpus is not necessary for LDA implemention using the gensim model, it is recemmended. TF-IDF expects a bag-of-words (integer values) training corpus during initialization. During transformation, it will take a vector and return another vector of the same dimensionality.

*Please note: The author of Gensim dictates the standard procedure for LDA to be using the Bag of Words model.*

** TF-IDF stands for "Term Frequency, Inverse Document Frequency".**

* It is a way to score the importance of words (or "terms") in a document based on how frequently they appear across multiple documents.
* If a word appears frequently in a document, it's important. Give the word a high score. But if a word appears in many documents, it's not a unique identifier. Give the word a low score.
* Therefore, common words like "the" and "for", which appear in many documents, will be scaled down. Words that appear frequently in a single document will be scaled up.

In other words:

* TF(w) = `(Number of times term w appears in a document) / (Total number of terms in the document)`.
* IDF(w) = `log_e(Total number of documents / Number of documents with term w in it)`.

** For example **

* Consider a document containing `100` words wherein the word 'tiger' appears 3 times. 
* The term frequency (i.e., tf) for 'tiger' is then: 
    - `TF = (3 / 100) = 0.03`. 

* Now, assume we have `10 million` documents and the word 'tiger' appears in `1000` of these. Then, the inverse document frequency (i.e., idf) is calculated as:
    - `IDF = log(10,000,000 / 1,000) = 4`. 

* Thus, the Tf-idf weight is the product of these quantities: 
    - `TF-IDF = 0.03 * 4 = 0.12`.

In [299]:
bow_corpus

[[(0, 1)],
 [(1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 2),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1)],
 [(11, 3),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 2),
  (16, 2),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1)],
 [(21, 1), (22, 1), (23, 1)],
 [(24, 1), (25, 1)],
 [(26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1)],
 [(32, 1)],
 [(5, 1), (33, 1)],
 [(7, 1), (34, 1), (35, 1)],
 [(36, 1), (37, 1), (38, 1), (39, 1), (40, 1)],
 [(41, 1), (42, 1), (43, 1)],
 [(28, 1)],
 [(3, 1),
  (7, 1),
  (31, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 2),
  (48, 1),
  (49, 1)],
 [(15, 1), (25, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1)],
 [],
 [],
 [(5, 1), (24, 1), (56, 1)],
 [(57, 1), (58, 1)],
 [],
 [(1, 1), (25, 1), (59, 1)],
 [(14, 1), (60, 1)],
 [(61, 1), (62, 1)],
 [(63, 2), (64, 1), (65, 1), (66, 1)],
 [],
 [(67, 1)],
 [(12, 1), (39, 1), (48, 1), (68, 1), (69, 1), (70, 1), (71, 1)],
 [(1, 1), (67, 1), (72, 1), (73, 1)],
 [(5, 1), (65, 1), (74, 1), (75, 1), (76, 1), (7

In [300]:
'''
Create tf-idf model object using models.TfidfModel on 'bow_corpus' and save it to 'tfidf'
'''
from gensim import corpora, models

# TODO
tfidf = models.TfidfModel(bow_corpus)

In [301]:
'''
Apply transformation to the entire corpus and call it 'corpus_tfidf'
'''
# TODO
corpus_tfidf = tfidf[bow_corpus]

In [302]:
'''
Preview TF-IDF scores for our first document --> --> (token_id, tfidf score)
'''
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 1.0)]


## Step 4.1: Running LDA using Bag of Words ##

We are going for 10 topics in the document corpus.

** We will be running LDA using all CPU cores to parallelize and speed up model training.**

Some of the parameters we will be tweaking are:

* **num_topics** is the number of requested latent topics to be extracted from the training corpus.
* **id2word** is a mapping from word ids (integers) to words (strings). It is used to determine the vocabulary size, as well as for debugging and topic printing.
* **workers** is the number of extra processes to use for parallelization. Uses all available cores by default.
* **alpha** and **eta** are hyperparameters that affect sparsity of the document-topic (theta) and topic-word (lambda) distributions. We will let these be the default values for now(default value is `1/num_topics`)
    - Alpha is the per document topic distribution.
        * High alpha: Every document has a mixture of all topics(documents appear similar to each other).
        * Low alpha: Every document has a mixture of very few topics

    - Eta is the per topic word distribution.
        * High eta: Each topic has a mixture of most words(topics appear similar to each other).
        * Low eta: Each topic has a mixture of few words.

* ** passes ** is the number of training passes through the corpus. For  example, if the training corpus has 50,000 documents, chunksize is  10,000, passes is 2, then online training is done in 10 updates: 
    * `#1 documents 0-9,999 `
    * `#2 documents 10,000-19,999 `
    * `#3 documents 20,000-29,999 `
    * `#4 documents 30,000-39,999 `
    * `#5 documents 40,000-49,999 `
    * `#6 documents 0-9,999 `
    * `#7 documents 10,000-19,999 `
    * `#8 documents 20,000-29,999 `
    * `#9 documents 30,000-39,999 `
    * `#10 documents 40,000-49,999` 

In [303]:
# LDA mono-core -- fallback code in case LdaMulticore throws an error on your machine
# lda_model = gensim.models.LdaModel(bow_corpus, 
#                                    num_topics = 10, 
#                                    id2word = dictionary,                                    
#                                    passes = 50)

# LDA multicore 
'''
Train your lda model using gensim.models.LdaMulticore and save it to 'lda_model'
'''
# TODO
lda_model = gensim.models.LdaMulticore(corpus=bow_corpus,
                                       num_topics=10,
                                       id2word=dictionary,
                                       passes=1)

In [304]:
'''
For each topic, we will explore the words occuring in that topic and its relative weight
'''
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(topic, idx ))
    print("\n")

Topic: 0.029*"dentro" + 0.025*"qualidade" + 0.024*"mais" + 0.024*"minha" + 0.023*"gostar" + 0.020*"site" + 0.017*"ter" + 0.017*"satisfeito" + 0.016*"tudo" + 0.016*"vir" 
Words: 0


Topic: 0.044*"rapida" + 0.041*"para" + 0.020*"vir" + 0.019*"qualidade" + 0.017*"ter" + 0.016*"super" + 0.014*"embalar" + 0.014*"mesmo" + 0.011*"pôr" + 0.010*"trocar" 
Words: 1


Topic: 0.021*"estou" + 0.020*"pedir" + 0.018*"como" + 0.018*"esperar" + 0.015*"para" + 0.011*"ter" + 0.011*"dia" + 0.011*"problema" + 0.011*"agora" + 0.011*"correio" 
Words: 2


Topic: 0.026*"pedir" + 0.024*"otimo" + 0.022*"vir" + 0.020*"mais" + 0.020*"adorar" + 0.019*"para" + 0.019*"esta" + 0.017*"prever" + 0.017*"ficar" + 0.014*"gostar" 
Words: 3


Topic: 0.053*"excelente" + 0.030*"otimo" + 0.024*"super" + 0.021*"amar" + 0.016*"mais" + 0.016*"para" + 0.014*"vir" + 0.014*"loja" + 0.013*"qualidade" + 0.012*"rapida" 
Words: 4


Topic: 0.047*"loja" + 0.039*"vir" + 0.024*"pedir" + 0.021*"qualidade" + 0.018*"otimo" + 0.018*"gostar" + 0.0

### Classification of the topics ###

Using the words in each topic and their corresponding weights, what categories were you able to infer?

* 0: 
* 1: 
* 2: 
* 3: 
* 4: 
* 5: 
* 6: 
* 7:  
* 8: 
* 9: 

## Step 4.2 Running LDA using TF-IDF ##

In [305]:
'''
Define lda model using corpus_tfidf, again using gensim.models.LdaMulticore()
'''
# TODO
lda_model_tfidf = gensim.models.LdaMulticore(corpus=corpus_tfidf,
                                             num_topics=10,
                                             id2word=dictionary,
                                             workers=4,
                                             passes=1)

In [306]:
'''
For each topic, we will explore the words occuring in that topic and its relative weight
'''
for idx, topic in lda_model_tfidf.print_topics(-1):
    print("Topic: {} Word: {}".format(idx, topic))
    print("\n")

Topic: 0 Word: 0.026*"loja" + 0.020*"ainda" + 0.018*"vir" + 0.018*"demorar" + 0.016*"dentro" + 0.015*"rapido" + 0.014*"defeito" + 0.014*"mais" + 0.013*"gostar" + 0.011*"ficar"


Topic: 1 Word: 0.024*"ainda" + 0.020*"otimo" + 0.012*"trocar" + 0.012*"estou" + 0.012*"qualidade" + 0.010*"rapida" + 0.010*"para" + 0.010*"fazer" + 0.010*"lannister" + 0.009*"condicoes"


Topic: 2 Word: 0.030*"qualidade" + 0.021*"excelente" + 0.021*"para" + 0.019*"estou" + 0.017*"atendimento" + 0.015*"otima" + 0.014*"vir" + 0.013*"super" + 0.013*"satisfeito" + 0.012*"site"


Topic: 3 Word: 0.052*"excelente" + 0.034*"otima" + 0.023*"otimo" + 0.022*"esperar" + 0.022*"ainda" + 0.021*"loja" + 0.012*"prever" + 0.012*"conforme" + 0.011*"atrasar" + 0.010*"correio"


Topic: 4 Word: 0.046*"otimo" + 0.023*"qualidade" + 0.021*"super" + 0.020*"ainda" + 0.018*"todos" + 0.014*"dois" + 0.014*"funcionar" + 0.014*"embalar" + 0.013*"datar" + 0.011*"para"


Topic: 5 Word: 0.029*"rapido" + 0.023*"super" + 0.021*"vir" + 0.019*"amar

### Classification of the topics ###

As we can see, when using tf-idf, heavier weights are given to words that are not as frequent which results in nouns being factored in. That makes it harder to figure out the categories as nouns can be hard to categorize. This goes to show that the models we apply depend on the type of corpus of text we are dealing with. 

Using the words in each topic and their corresponding weights, what categories could you find?

* 0: 
* 1:  
* 2: 
* 3: 
* 4:  
* 5: 
* 6: 
* 7: 
* 8: 
* 9: 

## Step 5.1: Performance evaluation by classifying sample document using LDA Bag of Words model##

We will check to see where our test document would be classified. 

In [307]:
'''
Text of sample document 4310
'''
processed_docs[4310]

['entregar', 'super', 'rapida', 'otimas', 'condicoes']

In [308]:
'''
Check which topic our test document belongs to using the LDA Bag of Words model.
'''
document_num = 4310
# Our test document is document number 4310

# TODO
# Our test document is document number 4310
for index, score in sorted(lda_model[bow_corpus[document_num]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.8199769258499146	 
Topic: 0.044*"rapida" + 0.041*"para" + 0.020*"vir" + 0.019*"qualidade" + 0.017*"ter" + 0.016*"super" + 0.014*"embalar" + 0.014*"mesmo" + 0.011*"pôr" + 0.010*"trocar"

Score: 0.020004594698548317	 
Topic: 0.053*"excelente" + 0.030*"otimo" + 0.024*"super" + 0.021*"amar" + 0.016*"mais" + 0.016*"para" + 0.014*"vir" + 0.014*"loja" + 0.013*"qualidade" + 0.012*"rapida"

Score: 0.02000417374074459	 
Topic: 0.029*"otimo" + 0.021*"otima" + 0.020*"super" + 0.020*"qualidade" + 0.019*"loja" + 0.018*"rapida" + 0.017*"lannister" + 0.016*"estou" + 0.015*"como" + 0.014*"para"

Score: 0.020002471283078194	 
Topic: 0.036*"gostar" + 0.026*"qualidade" + 0.024*"loja" + 0.023*"ainda" + 0.020*"ter" + 0.020*"para" + 0.019*"vir" + 0.017*"esta" + 0.016*"como" + 0.016*"lannister"

Score: 0.020002448931336403	 
Topic: 0.047*"loja" + 0.039*"vir" + 0.024*"pedir" + 0.021*"qualidade" + 0.018*"otimo" + 0.018*"gostar" + 0.017*"perfeito" + 0.014*"ainda" + 0.013*"poder" + 0.013*"parabens"

Sco

### It has the highest probability (`x`) to be  part of the topic that we assigned as X, which is the accurate classification. ###

## Step 5.2: Performance evaluation by classifying sample document using LDA TF-IDF model##

In [309]:
'''
Check which topic our test document belongs to using the LDA TF-IDF model.
'''
# Our test document is document number 4310
for index, score in sorted(lda_model_tfidf[bow_corpus[document_num]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.819973349571228	 
Topic: 0.051*"rapida" + 0.048*"tudo" + 0.028*"otimo" + 0.024*"qualidade" + 0.018*"certo" + 0.016*"super" + 0.015*"anunciar" + 0.013*"preco" + 0.013*"material" + 0.013*"gostar"

Score: 0.020006712526082993	 
Topic: 0.024*"ainda" + 0.020*"otimo" + 0.012*"trocar" + 0.012*"estou" + 0.012*"qualidade" + 0.010*"rapida" + 0.010*"para" + 0.010*"fazer" + 0.010*"lannister" + 0.009*"condicoes"

Score: 0.020003410056233406	 
Topic: 0.050*"otimo" + 0.047*"adorar" + 0.025*"satisfeito" + 0.022*"parabens" + 0.016*"loja" + 0.014*"gostar" + 0.012*"pedir" + 0.010*"estou" + 0.009*"agora" + 0.009*"realizar"

Score: 0.0200030654668808	 
Topic: 0.029*"rapido" + 0.023*"super" + 0.021*"vir" + 0.019*"amar" + 0.018*"satisfeito" + 0.017*"qualidade" + 0.016*"excelente" + 0.016*"prever" + 0.016*"noto" + 0.016*"dentro"

Score: 0.0200028233230114	 
Topic: 0.030*"qualidade" + 0.021*"excelente" + 0.021*"para" + 0.019*"estou" + 0.017*"atendimento" + 0.015*"otima" + 0.014*"vir" + 0.013*"super" 

### It has the highest probability (`x%`) to be  part of the topic that we assigned as X. ###

## Step 6: Testing model on unseen document ##

In [312]:
unseen_document = "A entrega veio quebrada, não gostei da loja"

# Data preprocessing step for the unseen document
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.8199043273925781	 Topic: 0.047*"loja" + 0.039*"vir" + 0.024*"pedir" + 0.021*"qualidade" + 0.018*"otimo"
Score: 0.02001667581498623	 Topic: 0.026*"pedir" + 0.024*"otimo" + 0.022*"vir" + 0.020*"mais" + 0.020*"adorar"
Score: 0.020014632493257523	 Topic: 0.021*"estou" + 0.020*"pedir" + 0.018*"como" + 0.018*"esperar" + 0.015*"para"
Score: 0.02001287415623665	 Topic: 0.029*"otimo" + 0.021*"otima" + 0.020*"super" + 0.020*"qualidade" + 0.019*"loja"
Score: 0.02001151442527771	 Topic: 0.029*"dentro" + 0.025*"qualidade" + 0.024*"mais" + 0.024*"minha" + 0.023*"gostar"
Score: 0.020010406151413918	 Topic: 0.053*"vir" + 0.039*"tudo" + 0.021*"ainda" + 0.020*"satisfeito" + 0.018*"para"
Score: 0.020009225234389305	 Topic: 0.049*"tudo" + 0.022*"para" + 0.019*"como" + 0.018*"estou" + 0.017*"conforme"
Score: 0.02000802755355835	 Topic: 0.036*"gostar" + 0.026*"qualidade" + 0.024*"loja" + 0.023*"ainda" + 0.020*"ter"
Score: 0.020006835460662842	 Topic: 0.044*"rapida" + 0.041*"para" + 0.020*"vir" + 0.

The model correctly classifies the unseen document with 'x'% probability to the X category.