In [1]:
import numpy as np
import pandas as pd
import string

from nltk.corpus import stopwords
from collections import defaultdict
from string import digits

In [2]:
catDtype = pd.CategoricalDtype(['1','2','3','4','5'], ordered= True)
train = pd.read_csv("train2electricboogaloo.csv", dtype={'Score':catDtype})

In [3]:
train.Score.value_counts()
df_class_5 = train[train['Score'] == "1"]
df_class_5.Score.value_counts()

1    39193
5        0
4        0
3        0
2        0
Name: Score, dtype: int64

In [4]:
reviews = df_class_5["Summary"] + " " + df_class_5["Text"]
scores = df_class_5["Score"]

In [5]:
#Replace na from the review column
reviews.fillna('', inplace=True)

In [6]:
#Replace html tags
from bs4 import BeautifulSoup
reviews = [BeautifulSoup(text).get_text() for text in reviews]

In [7]:
#Remove digits, punctiuation, and special character. Make it all lower case
remove_digits = str.maketrans('', '', digits) 
remove_special_char = str.maketrans('', '', '@#%$/*')
stop_words = set(stopwords.words("english"))


def remove_stop_words(reviews):
    nostop=[]
    for review in reviews:
        words = review.split()
        sent = ''
        for word in words:
            if word.lower() not in stop_words:
                sent = sent + word + (' ')
       # print(sent)
        nostop.append(sent)
    return nostop


def basic_data_cleaning(unprocessed):
    nopunc = [word.translate(str.maketrans('', '', string.punctuation)) for word in unprocessed]
    to_lowered = [word.lower() for word in nopunc]
    nodigits = [word.translate(remove_digits).translate(remove_special_char) for word in to_lowered]
    nostop = remove_stop_words(nodigits)
    return nostop


In [8]:
#reduce the sample size 
reviews_subset = reviews[:100000]

In [9]:
clean_reviews = basic_data_cleaning(reviews_subset)

In [10]:
import mglearn as mglearn
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer



In [11]:
#Config 1 - basic cleaning (Remove digits, punctiuation, and special character. Make it all lower case)
#Remove words that are in 15% of the documents - 10 topic
vect = CountVectorizer(max_df=.15)
X = vect.fit_transform(clean_reviews)

In [12]:
lda = LatentDirichletAllocation(n_components=10, learning_method="batch",
                                max_iter=25, random_state=0, n_jobs=-1)
# We build the model and transform the data in one step
# Computing transform takes some time,
# and we can save time by doing both at once
document_topics = lda.fit_transform(X)

In [13]:
print("lda.components_.shape: {}".format(lda.components_.shape))

lda.components_.shape: (10, 44132)


In [14]:
# for each topic (a row in the components_), sort the features (ascending).
# Invert rows with [:, ::-1] to make sorting descending
sorting = np.argsort(lda.components_, axis=1)[:, ::-1]
# get the feature names from the vectorizer:
feature_names = np.array(vect.get_feature_names())

In [15]:
# Print out the 10 topics:
mglearn.tools.print_topics(topics=range(10), feature_names=feature_names,
                           sorting=sorting, topics_per_chunk=5, n_words=10)

topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
sauce         food          formula       coffee        dog           
gluten        cat           new           cup           dogs          
free          dog           baby          flavor        made          
rice          eating        time          tried         treats        
mix           eat           use           tastes        china         
bread         cats          used          water         chicken       
time          day           changed       drink         products      
noodles       got           old           bad           food          
make          back          hair          kcups         sick          
cake          little        work          bitter        pet           


topic 5       topic 6       topic 7       topic 8       topic 9       
--------      --------      --------      --------      --------      
tea 

In [16]:
vect2 = CountVectorizer()
X2 = vect2.fit_transform(clean_reviews)

In [17]:
lda2 = LatentDirichletAllocation(n_components=5, learning_method="batch",
                                max_iter=25, random_state=0, n_jobs=-1)

document_topics_2 = lda2.fit_transform(X2)

In [18]:
# for each topic (a row in the components_), sort the features (ascending).
# Invert rows with [:, ::-1] to make sorting descending
sorting2 = np.argsort(lda2.components_, axis=1)[:, ::-1]
# get the feature names from the vectorizer:
feature_names2 = np.array(vect2.get_feature_names())

In [19]:
# Print out the 5 topics:
mglearn.tools.print_topics(topics=range(5), feature_names=feature_names2,
                           sorting=sorting2, topics_per_chunk=5, n_words=10)

topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
food          coffee        product       like          ingredients   
dog           tea           amazon        taste         product       
dogs          cup           box           flavor        food          
one           food          order         good          products      
would         would         price         one           corn          
product       kcups         ordered       tastes        sugar         
made          eating        one           even          organic       
treats        green         received      dont          made          
eat           day           buy           would         ingredient    
cat           really        would         bad           natural       




In [20]:
vect3 = CountVectorizer(max_df=.25)
X3 = vect3.fit_transform(clean_reviews)

In [21]:
lda3 = LatentDirichletAllocation(n_components=5, learning_method="batch",
                                max_iter=25, random_state=0, n_jobs=-1)

document_topics3 = lda3.fit_transform(X3)

In [22]:
# for each topic (a row in the components_), sort the features (ascending).
# Invert rows with [:, ::-1] to make sorting descending
sorting3 = np.argsort(lda3.components_, axis=1)[:, ::-1]
# get the feature names from the vectorizer:
feature_names3 = np.array(vect3.get_feature_names())

In [23]:
# Print out the 10 topics:
mglearn.tools.print_topics(topics=range(5), feature_names=feature_names3,
                           sorting=sorting3, topics_per_chunk=5, n_words=10)

topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
food          dog           amazon        flavor        coffee        
cat           dogs          box           good          tea           
ingredients   made          order         sugar         flavor        
cats          food          one           would         one           
eat           one           price         dont          good          
would         treats        ordered       one           water         
eating        would         buy           chips         even          
diet          china         received      chocolate     tried         
chicken       time          would         eat           tastes        
corn          bag           item          really        would         


