In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import string
import nltk
import sklearn
import mglearn as mglearn



In [2]:
train = pd.read_csv('train2electricboogaloo.csv')

train.Summary.fillna('', inplace=True)
train.Text.fillna('', inplace=True)
train.SumTxt.fillna('', inplace=True)

In [3]:
low_train = train.loc[train['Score'] == 1]
high_train = train.loc[train['Score'] == 5]

In [4]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [5]:
wnl = WordNetLemmatizer()

In [6]:
def text_process(mess):
    nopunc = mess.translate(str.maketrans('', '', string.punctuation))
    stop_words = set(stopwords.words("english"))
    nopunc2 = [word for word in nopunc.split() if word.lower() not in stop_words]
    return [wnl.lemmatize(word) for word in nopunc2]

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
vect = CountVectorizer(max_features=10000, max_df=.15, analyzer=text_process)
X = vect.fit_transform(low_train['SumTxt'])

In [9]:
from sklearn.decomposition import LatentDirichletAllocation

In [10]:
lda = LatentDirichletAllocation(n_components=10, learning_method="batch",
                                max_iter=25, random_state=0, n_jobs = 6)

In [11]:
document_topics = lda.fit_transform(X)

In [12]:
print("lda.components_.shape: {}".format(lda.components_.shape))

lda.components_.shape: (10, 10000)


In [13]:
sorting = np.argsort(lda.components_, axis=1)[:, ::-1]
feature_names = np.array(vect.get_feature_names())

In [14]:
mglearn.tools.print_topics(topics=range(10), feature_names=feature_names,
                           sorting=sorting, topics_per_chunk=5, n_words=10)

topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
coffee        Amazon        cat           eat           chip          
cup           box           food          tried         can           
tried         order         eating        bad           food          
pod           item          eat           really        bag           
bean          ordered       day           try           jerky         
machine       received      thing         dont          popcorn       
Coffee        bag           Diet          smell         salt          
Keurig        time          three         food          brand         
ground        package       got           tasted        cracker       
weak          date          ingredient    stuff         beef          


topic 5       topic 6       topic 7       topic 8       topic 9       
--------      --------      --------      --------      --------      
Chin

In [15]:
vect = CountVectorizer(max_df=.10, analyzer=text_process)
X = vect.fit_transform(low_train['SumTxt'])

In [16]:
document_topics = lda.fit_transform(X)

In [17]:
print("lda.components_.shape: {}".format(lda.components_.shape))

lda.components_.shape: (10, 57534)


In [18]:
sorting = np.argsort(lda.components_, axis=1)[:, ::-1]
feature_names = np.array(vect.get_feature_names())

In [19]:
mglearn.tools.print_topics(topics=range(10), feature_names=feature_names,
                           sorting=sorting, topics_per_chunk=5, n_words=10)

topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
order         bar           price         cat           dog           
ordered       salt          store         eating        cat           
item          soup          oz            day           treat         
received      peanut        cost          thing         eat           
package       butter        pack          Diet          China         
disappointed  cheese        per           Science       chicken       
date          fat           local         got           jerky         
return        noodle        1             little        meat          
opened        eat           2             always        chew          
got           sauce         le            three         pet           


topic 5       topic 6       topic 7       topic 8       topic 9       
--------      --------      --------      --------      --------      
cup 

In [20]:
def text_process(mess):
    nopunc = mess.translate(str.maketrans('', '', string.punctuation))
    stop_words = set(stopwords.words("english"))
    nopunc2 = [word for word in nopunc.split() if word.lower() not in stop_words]
    nopunc2 = [word.lower() for word in nopunc2]
    return [wnl.lemmatize(word) for word in nopunc2]

In [23]:
vect = CountVectorizer(max_df=.10, analyzer=text_process)
X = vect.fit_transform(low_train['SumTxt'])

In [24]:
document_topics = lda.fit_transform(X)

In [25]:
sorting = np.argsort(lda.components_, axis=1)[:, ::-1]
feature_names = np.array(vect.get_feature_names())

In [26]:
mglearn.tools.print_topics(topics=range(10), feature_names=feature_names,
                           sorting=sorting, topics_per_chunk=5, n_words=10)

topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
price         order         cat           smell         tea           
store         item          eat           water         cup           
pack          ordered       ingredient    stuff         chocolate     
water         received      chicken       ever          drink         
oz            package       meat          ive           green         
cost          date          corn          jerky         bitter        
bean          return        rice          sauce         kcups         
1             disappointed  baby          away          hot           
per           arrived       meal          tasted        brand         
2             stale         dog           bottle        weak          


topic 5       topic 6       topic 7       topic 8       topic 9       
--------      --------      --------      --------      --------      
eat 

In [27]:
from nltk.stem.snowball import SnowballStemmer

In [28]:
stemmer = SnowballStemmer("english")

In [29]:
def text_process(mess):
    nopunc = mess.translate(str.maketrans('', '', string.punctuation))
    stop_words = set(stopwords.words("english"))
    nopunc2 = [word for word in nopunc.split() if word.lower() not in stop_words]
    nopunc2 = [word.lower() for word in nopunc2]
    return [stemmer.stem(word) for word in nopunc2]

In [30]:
vect = CountVectorizer(max_df=.10, analyzer=text_process)
X = vect.fit_transform(low_train['SumTxt'])

In [31]:
document_topics = lda.fit_transform(X)

In [32]:
sorting = np.argsort(lda.components_, axis=1)[:, ::-1]
feature_names = np.array(vect.get_feature_names())

In [33]:
mglearn.tools.print_topics(topics=range(10), feature_names=feature_names,
                           sorting=sorting, topics_per_chunk=5, n_words=10)

topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
dog           day           chocol        dog           price         
china         thing         cup           cat           store         
treat         got           kcup          treat         cost          
pet           cat           cooki         chicken       oz            
caus          littl         roast         formula       per           
bone          three         pod           well          pay           
chicken       whatev        bean          feed          meat          
compani       feed          hot           old           pack          
warn          alway         dark          chang         local         
problem       real          brew          diet          expens        


topic 5       topic 6       topic 7       topic 8       topic 9       
--------      --------      --------      --------      --------      
tea 