In [None]:
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

# Tunable settings for the topic-modelling run.
n_top_words = 4    # number of terms printed for each topic
n_components = 5   # number of topics requested from the LDA model
n_features = 50    # vocabulary cap handed to the vectorizers as max_features
n_samples = 30     # sample size for the (commented-out) 20-newsgroups corpus

In [None]:
%time
# dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=("headers", "footers", "quotes"))
# datasamples = dataset.data[:n_samples]


## own data from the amazon reviews

import pandas as pd

# location of the data
datafile = "../data/digitalmusic/reviews/complete_reviews.csv"

data = pd.read_csv(datafile, dtype={"asin": str, "helpful": str, "reviewText": str, "reviewerID": str})

# data.head

In [None]:
## Shape the data so it can be fed to the vectorizers / LDA.
# We now collect every review (and its rating) whose `reviewerID` equals the
# id below.
# NOTE(review): "B00LOMZDM8" has the form of an Amazon product id (ASIN) and
# the original notes speak of "the item" / "the same product", yet the filter
# is applied to `reviewerID` -- confirm which column was intended.
user_id1 = "B00LOMZDM8"

# Row selection: keep only the rows that belong to that id.
row_data = data[data['reviewerID'].eq(user_id1)]

# Column selection is positional: column 3 is used as the review text.
# Presumably this is `reviewText` -- verify against the CSV header.
review_data = row_data.iloc[:, 3]
datatext = review_data.tolist()

# Ratings for the same rows, taken from column 2 (presumably the score column
# -- verify against the CSV header).
rating_data = row_data.iloc[:, 2].tolist()

print(datatext)

print(rating_data)
# Non-null count per column of the full (unfiltered) frame.
data.count()

In [None]:
# tfidf vectorizer 

%time
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(datatext)

In [None]:
# tf vectorizer used for LDA - topiuc Modelling

%time
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words='english')
tf = tf_vectorizer.fit_transform(datatext)

In [None]:
# Build the LDA topic model and fit it on the term-count matrix `tf`.
lda = LatentDirichletAllocation(
    n_components=n_components,   # number of topics to extract
    max_iter=25,
    learning_method='online',
    learning_offset=50,
    random_state=0,              # fixed seed so repeated runs give the same topics
)

# Kept as the last expression so the fitted estimator is shown as cell output.
lda.fit(tf)

In [None]:
# Interactive visualisation of the fitted LDA model with pyLDAvis.
import pyLDAvis

# pyLDAvis 3.4 renamed its scikit-learn adapter from `pyLDAvis.sklearn` to
# `pyLDAvis.lda_model`. Try the current name first and fall back to the old
# one so the cell runs on both newer and older installations.
try:
    import pyLDAvis.lda_model as pyldavis_sklearn
except ImportError:
    import pyLDAvis.sklearn as pyldavis_sklearn

# prepare() receives the fitted model, the document-term matrix it was fitted
# on, and the vectorizer that produced that matrix.
# (There are different variations of this function for other model libraries.)
tree = pyldavis_sklearn.prepare(lda, tf, tf_vectorizer)

pyLDAvis.display(tree)



In [None]:

def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: object exposing ``components_``, an iterable of per-topic
            weight arrays (each must support ``argsort()``).
        feature_names: sequence mapping a column index to its term.
        n_top_words: how many terms to list for each topic.

    Returns:
        None. One ``Topic #<idx>: ...`` line is printed per topic, followed
        by a blank line.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort() is ascending, so slice from the end backwards to obtain
        # the indices of the n_top_words largest weights, biggest first.
        ranked = weights.argsort()[:-n_top_words - 1:-1]
        top_terms = [feature_names[i] for i in ranked]
        header = "Topic #%d: " % topic_idx
        print(header + " ".join(top_terms))
    print()

In [None]:
# Look up the vocabulary learned by the count vectorizer.
# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed the old
# get_feature_names(); prefer the new accessor and fall back to the old one so
# the cell works on either side of that change.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    tf_feature_names = tf_vectorizer.get_feature_names_out()
else:
    tf_feature_names = tf_vectorizer.get_feature_names()

# List the n_top_words strongest terms of each LDA topic.
print_top_words(lda, tf_feature_names, n_top_words)