In [1]:
# Import Statements
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [2]:
files = '../ASIN_Scraper/ASIN_SCRAPER/spiders/'
data = pd.read_csv(files + 'reviews.csv')

In [3]:
# Set up data
revs = np.asarray([
    'The new Amazon Review Summary tool is so awesome! I love how it tells me exactly what I need about a product',
    'The Amazon Review Summary tool honestly changed my life. It has been the biggest blesing I could have asked for',
    'I don\'t know how I\'ve lived my life without this. The Amazon Review Summary Tool gets a 5 out of 5!',
    'I HATE the Amazon Review Summary tool. It\'s absolutely horrible. Do not use!',
    'The Amazon Review Summary tool honestly makes me want to jump off a bridge. Please do NOT use!',
    'If you use the Amazon Review Summary tool, you are honestly wasting your time. It is so so bad!',
])
revs = data['reviewComment'].tolist()

In [4]:
revs_dirty = revs[0:1000]

In [5]:
revs = []
for i in revs_dirty:
    if type(i) != float:
        revs.append(i)

In [6]:
# Encode articles 

In [6]:
# Loading model 
models = ['stsb-bert-base', 'stsb-roberta-base', 'stsb-bert-large', 'stsb-roberta-large']
model = SentenceTransformer(models[1])

In [7]:
# Retrieve vectors 
vecs = model.encode(revs)

In [8]:
# Running PCA on data 
pca = PCA(n_components = 50)
components = pca.fit_transform(vecs)
print('Percent variance explained: {:.2f}'.format(sum(pca.explained_variance_ratio_)))

Percent variance explained: 0.82


In [34]:
# Visualizing PCA
plt.figure()
plt.scatter(components[:,0], components[:,1])
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("PCA Visualization of Reviews")
plt.show()

In [32]:
%matplotlib auto
from mpl_toolkits import mplot3d
fig = plt.figure()
ax = plt.axes(projection='3d')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_zlabel('Principal Component 3')
ax.text2D(0.05, 0.95, "PCA Visualization of Reviews", transform=ax.transAxes)

Using matplotlib backend: MacOSX


Text(0.05, 0.95, 'PCA Visualization of Reviews')

In [33]:
ax.scatter3D(components[:,0], components[:,1], components[:,2], c=components[:,2], cmap='Greens')

In [35]:
distortions = []
K = range(1,100)
for k in K:
    kmeanModel = KMeans(n_clusters=k)
    kmeanModel.fit(components)
    distortions.append(kmeanModel.inertia_)

plt.figure(figsize=(16,8))
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()

In [36]:
# Clustering 
K = 20
clusters = KMeans(n_clusters=K, random_state=0).fit(components)

In [41]:
# Visualizing Clusters
fig = plt.figure()
ax = plt.axes(projection='3d')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_zlabel('Principal Component 3')
ax.text2D(0.05, 0.95, "PCA Visualization of Clustered Reviews (k=20)", transform=ax.transAxes)
for i in range(K):
    labels = clusters.labels_ == i
    ax.scatter3D(components[labels,0], components[labels,1], components[labels,2])

In [42]:
# Printing out different clusters
revs = np.asarray(revs)
for i in range(K):
    labels = clusters.labels_ == i
    print('Cluster #:', i)
    for j, review in enumerate(revs[labels]):
        print(j, review)

Cluster #: 0
0 It was a Christmas present for my daughter. She went to Barnes & Noble and traded it in for the more expensive model, don't remember the name of it. She's very happy, tho, with the new one.
1 This was a birthday gift from a friend.  I love it.  It is light, easy to use, and you can read with your light off.
2 Bought as a present.  She loves it.
3 Got it for my mom she just loves it
4 Was a gift
5 It was a gift and I love it.
6 Gift for wife and she loves it.
7 This was a gift for my grandson. He acknowledged receiving it. Iwill have to check later to see if he has read it.
8 Ordered for our granddaughter for Christmas.  She asked for it last year after seeing mine.  She started using it right away.
9 Took it on vacation and am very happy with it.
10 Another gift, won't know until Christmas before it is used
11 gift loved it
12 Wonderful product. Purchased two and both were a hit at Christmas time! Love these types of gifts and always seem to give them to others!
13 It wa

In [43]:
# Finding review closest to each center 
centers = clusters.cluster_centers_
closest = [] 
for i in range(K):
    labels = clusters.labels_ == i
    dist = float('inf')
    closest_review = None 
    for review_vec, review in zip(components[labels],revs[labels]):
        dist_to_center = np.linalg.norm(review_vec - centers[i])
        if dist_to_center < dist:
            closest_review = review
    closest.append(review)
for i, r in enumerate(closest):
    print('Cluster Rep #', i, r)

Cluster Rep # 0 It was a gift but they liked it
Cluster Rep # 1 I have had two previous generations of NOOKs before getting the NOOK Simple Touch with Glowlight (NSTGL) about three months ago.  I don't buy ebooks; I either check them out from the library or get free public domain books.  The NOOK is a great device for that.
Cluster Rep # 2 Classic . Nice collection of short stories that stand the test of time.
Cluster Rep # 3 A wonderful collection of The writer's fine tales! It really is difficult to surpass the English authors of yesteryear, especially writers of the classics.
Cluster Rep # 4 just what I wanted. you are awesome.. thank you
Cluster Rep # 5 The best birthday present I ever got.  Can't count how many books I read on this thing.
Cluster Rep # 6 I had some issues first learning it, but now I love my NOOK!! I am very happy with this purchase.
Cluster Rep # 7 For $10 more you can get the 7in HD that has a color screen.
Cluster Rep # 8 I have used Nook readers since they cam

In [45]:
import torch
import json 
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config

model = T5ForConditionalGeneration.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')
device = torch.device('cpu')

text = """
I can't fucking take it. I see an image of a random object posted and then I see it,
I fucking see it. "Oh that looks kinda like the among us guy" it started as. 
That's funny, that's a cool reference. But I kept going, I'd see a fridge that looked like among us, 
I'd see an animated bag of chips that looked like among us, I'd see a hat that looked like among us. 
And every time I'd burst into an insane, breath deprived laugh staring at the image as the words AMOGUS 
ran through my head. It's torment, psychological torture, I am being conditioned to laugh maniacly any 
time I see an oval on a red object. I can't fucking live like this... I can't I can't I can't I can't 
I can't! And don't get me fucking started on the words! I'll never hear the word suspicious again without 
thinking of among us. Someone does something bad and I can't say anything other than "sus." I could watch 
a man murder everyone I love and all I would be able to say is "red sus" and laugh like a fucking insane
person. And the word "among" is ruined. The phrase "among us" is ruined. I can't live anymore. Among us 
has destroyed my fucking life. I want to eject myself from this plane of existence. MAKE IT STOP!
"""
text =" ".join(closest)

preprocess_text = text.strip().replace("\n","")
t5_prepared_Text = "summarize: "+preprocess_text
print ("original text preprocessed: \n", preprocess_text)

tokenized_text = tokenizer.encode(t5_prepared_Text, return_tensors="pt").to(device)

# summmarize 
summary_ids = model.generate(tokenized_text,
                                    num_beams=4,
                                    no_repeat_ngram_size=2,
                                    min_length=0,
                                    max_length=100,
                                    early_stopping=True)

output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print ("\n\nSummarized text: \n",output)

Token indices sequence length is longer than the specified maximum sequence length for this model (1266 > 512). Running this sequence through the model will result in indexing errors


original text preprocessed: 
 It was a gift but they liked it I have had two previous generations of NOOKs before getting the NOOK Simple Touch with Glowlight (NSTGL) about three months ago.  I don't buy ebooks; I either check them out from the library or get free public domain books.  The NOOK is a great device for that. Classic . Nice collection of short stories that stand the test of time. A wonderful collection of The writer's fine tales! It really is difficult to surpass the English authors of yesteryear, especially writers of the classics. just what I wanted. you are awesome.. thank you The best birthday present I ever got.  Can't count how many books I read on this thing. I had some issues first learning it, but now I love my NOOK!! I am very happy with this purchase. For $10 more you can get the 7in HD that has a color screen. I have used Nook readers since they came out. That being said, do not buy this version. I have had 3.....all within the same year. All 3 have experienced

In [None]:
" ".join(closest)