In [1]:
# Mount Google Drive into the Colab VM so the project folder is reachable
# under /content/drive.
from google.colab import drive
drive.mount('/content/drive')


# Project folder, relative to "My Drive".
FOLDERNAME = "ENPM809K/final_project_v2/"
# NOTE(review): FOLDERNAME is a string literal here, so this check cannot fail
# as written; it only guards against the value being replaced with None.
assert FOLDERNAME is not None, "[!] Enter the foldername."

# Now that Drive is mounted, add the project folder to the import path so
# the Python interpreter of the Colab VM can load python files from within it
# (the classifier and web_scrapper imports further down rely on this).
import sys
sys.path.append('/content/drive/My Drive/{}'.format(FOLDERNAME))

# Make the project folder the working directory so the relative cache/ paths
# used by later cells resolve inside it. IPython expands $FOLDERNAME.
%cd /content/drive/My\ Drive/$FOLDERNAME

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/ENPM809K/final_project_v2


In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to the imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
!pip install transformers
!pip install sentencepiece

import torch 
from torch.utils.data import DataLoader

from classifier.utils import load_data, encode_text, CustomDataset, create_custom_dataset
from web_scrapper import Scrapper
from classifier.model import ReviewClassifier
from classifier.predict import predict, product_recomendation_score

1. Web scraping reviews

In [None]:
# Web scraping (disabled). The lines below fetch all reviews for one Amazon
# product page and save them to cache/scrapped_data/reviews.csv, the directory
# the next section loads from. Uncomment them only to re-scrape.
# url = "https://www.amazon.com/Nine-West-Womens-Silver-Tone-Black/dp/B0721SGTGY/?_encoding=UTF8&pd_rd_w=6eJqF&content-id=amzn1.sym.e4bd6ac6-9035-4a04-92a6-fc4ad60e09ad&pf_rd_p=e4bd6ac6-9035-4a04-92a6-fc4ad60e09ad&pf_rd_r=TFW8DVB8ETT6J8CWEAP6&pd_rd_wg=TLy4F&pd_rd_r=2b9d59aa-aba1-431c-86e8-085e83989c2f&ref_=pd_gw_ci_mcx_mr_hp_atf_m&th=1"

# scrapper = Scrapper()
# scrapped_html = scrapper.get_all_reviews(url)
# scrapper.save_as_csv("cache/scrapped_data/reviews.csv")

2. Loading data to df and preprocessing data

In [4]:
# Directory holding the scraped review data (the scraping cell above writes
# reviews.csv into it).
review_dir = "cache/scrapped_data/"
# NOTE(review): load_data is a project helper not shown here; presumably it
# reads the CSV file(s) found in review_dir into a single DataFrame - confirm
# in classifier/utils.py.
review_df = load_data(review_dir)
# Preview the first rows (columns: customer_rating, customer_review).
review_df.head()

Unnamed: 0,customer_rating,customer_review
0,1.0,I loved the face of it and I get lots of compl...
1,1.0,I love it
2,1.0,Its beautiful and as expected I know she will ...
3,1.0,Face of watch is lovely
4,1.0,It is a great product Thank you


In [5]:
# Remove neutral reviews (customer_rating == 0) so only the negative (-1) and
# positive (+1) ratings remain.
# A boolean mask filters by row position, so it stays correct even if the index
# contains duplicate labels (dropping by index label would also remove any
# non-neutral row that shares a label with a neutral one). Rebinding the name
# instead of using inplace=True keeps the cell safe to re-run.
review_df = review_df[review_df['customer_rating'] != 0]
print(review_df['customer_rating'].value_counts())
review_df.head()

 1.0    500
-1.0     57
Name: customer_rating, dtype: int64


Unnamed: 0,customer_rating,customer_review
0,1.0,I loved the face of it and I get lots of compl...
1,1.0,I love it
2,1.0,Its beautiful and as expected I know she will ...
3,1.0,Face of watch is lovely
4,1.0,It is a great product Thank you


3. Creating custom dataset object and data loader

In [6]:
# Pull the text and rating columns out of the filtered frame.
reviews = review_df['customer_review']
labels = review_df['customer_rating']

# Wrap the raw arrays in the project's dataset object, tokenized with the
# cased BERT vocabulary and a 128-token encoding length.
# NOTE(review): the ratings here are -1.0 / 1.0, while predictions are later
# compared against 0 / 1 - presumably the labels are unused at inference time
# or remapped inside the dataset; confirm in classifier/utils.py.
review_dataset = create_custom_dataset(
    reviews.to_numpy(),
    labels.to_numpy(),
    max_encoding_len=128,
    tokenizer_type="bert-base-cased",
)

In [7]:
# Number of reviews fed to the classifier per forward pass.
batch_size = 16

# Batch the dataset for inference; no shuffling is requested, so batches follow
# the dataset's own order.
review_data_loader = DataLoader(dataset=review_dataset, batch_size=batch_size)

In [8]:
# Number of batches: 557 remaining reviews at 16 per batch, rounded up, gives 35.
len(review_data_loader)

35

4. Switching to GPU

In [9]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [10]:
# Class names indexed by predicted class id: 0 -> negative, 1 -> positive.
classes = ['negative', 'positive']
model = ReviewClassifier(len(classes))

# map_location remaps the checkpoint's tensors onto the selected device, so a
# checkpoint saved from a GPU session still loads when only the CPU is available.
state_dict = torch.load('cache/trained_model.bin', map_location=device)
model.load_state_dict(state_dict)

# This notebook only runs inference, so switch to eval mode (disables dropout).
# NOTE(review): harmless if predict() already does this - confirm.
# Chained into the assignment so the cell does not echo the whole model repr.
model = model.to(device).eval()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


5. Predicting sentiment of review text

In [None]:
# Run the classifier over every batch of reviews.
# NOTE(review): predict is a project helper not shown here. Judging by the cells
# below, `predictions` supports .numpy() and holds 0/1 class ids, and
# `review_texts` is indexable in the same order - confirm in
# classifier/predict.py. `prediction_probs` is not used in the visible cells.
review_texts, predictions, prediction_probs = predict(model, review_data_loader, device)

In [12]:
# Collapse the per-review predictions into a single product-level score.
# NOTE(review): the printed value (~0.9013) matches 502/557, i.e. it looks like
# the share of reviews predicted positive - confirm against
# product_recomendation_score in classifier/predict.py.
score = product_recomendation_score(predictions)
# .item() unwraps the single-element result into a plain Python number.
print("product recommendation score: ", score.item())

product recommendation score:  0.9012567400932312


### 6. Text Summarization

In [38]:
# Split the review texts by predicted class (1 = positive, 0 = negative) and
# join each group into one ". "-separated string for the summarizer.
# str.join builds each string in a single pass with no leading separator to
# strip afterwards, and the loop index is named `idx` so the builtin `id` is
# not shadowed for the rest of the session.
predicted_labels = predictions.numpy()

pos_review_text = ". ".join(
    str(review_texts[idx])
    for idx, label in enumerate(predicted_labels)
    if label == 1
)
neg_review_text = ". ".join(
    str(review_texts[idx])
    for idx, label in enumerate(predicted_labels)
    if label == 0
)

In [39]:
from transformers import PegasusForConditionalGeneration
from transformers import PegasusTokenizer

In [None]:
# Checkpoint used for summarization; the xsum variant is kept for reference.
# NOTE(review): pegasus-large appears to be the general pre-trained checkpoint
# rather than one fine-tuned on a summarization dataset, and the summaries at
# the end of this notebook read like verbatim review text - check whether a
# fine-tuned checkpoint (such as the xsum one below) was intended.
# model_name = "google/pegasus-xsum"
model_name = "google/pegasus-large"

# Load the pretrained tokenizer that matches the chosen checkpoint
pegasus_tokenizer = PegasusTokenizer.from_pretrained(model_name)

In [None]:
# Load the PEGASUS sequence-to-sequence model for the same checkpoint.
# NOTE(review): unlike the classifier, this model is never moved to `device` in
# the visible cells, so generation presumably runs on the CPU - confirm that is
# intended.
pegasus_model = PegasusForConditionalGeneration.from_pretrained(model_name)

In [49]:
# Tokenize each concatenated review string as PyTorch tensors.
# NOTE(review): truncation=True clips each text to the tokenizer's maximum
# input length, so with hundreds of reviews joined together only the leading
# part presumably reaches the summarizer - confirm this is acceptable.
tokenizer_options = dict(truncation=True, padding="longest", return_tensors="pt")

pos_review_text_tokens = pegasus_tokenizer(pos_review_text, **tokenizer_options)
neg_review_text_tokens = pegasus_tokenizer(neg_review_text, **tokenizer_options)

In [50]:
def summarize(tokens, max_length=150, min_length=30):
  """Generate a summary for one tokenized text with the PEGASUS model.

  Uses the notebook-level ``pegasus_model`` and ``pegasus_tokenizer``.

  Args:
    tokens: Mapping of keyword arguments forwarded to
      ``pegasus_model.generate`` (the tokenizer output for the text).
    max_length: Upper bound on the generated sequence length, in tokens.
      Longer summaries are cut off at this limit.
    min_length: Lower bound on the generated sequence length, in tokens.

  Returns:
    The first generated sequence decoded to a string, with special tokens
    removed.
  """
  # Summarize text
  encoded_summary = pegasus_model.generate(
      **tokens,
      max_length=max_length,
      min_length=min_length,
  )

  # Decode summarized text; only the first generated sequence is used.
  return pegasus_tokenizer.decode(encoded_summary[0], skip_special_tokens=True)

In [51]:
# Summarize the positive and the negative review text separately.
pos_summary = summarize(pos_review_text_tokens)
neg_summary = summarize(neg_review_text_tokens)

In [52]:
# Display the positive summary. The text below stops mid-sentence, presumably
# because generation hit the max_length=150 token cap set in summarize().
pos_summary

'For the price this watch is lovely If you like rose gold you especially will not be disappointed with itIf you like larger watch faces you will also really like this I have very narrow wrists and I still love the size of the face I needed the very last option for fastening and probably could use one more however it really does not slide around much It comes packaged very nicely in a large box fastened around a pillow It arrived in excellent shape It also had a piece of plastic separating the knob so that the battery was preserved which I thought was brilliant I liked the design so much and since it was really sold at an affordable price I ordered the exact same watch in the black I love floral things and really wanted a statement watch with flowers this one fit the'

In [53]:
# Display the negative summary. The text below stops mid-sentence, presumably
# because generation hit the max_length=150 token cap set in summarize().
neg_summary

'I had wanted this watch for a long time so when it was on sale on Amazon Prime Day I couldnt resist buying it I regret buying this gift It is a nice looking watch However after wearing it only a few times the watch band started to fall apart The band has separated in half Im sure I can glue it back together but for the price I would expect it to last more than a few times of wearing I have a 5 watch from Walmart that Ive had for about 2 years and is still in excellent condition after wearing many more times than the Nine West watch I had bought 2 other Nine West watches after this purchase and still love them They both have metal bands If it werent for the cheap band they put on this watch I'