# Text Representation

Please note that this notebook is intended to be run in Google Colab.

In [1]:
# Input files, given as paths relative to the notebook's working directory.
train_file = "train.pkl"
test_file = "test.pkl"
meta_file = "meta_All_Beauty.json.gz"  # gzip-compressed, one JSON object per line

# Exercise 1

Load the [metadata file](https://nijianmo.github.io/amazon/index.html) and discard any item that was not rated by our subset of users (i.e. that appears in neither the training nor the test set). Apply preprocessing (stemming and stopword removal) to clean up the text of the "title" field. Report the vocabulary size before and after the preprocessing.

In [2]:
import gzip
import os
import sys
sys.path.append('../')
import pickle
import pandas as pd
import json
import numpy as np

def parse(path):
    """Yield one decoded JSON object per line of a gzip-compressed file.

    Parameters
    ----------
    path : str
        Path to a gzip file containing one JSON document per line.

    Yields
    ------
    object
        The result of ``json.loads`` applied to each line, in file order.
    """
    # The context manager closes the handle once the generator is exhausted
    # (or closed early); previously the file was left open.
    with gzip.open(path, 'rb') as g:
        for l in g:
            yield json.loads(l)

def getDF(path):
    """Load every record produced by ``parse(path)`` into a DataFrame.

    Records are keyed by their 0-based position in the file, which becomes
    the row index; each record's keys become columns.
    """
    records_by_position = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records_by_position, orient='index')


# Use the paths declared in the configuration cell instead of repeating the literals.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files from a trusted source.
train = pd.read_pickle(train_file)
test = pd.read_pickle(test_file)


# Load the product metadata and keep only the last record for each asin.
meta = getDF(meta_file)
print("Before dropping dups:", len(meta))
meta = meta.drop_duplicates(["asin"], keep="last")
print("After:", len(meta))


Before dropping dups: 32892
After: 32488


In [3]:
# Every product id that occurs in either split, de-duplicated.
rated_asins = [*train.asin.to_list(), *test.asin.to_list()]
allowed_asins = np.unique(np.array(rated_asins))

# Keep only metadata rows for those products and renumber the rows from 0.
is_rated = meta["asin"].isin(allowed_asins)
meta = meta[is_rated].reset_index(drop=True)

len(allowed_asins), len(meta)

(84, 84)

In [4]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ayaya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ayaya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ayaya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ayaya\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [5]:
# WordNetLemmatizer was used without being imported, so this cell raised
# NameError on a fresh kernel. Import it and fetch the data it needs.
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
nltk.download('omw-1.4')

wnl = WordNetLemmatizer()
s = PorterStemmer()

# lemmatize() treats each word as a noun unless a pos is passed, which is why
# "jumped" comes back unchanged here.
[wnl.lemmatize(w) for w in ["hello", "hi", "jumped"]]

['hello', 'hi', 'jumped']

In [6]:
from bs4 import BeautifulSoup
import re
from pattern.en import lemma

# based on https://stackoverflow.com/questions/45670532/stemming-words-with-nltk-python 
def cleanWords(x, stemmer="porter"):
    """Normalise raw text into a space-separated string of stems.

    Steps: strip HTML, lowercase, replace everything except a-z with spaces,
    tokenize, drop English stopwords, stem or lemmatize, and discard results
    of length 1.

    Parameters
    ----------
    x : str
        Raw text, possibly containing HTML markup or entities. Non-string
        values (e.g. NaN for a missing title) produce an empty string.
    stemmer : str, default "porter"
        "lemma" applies ``pattern.en.lemma``; any other value applies NLTK's
        ``PorterStemmer``.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    if not isinstance(x, str):
        return ""

    #choose stemming/lemmatization
    stem = lemma if stemmer == "lemma" else PorterStemmer().stem

    # Name the parser explicitly: without it bs4 picks whichever parser is
    # installed (and warns), so output could differ between environments.
    txt = BeautifulSoup(x, "html.parser").get_text()   #remove html
    letters = re.sub("[^a-z]", " ", txt.lower())       #lowercase and only letters
    words = word_tokenize(letters)                     #tokenize
    stops = set(stopwords.words("english"))            #define stopwords
    stems = [stem(w) for w in words if w not in stops] #stem/lemmatize

    return ' '.join(w for w in stems if len(w) > 1)

# Work on a copy so the metadata frame itself is left untouched.
df = meta.copy()
df["words"] = df["title"].apply(cleanWords)

df["words"]

0                aqua velva shave classic ice blue ounc
1                citr shine moistur burst shampoo fl oz
2                                   nar blush taj mahal
3        avalon organ wrinkl therapi coq cleans milk oz
4                          zum zum bar anis lavend ounc
                            ...                        
79                     ultim bodi lotion michael kor oz
80                     dolc gabbana compact parfum ounc
81    colgat kid maximum caviti protect pump toothpa...
82    bali secret natur deodor organ vegan women men...
83                          essi gel coutur nail polish
Name: words, Length: 84, dtype: object

In [7]:
# unique words in vocab (excludes 1-letter stems)
# split() with no argument drops empty tokens, so a title that cleans to ""
# does not add a spurious '' entry to the vocabulary.
len(set(' '.join(df["words"]).split()))

394

# Exercise 2

Representation in vector spaces.

## 2.1

Represent all the products from Exercise 1 in a TF-IDF space. Interpret the meaning of the TF-IDF matrix dimensions.

Tip: You may use the library [scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html) 

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
# fit_transform() learns the vocabulary/IDF weights and transforms in one
# pass; the separate fit() call that preceded it did the fitting twice.
tfidf = tfidf_vectorizer.fit_transform(df["words"])
names = tfidf_vectorizer.get_feature_names_out()
arr = tfidf.toarray()

# One row per product (indexed by asin), one column per vocabulary term.
df2 = pd.DataFrame(arr, index=df["asin"], columns=names)
df2.to_pickle("asin_tfidf.pkl")
df2

Unnamed: 0_level_0,action,advanc,aerosol,age,ageless,air,allergen,almond,american,andal,...,wintergreen,wiseway,witch,women,wood,work,wrinkl,yardley,youth,zum
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B0000530HU,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000
B00006L9LC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000
B00021DJ32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000
B0002JHI1I,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.351338,0.0,0.0,0.000000
B0006O10P4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.782655
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
B019LAI4HU,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000
B019V2KYZS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000
B01BNEYGQU,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000
B01DKQAXC0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.227715,0.0,0.0,0.000000,0.0,0.0,0.000000


## 2.2

Compute the cosine similarity between the products with asin 'B000FI4S1E', 'B000LIBUBY' and 'B000W0C07Y'. Take a look at their features to see whether the results make sense given their characteristics. 

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

asins = ["B000FI4S1E", "B000LIBUBY", "B000W0C07Y"]

# Pairwise cosine similarity between the three products' TF-IDF rows,
# labelled by asin on both axes.
pairwise = cosine_similarity(df2.loc[asins])
sims = pd.DataFrame(pairwise.round(3), index=asins, columns=asins)
sims

Unnamed: 0,B000FI4S1E,B000LIBUBY,B000W0C07Y
B000FI4S1E,1.0,0.034,0.025
B000LIBUBY,0.034,1.0,0.441
B000W0C07Y,0.025,0.441,1.0


In [10]:
# The two perfumes are similar in tf-idf space, while the shower gel shares less similarity.
feature_cols = ["description", "title", "brand", "rank", "details", "main_cat", "asin", "words"]
df.loc[df["asin"].isin(asins), feature_cols]

Unnamed: 0,description,title,brand,rank,details,main_cat,asin,words
6,[Fruits &amp; Passion SOLSTIS Refreshing Showe...,Fruits &amp; Passion Blue Refreshing Shower Ge...,Fruits & Passion,"2,539,624 in Beauty & Personal Care (","{'Shipping Weight:': '8 ounces', 'ASIN: ': 'B0...",All Beauty,B000FI4S1E,fruit passion blue refresh shower gel fl oz
10,[<li>A brilliant effervescent fragrance for yo...,"Fresh Eau de Parfum, Sugar Lemon, 3.4 oz",Fresh,"572,901 in Beauty & Personal Care (","{'  Item Weight: ': '3.36 ounces', 'Sh...",All Beauty,B000LIBUBY,fresh eau de parfum sugar lemon oz
17,"[, For Women]",Sex In The City Kiss by Instyle Parfums Eau De...,Instyle Parfums,"616,259 in Beauty & Personal Care (","{'  Item Weight: ': '0.8 ounces', 'Shi...",All Beauty,B000W0C07Y,sex citi kiss instyl parfum eau de parfum spra...


## Pretrained word2vec (Google News 300) 

In [11]:
import gensim.downloader
# https://towardsdatascience.com/text-classification-with-nlp-tf-idf-vs-word2vec-vs-bert-41ff868d1794
# Load the pretrained 'word2vec-google-news-300' vectors through gensim's
# downloader; later cells look words up with word2vec_vectors[word].
word2vec_vectors = gensim.downloader.load('word2vec-google-news-300')


In [20]:
# TF-IDF
# Advantages:
# - Easy to compute
# - You have some basic metric to extract the most descriptive terms in a document
# - You can easily compute the similarity between 2 documents using it

# Disadvantages:
# - TF-IDF is based on the bag-of-words (BoW) model, therefore it does not capture position in text, semantics, co-occurrences in different documents, etc.
# - For this reason, TF-IDF is only useful as a lexical level feature
# - Cannot capture semantics (e.g. as compared to topic models, word embeddings)


def cleanWords2(x):
    """Clean raw text and keep only lemmas known to the word2vec model.

    Steps: strip HTML, lowercase, replace everything except a-z with spaces,
    tokenize, drop English stopwords, lemmatize with ``pattern.en.lemma``,
    then keep lemmas longer than one character that have a word2vec vector.

    Parameters
    ----------
    x : str
        Raw text, possibly containing HTML markup or entities. Non-string
        values (e.g. NaN for a missing title) produce an empty string.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces (may be empty).
    """
    if not isinstance(x, str):
        return ""

    # Explicit parser: otherwise bs4 picks whichever parser is installed.
    txt = BeautifulSoup(x, "html.parser").get_text()   #remove html
    letters = re.sub("[^a-z]", " ", txt.lower())       #lowercase and only letters
    words = word_tokenize(letters)                     #tokenize
    stops = set(stopwords.words("english"))            #define stopwords
    lemmas = [lemma(w) for w in words if w not in stops]  #lemmatize

    # key_to_index is a dict, so membership is a hash lookup; testing against
    # the index_to_key list scanned the whole vocabulary for every word.
    known = word2vec_vectors.key_to_index
    return ' '.join(w for w in lemmas if len(w) > 1 and w in known)

df = meta.copy()
df["words"] = df["title"].apply(cleanWords2)

# split() with no argument drops empty tokens, so a title with no
# in-vocabulary words does not add a spurious '' entry to the count.
len(set(' '.join(df["words"]).split()))

351

In [19]:
def get_vector(x):
    """Average the word2vec vectors of the whitespace-separated words in ``x``.

    Parameters
    ----------
    x : str
        Space-separated words, each expected to have a vector in
        ``word2vec_vectors``.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``word2vec_vectors.vector_size``. An empty
        or whitespace-only ``x`` returns the zero vector instead of raising.

    Raises
    ------
    KeyError
        If a word has no vector in ``word2vec_vectors``.
    """
    # split() with no argument ignores repeated/leading/trailing spaces, so an
    # empty document yields [] rather than [''] (which used to raise KeyError).
    words = x.split()
    doc_embedding = np.zeros((word2vec_vectors.vector_size,))
    if not words:
        return doc_embedding
    for word in words:
        doc_embedding += word2vec_vectors[word]
    return doc_embedding / len(words)

# One averaged word2vec vector per product, stacked into a frame keyed by asin.
doc_vectors = df["words"].apply(get_vector)
arr = [np.array(vec) for vec in doc_vectors.to_numpy()]
df3 = pd.DataFrame(arr, index=df["asin"], columns=range(300))
df3.to_pickle("asin_tfidf2.pkl")
df3

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B0000530HU,-0.064453,0.015808,0.087301,0.156169,-0.024231,0.083649,0.074621,-0.236328,0.081584,0.167074,...,-0.006388,-0.121243,-0.064653,-0.003204,0.110291,-0.121277,0.056742,0.026123,0.023010,-0.108927
B00006L9LC,-0.037109,0.143555,0.008291,0.017334,0.072917,0.038086,0.037287,-0.238973,0.050049,0.190755,...,0.074056,-0.020447,0.041300,-0.130086,0.007345,-0.087158,-0.084208,0.034382,-0.002218,-0.054127
B00021DJ32,-0.005325,0.084473,0.081787,0.202393,-0.100220,-0.060181,0.141602,-0.001938,-0.039429,0.140015,...,0.054565,0.029846,0.001984,-0.049034,-0.091888,-0.162369,-0.091736,0.048187,0.019043,0.219971
B0002JHI1I,-0.055629,0.149658,0.059636,0.166574,-0.071777,0.182216,0.039246,-0.304408,-0.035784,0.102295,...,-0.058446,-0.105325,-0.039400,0.013044,-0.023751,-0.016785,0.004395,0.132551,0.160993,-0.078753
B0006O10P4,0.120687,-0.180176,-0.060542,0.074137,0.035075,0.212133,0.170166,-0.273600,0.055583,0.221598,...,0.078613,-0.063639,0.029297,0.115031,0.042175,-0.046038,0.019613,0.010091,0.196696,0.172689
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
B019LAI4HU,-0.116750,0.032349,0.085856,0.114583,-0.033254,0.094320,0.042837,-0.283081,0.089238,0.036402,...,-0.029602,-0.067546,0.027952,0.027059,-0.156708,-0.063965,-0.133169,-0.026825,-0.084595,0.042480
B019V2KYZS,0.032776,-0.047424,0.035645,0.217041,-0.029999,0.160461,0.098083,-0.217773,0.115356,0.181702,...,-0.091675,-0.115889,-0.016052,-0.040283,-0.019806,0.023926,0.034149,-0.015625,0.055420,0.058533
B01BNEYGQU,-0.010361,-0.005264,0.037292,0.078068,-0.026947,0.114807,0.034210,-0.139999,0.109718,0.105042,...,-0.013245,-0.066833,-0.160889,-0.022095,-0.101013,0.054108,-0.060120,0.020233,-0.002808,-0.116760
B01DKQAXC0,-0.025289,0.069427,0.044976,0.144896,-0.053573,0.020009,-0.014752,-0.181085,0.061490,0.161411,...,-0.000699,-0.040436,-0.052290,-0.021389,-0.001712,-0.049068,-0.065881,0.033488,0.063789,0.061069


In [None]:
########
# NOTE(review): the bare name below is not defined anywhere visible, so running
# this cell raises NameError. Presumably a deliberate barrier that stops
# "Run All" before the OPTIONAL section — confirm, or delete the cell.
asdas

OPTIONAL

|

|

|

|





# Exercise 3

Representation in vector spaces with contextual Word Embeddings.

## 3.1.

Represent all the products from Exercise 1 in a vector space using embeddings from a pre-trained BERT model. The final embedding of a product should be the average of the word embeddings from all the words in the 'title'. What is the vocabulary size of the model? What are the dimensions of the last hidden state?

Tip: you may install the transformers library and use their pretrained [BERT model uncased](https://huggingface.co/bert-base-uncased).

In [None]:
# LOAD TRANSFORMER
"""
If you plan on using a pretrained model, it’s important to use the associated 
pretrained tokenizer: it will split the text you give it in tokens the same way
for the pretraining corpus, and it will use the same correspondence
token to index (that we usually call a vocab) as during pretraining.
"""

# % pip install transformers
import torch
import transformers
# Compare the major version numerically: comparing version strings is
# lexicographic, so e.g. '10.0.0' > '4.0.0' evaluates to False.
_transformers_major = int(transformers.__version__.split('.')[0])
assert _transformers_major >= 4, (
    f"transformers >= 4 is required, found {transformers.__version__}"
)

from transformers import BertModel, BertTokenizerFast

# set-up environment
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")


modelname = 'bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(modelname)
model = BertModel.from_pretrained(modelname).to(DEVICE)

# Print out the vocabulary size
print(f"Vocabulary size: {tokenizer.vocab_size}")

In [None]:
from transformers import BertTokenizer, BertModel

# Sanity check on a single sentence, reusing the tokenizer and model loaded in
# the previous cell. Re-instantiating them here replaced the model that had
# been moved to DEVICE with a fresh CPU copy.
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt').to(model.device)
with torch.no_grad():  # inference only, no gradients needed
    output = model(**encoded_input)


In [None]:
# REPRESENT PRODUCTS IN A VECTOR SPACE


def batch_encoding(sentences):
    """Tokenize a batch of sentences and run them through the BERT model.

    Parameters
    ----------
    sentences : list of str
        Texts to encode; they are padded to the longest one in the batch.

    Returns
    -------
    inputs
        The tokenizer output (includes ``input_ids`` and ``attention_mask``),
        moved to the model's device.
    last_hidden_states : torch.Tensor
        ``last_hidden_state`` of the model output for the batch.
    """
    # Since we're using padding, we need to provide the attention masks to our
    # model. Otherwise it doesn't know which tokens it should not attend to. 
    inputs = tokenizer(
        sentences, padding=True, truncation=True, return_tensors="pt"
    ).to(model.device)
    # print(inputs) # Look at the padding and attention_mask

    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**inputs)

    last_hidden_states = outputs.last_hidden_state

    return inputs, last_hidden_states
  
encoded_inputs, title_last_hidden_states = batch_encoding(meta["title"].to_list())

"""
Note that the control token [CLS] has been added 
at the beginning of each sentence, and [SEP] at the end. 
"""

# Now, let's mask out the padding tokens and compute the embedding vector of each product
# The attention mask is 1 for real tokens (including [CLS]/[SEP]) and 0 for
# padding; broadcasting it over the hidden dimension zeroes the padded positions.
padding_mask = encoded_inputs["attention_mask"].unsqueeze(-1).to(title_last_hidden_states.dtype)
summed_states = (title_last_hidden_states * padding_mask).sum(dim=1)
token_counts = padding_mask.sum(dim=1).clamp(min=1)  # guard against division by zero
title_embeddings = summed_states / token_counts

title_embeddings.shape

## 3.2.

Compute the cosine similarity between the products with asin 'B000FI4S1E', 'B000LIBUBY' and 'B000W0C07Y'.