# Text Representation

In [1]:
import json
import pandas as pd
import gzip
import os
from urllib.request import urlopen

In [2]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz

--2022-03-18 20:53:18--  http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10329961 (9.9M) [application/octet-stream]
Saving to: ‘meta_All_Beauty.json.gz’


2022-03-18 20:53:19 (16.5 MB/s) - ‘meta_All_Beauty.json.gz’ saved [10329961/10329961]



# Exercise 1

Load the [metadata file](https://nijianmo.github.io/amazon/index.html) and discard any item that was not rated by our subset of users (nor in training or test sets). Apply preprocessing (stemming and stopwords removal) to clean up the text from the "title". Report the vocabulary size before and after the preprocessing.

In [11]:
import os
import sys
sys.path.append('../')
import pickle
import pandas as pd
import import_ipynb


# Load TRAIN and TEST sets 
from Session_1 import training_data, test_data

# Load the METADATA (ITEMS)
def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

df = getDF('meta_All_Beauty.json.gz')
print(len(df))

# Discard duplicates
df = df.drop_duplicates(subset=['asin']).reset_index(drop=True)
print(len(df))

# Discard items that weren't rated by our subset of users
item_in_training = df['asin'].isin(training_data.append(test_data)['asin'])
df = df[item_in_training].reset_index(drop=True)
print(len(df))

32892
32488
84


In [5]:
import nltk
import string 
import re
nltk.download('punkt')
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer

# <YOUR CODE HERE>
# remove punctuation
def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation])
    return punctuationfree

def clean_non_alpha(text):
    return re.sub('[^a-zA-Z]', ' ', text)

#storing the puntuation free text
# df['title_remove']= df['title'].apply(clean_non_alpha)
# df['clean_title']= df['title_remove'].apply(lambda x: x.lower())
df['clean_title']= df['title'].apply(lambda x: word_tokenize(x))

# the vocabulary size before the preprocessing.
vacabulary = set()

for word in df.clean_title.values:
    for voc in word:
        vacabulary.add(voc)

print(len(vacabulary))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
545


In [7]:
stop_words = stopwords.words()

# stopwords removal
def remove_stopwords(text):
    output= [i for i in text if i not in stop_words]
    return output

# remove punctuation
def remove_punctuation(text):
    output= [i for i in text if i not in string.punctuation]
    return output

# remove number
def remove_number(text):
    output = []
    for i in text:
      try:
        float(i)
      except:
        output.append(i)
    return output

def is_alpha(text):
    output = [i for i in text if i.isalpha()]
    return output

def to_lower(text):
    output= [i.lower() for i in text]
    return output

#defining the object for stemming
porter_stemmer = PorterStemmer()

def stemming(text):
    stem_text = [porter_stemmer.stem(word) for word in text]
    return stem_text

df['clean_title']= df['clean_title'].apply(lambda x: to_lower(x))
df['clean_title']= df['clean_title'].apply(lambda x: is_alpha(x))
df['clean_title'] = df['clean_title'].apply(remove_stopwords)
df['clean_title'] = df['clean_title'].apply(lambda x: stemming(x))
print(df.clean_title)

# the vocabulary size after the preprocessing.
vacabulary = set()
for word in df.clean_title.values:
    for voc in word:
        vacabulary.add(voc)
print(len(vacabulary))

0        [aqua, velva, shave, classic, ice, blue, ounc]
1        [citr, shine, moistur, burst, shampoo, fl, oz]
2                              [nar, blush, taj, mahal]
3     [avalon, organ, wrinkl, therapi, clean, milk, oz]
4                              [bar, ani, lavend, ounc]
                            ...                        
79                        [ultim, lotion, michael, kor]
80          [dolc, amp, gabbana, compact, parfum, ounc]
81    [colgat, kid, maximum, caviti, protect, pump, ...
82    [bali, secret, natur, deodor, organ, amp, vega...
83                    [essi, gel, coutur, nail, polish]
Name: clean_title, Length: 84, dtype: object
368


# Exercise 2

Representation in vector spaces.

## 2.1

Represent all the products from Exercise 1 in a TF-IDF space. Interpret the meaning of the TF-IDF matrix dimensions.

Tip: You may use the library [scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html) 

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()

title_list = []
for text in df.clean_title.values:
    title_list.append(TreebankWordDetokenizer().detokenize(text))
tf_idf_matrix = tfidf_vectorizer.fit_transform(title_list)
tf_idf_array = tf_idf_matrix.toarray()

In [9]:
print("TF-IDF matrix shape:", tf_idf_array.shape)

TF-IDF matrix shape: (84, 367)


## 2.2

Compute and the cosine similarity between products with asin 'B000FI4S1E', 'B000LIBUBY' and 'B000W0C07Y'. Take a look at their features to see whether results make sense with their characteristics. 

In [10]:
from sklearn.metrics.pairwise import cosine_similarity
item_weight_matrix = pd.DataFrame(index=df.asin.values, data=tf_idf_array)

similarity_1e_by = cosine_similarity([item_weight_matrix.loc['B000FI4S1E'].values], [item_weight_matrix.loc['B000LIBUBY'].values])
print("Similarity between 'B000FI4S1E'and 'B000LIBUBY':", round(similarity_1e_by[0, 0], 3))

similarity_1e_7y = cosine_similarity([item_weight_matrix.loc['B000FI4S1E'].values], [item_weight_matrix.loc['B000W0C07Y'].values])
print("Similarity between 'B000FI4S1E'and 'B000W0C07Y':", round(similarity_1e_7y[0, 0], 3))

similarity_by_7y = cosine_similarity([item_weight_matrix.loc['B000LIBUBY'].values], [item_weight_matrix.loc['B000W0C07Y'].values])
print("Similarity between 'B000LIBUBY'and 'B000W0C07Y':", round(similarity_by_7y[0, 0], 3))

Similarity between 'B000FI4S1E'and 'B000LIBUBY': 0.038
Similarity between 'B000FI4S1E'and 'B000W0C07Y': 0.029
Similarity between 'B000LIBUBY'and 'B000W0C07Y': 0.421


In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 5.3 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 4.7 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 43.7 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 31.3 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 36.7 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found ex

# Exercise 3

Representation in vector spaces with contextual Word Embeddings.

## 3.1.

Represent all the products from Exercise 1 in a vector space using embeddings from a pre-trained BERT model. The final embedding of a product should be the average of the word embeddings from all the words in the 'title'. What is the vocabulary size of the model? What are the dimensions of the last hidden state?

Tip: you may install the transformers library and use their pretrained [BERT model uncased](https://huggingface.co/bert-base-uncased).

In [None]:
# LOAD TRANSFORMER
"""
If you plan on using a pretrained model, it’s important to use the associated 
pretrained tokenizer: it will split the text you give it in tokens the same way
for the pretraining corpus, and it will use the same correspondence
token to index (that we usually call a vocab) as during pretraining.
"""

# % pip install transformers
import torch
import transformers
assert transformers.__version__ > '4.0.0'

from transformers import BertModel, BertTokenizerFast

# set-up environment
# DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
# print(f"Using device: {DEVICE}")


modelname = 'bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(modelname)
model = BertModel.from_pretrained(modelname)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Print out the vocabulary size
# <YOUR CODE HERE>
configuration = model.config
print("Vocabulary size of the model:", configuration.vocab_size)
print("Input dimension:", configuration.hidden_size)

Vocabulary size of the model: 30522
Input dimension: 768


In [None]:
# REPRESENT PRODUCTS IN A VECTOR SPACE
def batch_encoding(sentences):
    # Since we're using padding, we need to provide the attention masks to our
    # model. Otherwise it doesn't know which tokens it should not attend to. 

    inputs = tokenizer(sentences, return_tensors="pt", padding=True)
    print(inputs) # Look at the padding and attention_mask 
    # attention mask is a binary tensor indicating the position of the padded indices 

    outputs = model(**inputs)

    last_hidden_states = outputs.last_hidden_state

    return inputs, last_hidden_states

encoded_inputs, title_last_hidden_states = batch_encoding([text for text in df.title.values])

"""
Note that the control token [CLS] has been added 
at the beginning of each sentence, and [SEP] at the end. 
"""

# Now, let's mask out the padding tokens and compute the embedding vector of each product
print("last_hidden_states:", title_last_hidden_states.shape)

{'input_ids': tensor([[  101, 28319,  2310,  ...,     0,     0,     0],
        [  101, 25022,  7913,  ...,     0,     0,     0],
        [  101,  6583,  2869,  ...,     0,     0,     0],
        ...,
        [  101,  8902,  5867,  ...,     0,     0,     0],
        [  101, 20222,  7800,  ...,  1033,   102,     0],
        [  101,  9686, 11741,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}
last_hidden_states: torch.Size([84, 52, 768])


In [None]:
# embedding vector of each product
embedding_vectors = torch.mean(title_last_hidden_states, dim=1)
embedding_vectors.shape

torch.Size([84, 768])

In [None]:
encoded_sequence = tokenizer.decode(encoded_inputs["input_ids"][0])
encoded_sequence

'[CLS] aqua velva after shave, classic ice blue, 7 ounce [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

## 3.2.

Compute and the cosine similarity between products with asin 'B000FI4S1E', 'B000LIBUBY' and 'B000W0C07Y'.

In [None]:
item_ids_matrix = pd.DataFrame(index=df.asin.values, data=embedding_vectors)

similarity_1e_by_ids = cosine_similarity([item_ids_matrix.loc['B000FI4S1E'].values], [item_ids_matrix.loc['B000LIBUBY'].values])
print("Similarity between 'B000FI4S1E'and 'B000LIBUBY':", round(similarity_1e_by_ids[0, 0], 3))

similarity_1e_7y_ids = cosine_similarity([item_ids_matrix.loc['B000FI4S1E'].values], [item_ids_matrix.loc['B000W0C07Y'].values])
print("Similarity between 'B000FI4S1E'and 'B000W0C07Y':", round(similarity_1e_7y_ids[0, 0], 3))

similarity_by_7y_ids = cosine_similarity([item_ids_matrix.loc['B000LIBUBY'].values], [item_ids_matrix.loc['B000W0C07Y'].values])
print("Similarity between 'B000LIBUBY'and 'B000W0C07Y':", round(similarity_by_7y_ids[0, 0], 3))

Similarity between 'B000FI4S1E'and 'B000LIBUBY': 0.836
Similarity between 'B000FI4S1E'and 'B000W0C07Y': 0.759
Similarity between 'B000LIBUBY'and 'B000W0C07Y': 0.754
