# Text Representation

# Exercise 1

Load the [metadata file](https://nijianmo.github.io/amazon/index.html) and discard any item that was not rated by our subset of users (i.e. that appears in neither the training nor the test set). Apply preprocessing (stemming and stopword removal) to clean up the text of the "title" field. Report the vocabulary size before and after the preprocessing.

In [1]:
import gzip
import os
import sys
sys.path.append('../')
import pickle
import pandas as pd
import json
import numpy as np

def parse(path):
    """Yield one decoded JSON object per non-blank line of a gzip file.

    Parameters
    ----------
    path : str or path-like
        Location of a gzip-compressed file holding one JSON document per line.

    Yields
    ------
    object
        The result of ``json.loads`` for each line, in file order.

    Notes
    -----
    The file is opened in a ``with`` block so the handle is released when the
    generator is exhausted or closed. Whitespace-only lines are skipped
    instead of raising a JSON decode error.
    """
    with gzip.open(path, 'rb') as g:
        for l in g:
            if l.strip():
                yield json.loads(l)

def getDF(path):
    """Read every record produced by ``parse(path)`` into a DataFrame.

    Each record becomes one row; rows are labelled 0, 1, 2, ... in the
    order the records are read.
    """
    records_by_position = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records_by_position, orient='index')


# Train/test splits are read from pickles; product metadata from the gzipped JSON dump.
train = pd.read_pickle("train.pkl")
test = pd.read_pickle("test.pkl")
meta = getDF('meta_Software.json.gz')
print(f"Before dropping dups: {len(meta)}")

# Keep one row per asin; when an asin repeats, the last occurrence is kept.
meta = meta.drop_duplicates(subset="asin", keep="last")
print(f"After: {len(meta)}")


Before dropping dups: 26790
After: 21639


In [2]:
# Keep only the products whose asin occurs in the train or the test set.
allowed_asins = np.unique(
    np.array(train["asin"].to_list() + test["asin"].to_list())
)
meta = meta.loc[meta["asin"].isin(allowed_asins)].reset_index(drop=True)

len(allowed_asins), len(meta)

(801, 801)

In [3]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer

from bs4 import BeautifulSoup
import re
from pattern.en import lemma

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ayaya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ayaya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
def cleanPrices(x, default=67.02):
    """Convert a raw price value such as ``"$12.50"`` to a float.

    Parameters
    ----------
    x : object
        Raw price. Strings may carry surrounding whitespace and a leading
        ``$``; numbers are accepted as they are.
    default : float, optional
        Value returned when ``x`` cannot be read as a finite number
        (67.02 is the mean of the non-NaN prices).

    Returns
    -------
    float
        The parsed price, or ``default`` for missing or unparseable input.
        Strings that ``float()`` rejects, e.g. ones containing thousands
        separators, also fall back to ``default``.
    """
    import math

    if isinstance(x, str):
        # Strip the currency symbol only when it is present, so a price
        # written without "$" does not lose its first digit.
        x = x.strip().lstrip("$")
    try:
        value = float(x)
    except (TypeError, ValueError):
        # TypeError covers None and other non-numeric objects,
        # ValueError covers strings float() cannot parse.
        return default
    # NaN / inf carry no usable price information.
    return value if math.isfinite(value) else default

def countWordsAndLengths(x):
    """Return the token count and mean token length of a text.

    HTML markup is removed, the text is lower-cased, every character outside
    ``a-z`` is replaced by a space, and the result is tokenized.

    Parameters
    ----------
    x : str
        Raw text, possibly containing HTML.

    Returns
    -------
    pandas.Series
        Two values: number of tokens, and average number of characters per
        token. Both are 0 when the text yields no tokens, which would
        otherwise cause a division by zero.
    """
    text = BeautifulSoup(x).get_text()
    tokens = word_tokenize(re.sub("[^a-z]", " ", text.lower()))
    if not tokens:
        return pd.Series([0, 0.0])
    return pd.Series([len(tokens), len(''.join(tokens)) / len(tokens)])

def cleanWords(x):
    """Normalise a text into a space-separated string of Porter stems.

    Steps: strip HTML, lower-case, replace every non ``a-z`` character with a
    space, tokenize, drop English stopwords, stem, and discard stems that are
    a single character long.
    """
    plain_text = BeautifulSoup(x).get_text()
    alpha_only = re.sub("[^a-z]", " ", plain_text.lower())
    stop_set = set(stopwords.words("english"))
    stemmer = PorterStemmer()

    kept = []
    for token in word_tokenize(alpha_only):
        if token in stop_set:
            continue
        stem = stemmer.stem(token)
        if len(stem) > 1:
            kept.append(stem)
    return ' '.join(kept)

df = meta.copy()

# Numeric price column (see cleanPrices for the fallback value).
df["prices"] = df["price"].apply(cleanPrices)

# Title, brand and the joined description pieces as one text per product.
df["all_text"] = df[["title", "brand", "description"]].apply(
    lambda row: " ".join([row["title"], row["brand"], " ".join(row["description"])]),
    axis=1,
)

df[["n_words", "avg_word_length"]] = df["all_text"].apply(countWordsAndLengths)

# Cleaned, stemmed title tokens.
df["words"] = df["title"].apply(cleanWords)
# df["all_words"] = df["all_text"].apply(cleanWords)
df["words"]

## other arbitrary features ##
# df["below50"] =  df["prices"].apply(lambda x: (x<50)*1.)
# df["below150"] =  df["prices"].apply(lambda x: (x<150)*1.)
# .. 
# LIX readability of description, etc... 

0                   learn adob photoshop lightroom video
1      learn adob dreamweav cs video core train web c...
2      learn adob flash profession cs video core trai...
3      microsoft offic home year subscript user pc ma...
4                        niv glo premium dvd multi devic
                             ...                        
796    microsoft offic home student famili pack pc di...
797    corel aftershot pro photo edit softwar pc mac ...
798                               onlinetv free download
799                     pinnacl studio ultim old version
800                       pinnacl studio plu old version
Name: words, Length: 801, dtype: object

In [5]:
# unique words in vocab (excludes 1-letter stems)
# split() without an argument discards empty strings, so a title that cleans
# down to "" cannot add a spurious '' entry to the vocabulary.
len(set(' '.join(df["words"]).split()))

646

# Exercise 2

Representation in vector spaces.

Represent all the products from Exercise 1 in a TF-IDF space. Interpret the meaning of the TF-IDF matrix dimensions.

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# fit_transform() fits the vectorizer and transforms the documents in a single
# call, so a separate fit() on the same data beforehand is redundant.
tfidf_vectorizer = TfidfVectorizer()
tfidf = tfidf_vectorizer.fit_transform(df["words"])
names = tfidf_vectorizer.get_feature_names_out()
arr = tfidf.toarray()

# Rows are labelled by asin, columns by the vectorizer's feature names.
df2 = pd.DataFrame(arr, index=df["asin"], columns=names)
df2.to_pickle("asin_tfidf1.pkl")
df2

Unnamed: 0_level_0,access,account,acroni,activ,address,adob,advanc,advantag,advisor,aftershot,...,work,world,write,xi,xp,xpvistawin,year,youtub,yr,zonealarm
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0321700945,0.0,0.0,0.0,0.0,0.0,0.416422,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0
0321719816,0.0,0.0,0.0,0.0,0.0,0.287289,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0
0321719824,0.0,0.0,0.0,0.0,0.0,0.269974,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0
0763855553,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.35918,0.0,0.0,0.0
0982697813,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
B01F7RJHIQ,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0
B01FFVDY9M,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.479046,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0
B01H39M7ME,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0
B01HAP47PQ,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0


## 2.2

Compute the cosine similarity between the products with asin 'B000FI4S1E', 'B000LIBUBY' and 'B000W0C07Y'. Take a look at their features to see whether the results make sense given their characteristics. 

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity of the selected products' rows in df2.
asins = ["B01H39M7ME", "B01HAP47PQ", "B01HAP3NUG"]
sims = pd.DataFrame(
    np.round(cosine_similarity(df2.loc[asins]), 3),
    index=asins,
    columns=asins,
)
sims

Unnamed: 0,B01H39M7ME,B01HAP47PQ,B01HAP3NUG
B01H39M7ME,1.0,0.0,0.0
B01HAP47PQ,0.0,1.0,0.739
B01HAP3NUG,0.0,0.739,1.0


In [8]:
# The two Pinnacle Studio products are close in tf-idf space, whereas the onlineTV
# product has a cosine similarity of 0 with both of them.
df.loc[
    df["asin"].isin(asins),
    ["description", "title", "brand", "rank", "details", "main_cat", "asin", "words"],
]

Unnamed: 0,description,title,brand,rank,details,main_cat,asin,words
798,"[<div>, onlineTV gives you access to hundreds ...",onlineTV Free [Download],concept/design GmbH,[],{'Note:': 'Gifting is not available for this i...,Software,B01H39M7ME,onlinetv free download
799,"[Get the power, creativity and control you nee...",Pinnacle Studio 20 Ultimate (Old Version),Pinnacle Systems,"1,066 in Software (",{'  Product Dimensions: ': '5.2 x 1.2 ...,Software,B01HAP47PQ,pinnacl studio ultim old version
800,[Create your best videos with the pro-quality ...,Pinnacle Studio 20 Plus (Old Version),Pinnacle Systems,"4,181 in Software (",{'  Product Dimensions: ': '5.2 x 1.2 ...,Software,B01HAP3NUG,pinnacl studio plu old version


## Pretrained word2vec (Google News 300) 

In [9]:
import gensim.downloader
# https://towardsdatascience.com/text-classification-with-nlp-tf-idf-vs-word2vec-vs-bert-41ff868d1794
# load() returns the loaded vectors, so the cell finishes only once they are
# available; the fixed 100-second sleep that used to follow it only delayed
# every run and has been removed.
word2vec_vectors = gensim.downloader.load('word2vec-google-news-300')

import time

In [11]:
# TF-IDF
# Advantages:
# - Easy to compute
# - Provides a basic metric for extracting the most descriptive terms in a document
# - Makes it easy to compute the similarity between two documents

# Disadvantages:
# - TF-IDF is based on the bag-of-words (BoW) model, so it does not capture word position, semantics, co-occurrences across documents, etc.
# - For this reason, TF-IDF is only useful as a lexical-level feature
# - It cannot capture semantics (unlike, e.g., topic models or word embeddings)

def cleanWords2(x):
    """Normalise a text into lemmas that exist in the word2vec vocabulary.

    Steps: strip HTML, lower-case, replace every non ``a-z`` character with a
    space, tokenize, drop English stopwords, lemmatize, then keep only lemmas
    longer than one character that are known to ``word2vec_vectors``.

    Parameters
    ----------
    x : str
        Raw text, possibly containing HTML.

    Returns
    -------
    str
        The retained lemmas joined by single spaces ("" if none remain).
    """
    text = BeautifulSoup(x).get_text()          #remove html
    text = re.sub("[^a-z]", " ", text.lower())  #lowercase and only letters
    stops = set(stopwords.words("english"))     #define stopwords
    # key_to_index is the key -> position dict of the loaded vectors; testing
    # membership there is a hash lookup, whereas index_to_key is a list that
    # would be scanned linearly for every token.
    known = word2vec_vectors.key_to_index
    lemmas = [lemma(w) for w in word_tokenize(text) if w not in stops]
    return ' '.join(w for w in lemmas if len(w) > 1 and w in known)

# df = meta.copy()
df["words"] = df["title"].apply(cleanWords2)
# df["words"] = df["all_text"].apply(lambda x: cleanWords2(x))

# split() without an argument discards empty strings, so titles whose words
# were all filtered out do not add a spurious '' entry to the vocabulary.
len(set(' '.join(df["words"]).split()))

543

In [12]:
def get_vector(x):
    """Average the word2vec vectors of the space-separated words in ``x``.

    An empty string yields a 300-dimensional zero vector. Every word is looked
    up in ``word2vec_vectors``.
    """
    tokens = x.split(' ')
    total = np.zeros((300,))
    if tokens == ['']:
        return total
    for token in tokens:
        total += word2vec_vectors[token]
    return total / len(tokens)

# One averaged 300-dimensional vector per product, rows labelled by asin.
arr = [np.array(vec) for vec in df["words"].map(get_vector)]
df3 = pd.DataFrame(arr, index=df["asin"], columns=range(300))
df3.to_pickle("asin_tfidf2.pkl")
df3

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0321700945,0.109790,-0.040436,-0.137158,0.220117,-0.069507,0.060547,0.135645,0.096387,-0.030762,-0.037354,...,-0.139258,0.072900,0.011481,0.103516,0.216113,-0.011523,0.034961,-0.129614,-0.065100,0.170508
0321719816,0.024883,-0.006718,-0.042328,0.161110,-0.058266,0.119629,0.029263,0.003815,0.033707,-0.063354,...,-0.050659,0.103565,-0.028122,0.089251,0.060734,-0.083687,0.037777,-0.018997,-0.018507,0.021332
0321719824,0.002255,0.004444,-0.044263,0.068982,-0.063458,0.069971,0.030518,-0.149805,0.017737,0.012646,...,-0.065192,0.004190,-0.059106,0.009880,-0.022641,-0.077301,0.045943,-0.010876,-0.039354,0.003113
0763855553,0.050842,-0.086084,-0.067479,0.139209,-0.060193,0.072449,0.023920,0.021002,0.149023,0.015814,...,-0.057220,0.073804,-0.029626,0.005472,0.101263,-0.096191,-0.079675,-0.050153,-0.066357,0.033606
0982697813,-0.002209,-0.068848,-0.039062,0.110583,-0.042706,0.046338,0.008350,-0.071606,0.029932,0.004736,...,0.020996,0.143896,-0.058105,0.039355,-0.010319,-0.082477,0.030371,-0.099390,0.006140,0.023193
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
B01F7RJHIQ,0.075887,-0.020331,-0.088053,0.086711,-0.028252,0.050768,0.043620,-0.035997,0.098362,0.033176,...,-0.094889,0.030273,-0.099270,0.024489,0.099799,-0.102675,-0.109165,-0.034155,-0.026706,0.000773
B01FFVDY9M,-0.010037,0.014147,-0.023953,0.144206,-0.071425,0.057953,-0.043884,-0.033834,0.118354,-0.055332,...,-0.093079,0.056112,-0.017171,-0.008640,-0.003377,-0.112684,0.016276,-0.137783,-0.118788,0.095656
B01H39M7ME,0.164551,-0.155518,-0.140808,0.132568,-0.092590,0.199463,0.050537,0.135818,0.248291,0.080261,...,0.120972,0.056885,-0.057129,-0.042175,0.193115,-0.083618,0.026932,-0.187500,0.054562,-0.008789
B01HAP47PQ,0.095996,0.038477,0.007898,0.003149,0.078680,0.115430,0.084961,-0.188672,0.139111,0.147656,...,-0.139551,-0.008838,-0.067139,-0.072412,-0.033936,-0.189355,-0.075232,-0.113428,-0.038135,0.036572


In [13]:
# Add the numeric features from df to the TF-IDF frame (both are keyed by asin).
df2[["prices", "n_words", "avg_word_length"]] = (
    df.set_index("asin")[["prices", "n_words", "avg_word_length"]]
)

# Min-max scale the added columns.
cols = ["prices", "avg_word_length", "n_words"]
col_min = df2[cols].min()
col_span = df2[cols].max() - col_min
df2[cols] = (df2[cols] - col_min) / col_span

df2

Unnamed: 0_level_0,access,account,acroni,activ,address,adob,advanc,advantag,advisor,aftershot,...,xi,xp,xpvistawin,year,youtub,yr,zonealarm,prices,n_words,avg_word_length
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0321700945,0.0,0.0,0.0,0.0,0.0,0.416422,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.073293,0.044517,0.337349
0321719816,0.0,0.0,0.0,0.0,0.0,0.287289,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.103326,0.050851,0.352593
0321719824,0.0,0.0,0.0,0.0,0.0,0.269974,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.196563,0.050127,0.380682
0763855553,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.35918,0.0,0.0,0.0,0.264107,0.010315,0.352273
0982697813,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.196563,0.038545,0.456019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
B01F7RJHIQ,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.762523,0.085052,0.325870
B01FFVDY9M,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.479046,...,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.129047,0.022258,0.370491
B01H39M7ME,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.196563,0.011582,0.294437
B01HAP47PQ,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.104528,0.022982,0.309441


In [14]:
# Place the columns of df2 and df3 side by side and persist the result.
df4 = pd.concat([df2, df3], axis="columns")
df4.to_pickle("asin_tfidf3.pkl")
df4.shape

(801, 949)

In [16]:
# Same three products as before, now compared using their rows in df3.
asins = ["B01H39M7ME", "B01HAP47PQ", "B01HAP3NUG"]
sims2 = pd.DataFrame(
    np.round(cosine_similarity(df3.loc[asins]), 3),
    index=asins,
    columns=asins,
)
sims2


Unnamed: 0,B01H39M7ME,B01HAP47PQ,B01HAP3NUG
B01H39M7ME,1.0,0.217,0.225
B01HAP47PQ,0.217,1.0,0.918
B01HAP3NUG,0.225,0.918,1.0
