This notebook follows the topic-modeling / Latent Dirichlet Allocation tutorial at:
https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

In [106]:
import pandas as pd
import numpy as np
import pickle
import re
from os import walk
from os import listdir
from os.path import isfile, join
import json

# Load Data

In [122]:
# Root folder of the Amazon review dataset; the cells below read its
# "train/", "dev/" and "test/" subfolders.
data_path = '../data/amazon_reviews/amazon_review/'

In [123]:
def load_data(filepath):
    """Read one tab-separated review file into parallel text/label lists.

    Each non-empty line is expected to hold the review text, a tab, and the
    label; any further tab-separated fields are ignored.  Lines containing the
    marker ``UNCONFIDENT_INTENT_FROM_SLAD`` are dropped, and so are lines with
    no tab at all (they carry no label, so they cannot stay aligned with ``y``).

    Args:
        filepath: Path of the file to read.

    Returns:
        A tuple ``(x, y)`` of equal-length lists: ``x`` holds the review texts
        and ``y`` the corresponding label strings.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(filepath, "r", encoding="utf-8") as file:
        text = file.read()

    x, y = [], []
    for line in text.split("\n"):
        if line == "" or "UNCONFIDENT_INTENT_FROM_SLAD" in line:
            continue
        fields = line.split("\t")  # split once, reuse for both text and label
        if len(fields) < 2:
            # No label column: skip instead of failing with an IndexError.
            continue
        x.append(fields[0])
        y.append(fields[1])
    return x, y

In [128]:
set(load_data(data_path + "test/" + "Baby.test")[1])

{'1', '2'}

In [137]:
# Load every split file into a dict keyed by file name; each value is the
# (texts, labels) pair returned by load_data.
all_data = {}
for domain in ["train", "dev", "test"]:
    folder_path = data_path + domain + "/"
    onlyfiles = [f for f in listdir(folder_path) if isfile(join(folder_path, f))]
    for file in onlyfiles:
        # ".dev" is listed so files in the "dev" folder are not silently
        # skipped (assumes dev files carry a ".dev" suffix -- TODO confirm).
        if any(suffix in file for suffix in (".train", ".dev", ".test")):
            # folder_path already ends with "/", so let join() build the path.
            all_data[file] = load_data(join(folder_path, file))

In [138]:
all_data.keys()

dict_keys(['CDs_and_Vinyl.train', 'Clothing_Shoes_and_Jewelry.train', 'Home_and_Kitchen.train', 'Beauty.train', 'Sports_and_Outdoors.train', 'Movies_and_TV.train', 'Apps_for_Android.train', 'Cell_Phones_and_Accessories.train', 'Electronics.train', 'Office_Products.train', 'Books.train', 'Health_and_Personal_Care.train', 'Kindle_Store.train', 'Grocery_and_Gourmet_Food.train', 'Pet_Supplies.train', 'Tools_and_Home_Improvement.test', 'Tools_and_Home_Improvement.train', 'Pet_Supplies.test', 'Automotive.test', 'Automotive.train', 'Grocery_and_Gourmet_Food.test', 'Baby.test', 'Video_Games.train', 'Baby.train', 'Digital_Music.test', 'Digital_Music.train', 'Toys_and_Games.train', 'Toys_and_Games.test', 'Video_Games.test'])

In [112]:
# Persist the loaded splits as pretty-printed JSON (4-space indent).
with open("../data/all_cleaned/amazon_data_dict.txt", "w") as out_file:
    out_file.write(json.dumps(all_data, indent=4))

# LDA

In [166]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
# NOTE(review): the two wildcard imports below hide where names come from; the
# code visible in this notebook only uses WordNetLemmatizer and
# SnowballStemmer, which are already imported explicitly above.
from nltk.stem.porter import *
from nltk.stem import *
import numpy as np
# Seed NumPy's global RNG so NumPy-based randomness is repeatable.
np.random.seed(2021)
import nltk
# Fetch the WordNet corpus (reports "already up-to-date" when present).
nltk.download('wordnet')
# Shared English Snowball stemmer used by lemmatize_stemming().
stemmer = SnowballStemmer("english")

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/yuchen.zhang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [168]:
# Product categories used for the experiments in the cells below.
freq_used = ['Home_and_Kitchen', 'Books', 'Electronics', 'Movies_and_TV']

In [164]:
def lemmatize_stemming(text):
    """Lemmatize a single token as a verb, then stem the result.

    The stemming step uses the notebook-level ``stemmer`` object.
    """
    lemmatizer = WordNetLemmatizer()
    verb_lemma = lemmatizer.lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

def preprocess(text):
    """Tokenize ``text`` and return the normalized tokens worth keeping.

    Tokens produced by gensim's ``simple_preprocess`` are kept only when they
    are longer than three characters and absent from gensim's stop-word list;
    every kept token is passed through ``lemmatize_stemming``.
    """
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(word)
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 3 and word not in stop_words
    ]

In [158]:
# NOTE(review): `data` is only assigned further down (json.load of
# amazon_data_dict.txt), so this cell fails on a fresh top-to-bottom run.
data['Electronics'][1][0]

'1'

In [167]:
preprocess(data['Electronics'][0][0])

['check', 'barn', 'nobl', 'number', 'buyaccessori', 'pay', 'day']

In [168]:
data['Electronics'][0][0]

"Check your SN with Barnes & Noble's 800 number. Mine was used from BuyAccessories. I paid $70 too much as used is under $30 these days."

In [173]:
# Pre-process the training reviews of every selected category.
topics = {
    category: [preprocess(review) for review in all_data[category + '.train'][0]]
    for category in freq_used
}

In [174]:
topics.keys()

dict_keys(['Home_and_Kitchen', 'Books', 'Electronics', 'Movies_and_TV'])

In [178]:
# topics['Home_and_Kitchen']

In [177]:
# Persist the pre-processed token lists as pretty-printed JSON (4-space indent).
with open("../data/all_cleaned/amazon_topics_dict.txt", "w") as out_file:
    out_file.write(json.dumps(topics, indent=4))

# Topics

In [55]:
import sys
sys.path.append("../")
from src.utils import *
from src.bert_embedding import *
from transformers import BertTokenizer, BertModel

In [7]:
# Reload the pre-processed token lists written by the LDA section.
with open("../data/all_cleaned/amazon_topics_dict.txt", "r") as in_file:
    topics = json.loads(in_file.read())

In [41]:
# Reload the raw review dictionary from disk.
with open("../data/all_cleaned/amazon_data_dict.txt", "r") as in_file:
    data = json.loads(in_file.read())

In [4]:
# Load the DistilBERT tokenizer and encoder from the same pretrained checkpoint.
# NOTE(review): DistilBertTokenizer / DistilBertModel are not imported
# explicitly in this notebook; presumably they arrive via the wildcard imports
# from src -- confirm.
distilbert_checkpoint = 'distilbert-base-uncased'
tokenizer_d = DistilBertTokenizer.from_pretrained(distilbert_checkpoint)
model_d = DistilBertModel.from_pretrained(distilbert_checkpoint)

In [57]:
# Load the BERT tokenizer and encoder from the same pretrained checkpoint.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=440473133.0), HTML(value='')))




In [169]:
topics["Home_and_Kitchen"][100]

['receiv', 'piec', 'measur', 'return', 'piec', 'order', 'differ']

In [185]:
data["Home_and_Kitchen"][0][100]

"Only received one piece, the 1/4 cup. WHERE ARE THE OTHER FOUR MEASURER'S ??I will be returning the one piece. I already ordered a different set."

In [35]:
# Encode the first Home_and_Kitchen review two ways: as a batch in which every
# token is its own sequence, and as one space-joined sentence.
first_review_tokens = topics["Home_and_Kitchen"][0]
encoded_input = tokenizer_d(first_review_tokens, return_tensors='pt', truncation=True, padding=True)
encoded_sent = tokenizer_d([" ".join(first_review_tokens)], return_tensors='pt', truncation=True, padding=True)

In [36]:
output = model_d(**encoded_input)
output_sent = model_d(**encoded_sent)

In [32]:
output.last_hidden_state

tensor([[[-0.2199, -0.1201, -0.1294,  ..., -0.0373,  0.1549,  0.3760],
         [ 0.1758, -0.6426,  0.0107,  ...,  0.0864,  0.3222,  0.1949],
         [-0.3781, -0.4156,  0.0202,  ...,  0.0596, -0.1450, -0.1192],
         [-0.2415, -0.4868, -0.4642,  ...,  0.2593, -0.1489,  0.2280],
         [ 0.9567,  0.0983, -0.3869,  ...,  0.2064, -0.7347, -0.0875]],

        [[-0.2137, -0.1053, -0.0130,  ..., -0.1954,  0.0230,  0.2719],
         [ 0.5586, -0.1439, -0.1018,  ...,  0.0750, -0.0413,  0.2260],
         [ 0.9042,  0.2337, -0.3188,  ...,  0.2038, -0.6615, -0.1966],
         [ 0.0075,  0.1224,  0.0685,  ...,  0.0251, -0.1264,  0.1510],
         [ 0.0734,  0.0553, -0.0359,  ...,  0.0413, -0.1558,  0.2330]],

        [[-0.3395,  0.0194, -0.1812,  ..., -0.1555,  0.2767,  0.2707],
         [-0.1405,  0.0508, -0.0577,  ..., -0.3683,  0.5464,  0.0723],
         [-0.1438, -0.1855,  0.2111,  ..., -0.2795,  0.1649, -0.3668],
         [-0.2358, -0.4288, -0.7102,  ..., -0.1054,  0.2284, -0.0944],
  

In [33]:
output[0].shape

torch.Size([7, 5, 768])

In [39]:
output_sent[0].shape

torch.Size([1, 14, 768])

In [119]:
# Show the pre-processed tokens and the raw text of review `n` side by side.
# Both lookups use `n`, so changing it here is enough to inspect another review.
n = 10
print(" ".join(topics["Home_and_Kitchen"][n]))
print(data["Home_and_Kitchen"][0][n])

piec trash crush motor sound like explod frustrat tri smoothi replac oster beehiv classic better actual quieter sturdi base
A piece of trash for ice crushing.  Motor sounds like it's about to explode. The cup is just frustration if you're trying to make smoothies. Replaced it with the Oster Beehive classic. Way better and actually quieter because of the sturdy base.


In [118]:
topics_only = " ".join(topics["Home_and_Kitchen"][0])

In [99]:
topics_only

'remedi wilton magazin money idea instruct invest'

In [100]:
topics_only_shuffle = 'remedi magazin wilton money idea instruct invest'

In [81]:
# NOTE(review): this rebinds `all_data` (built earlier as the dict of loaded
# files) to a single raw review string; cells that index all_data by file name
# will break if they are re-run after this one.
all_data = data["Home_and_Kitchen"][0][0]

In [101]:
# Embed the shuffled topic string.  The model must receive the shuffled
# encoding; feeding `encoded_input` would embed whatever that variable
# currently holds, so the comparison would not reflect the shuffle.
encoded_input_shuffle = tokenizer(topics_only_shuffle, return_tensors='pt', truncation=True, padding=True)
output_shuffle = model(**encoded_input_shuffle)

In [114]:
encoded_input = tokenizer(topics_only, return_tensors='pt', truncation=True, padding=True)
output = model(**encoded_input)

In [87]:
encoded_input_all = tokenizer(all_data, return_tensors='pt', truncation=True, padding=True)
output_all = model(**encoded_input_all)

In [93]:
bert = output[0][:, 0, :].detach().numpy()

In [92]:
bert_all = output_all[0][:, 0, :].detach().numpy()

In [125]:
output_all[0].detach().numpy()

array([[[ 0.14011003,  0.02450287,  0.04106272, ..., -0.5889588 ,
          0.15424612,  0.75548756],
        [-0.42383468, -0.16127616, -0.06394659, ..., -0.64132077,
          1.0293195 ,  0.34360573],
        [-0.1286612 ,  0.16680661,  0.5434309 , ..., -0.666994  ,
          0.29372048,  0.6447649 ],
        ...,
        [ 1.0262218 , -0.21811685, -0.45390967, ..., -0.7425849 ,
          0.3056906 ,  0.2396598 ],
        [ 0.6048785 ,  0.29522845, -0.29891792, ...,  0.0361919 ,
         -0.27819633, -0.3662661 ],
        [ 0.85183954,  0.38817424, -0.1411262 , ..., -0.16613567,
         -0.40873182, -0.27126873]]], dtype=float32)

In [102]:
bert_shuffle = output_shuffle[0][:,0,:].detach().numpy()

In [131]:
bert @ bert_all.T / (np.linalg.norm(bert) * np.linalg.norm(bert_all))

array([[0.77939427]], dtype=float32)

In [120]:
# Same category list as assigned in the LDA section; repeated here so this
# part of the notebook has `freq_used` defined.
freq_used = ['Home_and_Kitchen', 'Books', 'Electronics', 'Movies_and_TV']

In [127]:
topic_embedd = tokenize_encode_bert_sentences_sample(tokenizer, model, freq_used)

In [141]:
def cos_sim(x1, x2):
    """Cosine similarity between two vectors, or between the rows of matrices.

    Args:
        x1: 1-D vector of length ``d`` or 2-D array of shape ``(n, d)``.
        x2: 1-D vector of length ``d`` or 2-D array of shape ``(m, d)``.

    Returns:
        ``x1 @ x2.T`` with every entry divided by the norms of the two vectors
        that produced it: a scalar for two vectors, a 1-D array when exactly
        one argument is a matrix, and an ``(n, m)`` array for two matrices.
        Zero-norm inputs are not special-cased (the division yields nan/inf).
    """
    a = np.asarray(x1)
    b = np.asarray(x2)

    def _norms(arr):
        # A bare vector has a single norm; a matrix gets one norm per row.
        # (A norm over the whole matrix would only be right for one row.)
        return np.linalg.norm(arr) if arr.ndim <= 1 else np.linalg.norm(arr, axis=-1)

    norm_a, norm_b = _norms(a), _norms(b)
    if a.ndim > 1 and b.ndim > 1:
        # Column of row-norms times row of row-norms -> grid shaped like a @ b.T
        norm_a = norm_a[..., np.newaxis]
    sim = a @ b.T / (norm_a * norm_b)
    return sim

In [147]:
# Cosine similarity for every ordered pair of category embeddings, keyed as
# "<category i> - <category j>".
sims = {
    freq_used[i] + " - " + freq_used[j]: cos_sim(embeddi, embeddj)
    for i, embeddi in enumerate(topic_embedd)
    for j, embeddj in enumerate(topic_embedd)
}

In [148]:
sims

{'Home_and_Kitchen - Home_and_Kitchen': 0.99999994,
 'Home_and_Kitchen - Books': 0.8122904,
 'Home_and_Kitchen - Electronics': 0.7812052,
 'Home_and_Kitchen - Movies_and_TV': 0.8945177,
 'Books - Home_and_Kitchen': 0.8122904,
 'Books - Books': 0.99999994,
 'Books - Electronics': 0.7863649,
 'Books - Movies_and_TV': 0.8110619,
 'Electronics - Home_and_Kitchen': 0.7812052,
 'Electronics - Books': 0.7863649,
 'Electronics - Electronics': 1.0,
 'Electronics - Movies_and_TV': 0.7939116,
 'Movies_and_TV - Home_and_Kitchen': 0.8945177,
 'Movies_and_TV - Books': 0.8110619,
 'Movies_and_TV - Electronics': 0.7939116,
 'Movies_and_TV - Movies_and_TV': 1.0}

In [175]:
# Flatten the review texts of every entry in `data` into one list.
all_data = [review for key in data for review in data[key][0]]

In [180]:
# One whitespace-split token list per review.  The whole list of lists is kept
# (not just element 0), matching the variable name and the commented-out
# gensim code below, which iterates over one token list per document.
list_of_list_of_tokens = [x.split() for x in all_data]

In [184]:
# from gensim import corpora, models

# # list_of_list_of_tokens = [["a","b","c"], ["d","e","f"]]
# # ["a","b","c"] are the tokens of document 1, ["d","e","f"] are the tokens of document 2...
# dictionary_LDA = corpora.Dictionary(list_of_list_of_tokens)
# dictionary_LDA.filter_extremes(no_below=3)
# corpus = [dictionary_LDA.doc2bow(list_of_tokens) for list_of_tokens in list_of_list_of_tokens]

# num_topics = 50
# %time lda_model = models.LdaModel(corpus, num_topics=num_topics, \
#                                   id2word=dictionary_LDA, \
#                                   passes=4, alpha=[0.01]*num_topics, \
#                                   eta=[0.01]*len(dictionary_LDA.keys()))