#### Imports

In [1]:
from collections import Counter

import copy
import nltk
import pickle
import gensim
import multiprocessing
from itertools import combinations

import numpy as np
import pandas as pd

In [2]:
import utils as my_utils

### Load data and build word-similarity edges

In [3]:
# Load the review DataFrame from a local pickle file.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
dataset = pd.read_pickle("datasets/datadf_amazon_musical")

In [4]:
dataset.head(2)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,unixReviewTime,reviewText,sentiment,reviewTime,summary,cleaned,text
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...","[0, 0]",1393545600,"Not much to write about here, but it does exac...",5.0,"02 28, 2014",good,"[much, write, doe, exactly, supposed, filter, ...",much write doe exactly supposed filter pop sou...
1,A14VAT5EAX3D9S,1384719342,Jake,"[13, 14]",1363392000,The product does exactly as it should and is q...,5.0,"03 16, 2013",Jake,"[product, doe, exactly, quite, affordable, rea...",product doe exactly quite affordable realized ...


In [5]:
# Run the project helper over the `text` column; its second return value is
# discarded. Later cells iterate `words` to look up embeddings and index
# `vocabulary` by word.
# NOTE(review): presumably `vocabulary` maps word -> integer id; confirm in utils.processReviews.
count_matrix, _, vocabulary, words = my_utils.processReviews(dataset['text'].values)

In [6]:
# def loadGloveModel(gloveFile):
#     print("Loading Glove Model")
#     f = open(gloveFile,'r')
#     model = {}
#     for line in f:
#         splitLine = line.split()
#         word = splitLine[0]
#         embedding = np.array([float(val) for val in splitLine[1:]])
#         model[word] = embedding
#     print("Done.",len(model)," words loaded!")
#     return model

In [7]:
# %%time
# embeddings_index = loadGloveModel("nongit_resources/glove.6B.300d.txt")

In [8]:
%%time
# Load the pretrained vectors (word2vec text format) into a gensim KeyedVectors
# object. This is slow: the recorded run took about 4 minutes of wall time.
embeddings_index = gensim.models.KeyedVectors.load_word2vec_format("nongit_resources/wiki-news-300d-1M.vec")

CPU times: user 4min 12s, sys: 3.13 s, total: 4min 15s
Wall time: 4min 15s


In [9]:
# Keep only the corpus words for which the pretrained model has a vector.
words_embeddings = {}
for word in words:
    try:
        words_embeddings[word] = embeddings_index[word]
    except KeyError:
        # Out-of-vocabulary word: it has no pretrained vector, so it is left
        # out. Only KeyError is caught so that real failures (and Ctrl-C)
        # are no longer silently swallowed.
        continue

In [10]:
# Drop the reference to the full embedding table so it can be garbage
# collected; the cells below only use the `words_embeddings` subset.
embeddings_index = None

In [11]:
len(words_embeddings)

1966

In [12]:
# Live view over the keys of `words_embeddings`, used for membership tests and
# iterated by both `combinations(...)` loops below. Those two loops must see the
# same order, so `words_embeddings` must not be modified between them.
words_with_embeddings = words_embeddings.keys()

In [13]:
%%time
# Pair up the embedding vectors of every unordered word pair, so that each
# tuple can be handed to a pool worker as a single argument.
edge_embeds_multi = [
    (words_embeddings[left], words_embeddings[right])
    for left, right in combinations(words_with_embeddings, 2)
]

CPU times: user 470 ms, sys: 64 ms, total: 534 ms
Wall time: 533 ms


In [14]:
len(edge_embeds_multi)

1931595

In [15]:
# Number of worker processes for the pools below: at most 30, but never more
# than the CPUs this machine reports, to avoid oversubscribing smaller hosts.
n_cores = min(30, multiprocessing.cpu_count())

In [16]:
%%time
# Score every vector pair in parallel. `map` blocks until all results are
# back and preserves input order, so `embeddings_cosines[k]` belongs to
# `edge_embeds_multi[k]`. The context manager tears the pool down even if
# a worker raises, instead of leaving un-joined processes behind.
with multiprocessing.Pool(n_cores) as pool:
    embeddings_cosines = pool.map(my_utils.get_cosine_multi, edge_embeds_multi)

CPU times: user 5.41 s, sys: 1.62 s, total: 7.03 s
Wall time: 12.7 s


In [17]:
len(embeddings_cosines)

1931595

In [18]:
%%time
# Symmetric lookup table: (word_a, word_b) -> similarity score, stored under
# both orderings. Pairs are enumerated in the same order that produced
# `embeddings_cosines`, so the running index lines the two up.
edge_embeddings = {}
for pair_idx, (word_a, word_b) in enumerate(combinations(words_with_embeddings, 2)):
    score = embeddings_cosines[pair_idx]
    edge_embeddings[(word_a, word_b)] = score
    edge_embeddings[(word_b, word_a)] = score

CPU times: user 1.87 s, sys: 344 ms, total: 2.21 s
Wall time: 2.24 s


In [19]:
def get_edges_per_doc(doc):
    """Build the similarity edges between the distinct words of one document.

    Every pair of different words in ``doc`` that both have an embedding is
    looked up in the global ``edge_embeddings`` table. A pair becomes an edge
    when its score is strictly greater than the global ``edges_threshold``.
    Each unordered pair is reported once, in order of first appearance.

    Reads the module-level names ``words_with_embeddings``,
    ``edge_embeddings``, ``vocabulary`` and ``edges_threshold``.

    Args:
        doc: sequence of word tokens (may contain repeats).

    Returns:
        A tuple ``(edges, edges_all)``: ``edges`` is a list of
        ``(vocabulary[a], vocabulary[b])`` tuples and ``edges_all`` is the
        parallel list of ``(a, b, score)`` tuples.

    Raises:
        KeyError: if a word with an embedding is missing from ``vocabulary``
            or a pair is missing from ``edge_embeddings``.
    """
    # A repeated token can never contribute a new edge, so keep only the first
    # occurrence of each embeddable word (dict.fromkeys preserves order).
    candidates = list(dict.fromkeys(w for w in doc if w in words_with_embeddings))

    edges, edges_all = [], []
    # Set of already-emitted edges in both orientations: O(1) duplicate checks
    # instead of scanning the `edges` list for every candidate pair.
    seen = set()
    for i in candidates:
        for j in candidates:
            if i == j:
                continue
            sim = edge_embeddings[(i, j)]
            if sim > edges_threshold:
                edge = (vocabulary[i], vocabulary[j])
                if edge not in seen:
                    seen.add(edge)
                    seen.add((edge[1], edge[0]))
                    edges.append(edge)
                    edges_all.append((i, j, sim))
    return (edges, edges_all)

In [None]:
# Global cut-off read by get_edges_per_doc: a word pair becomes an edge only
# when its similarity score is strictly greater than this value.
edges_threshold = 0.00

In [None]:
%%time
# Extract edges for every document in parallel; each result is the
# (edges, edges_all) tuple returned by get_edges_per_doc, in document order.
# The context manager tears the pool down even if a worker raises.
# NOTE(review): workers read the notebook globals (edge_embeddings, vocabulary,
# edges_threshold) as they were when the pool was created; this presumably
# relies on the fork start method. Confirm on the target platform.
with multiprocessing.Pool(n_cores) as pool:
    docs_edges_multi = pool.map(get_edges_per_doc, dataset['cleaned'].values)

In [None]:
# Per-document lists of (id, id) edges: first element of each worker result.
docs_edges = [edge_ids for edge_ids, _ in docs_edges_multi]

In [None]:
# Per-document lists of (word, word, score) triples: second element of each
# worker result.
docs_edges_all = [scored_edges for _, scored_edges in docs_edges_multi]

In [None]:
dataset['text'].values[1]

In [None]:
docs_edges_all[1]

In [None]:
len(docs_edges_all[1])

In [None]:
np.median([len(i) for i in docs_edges])

In [None]:
np.mean([len(i) for i in docs_edges])

In [None]:
np.where(np.array([len(i) for i in docs_edges]) == 0)[0].shape

In [None]:
Counter([len(i) for i in docs_edges])

In [None]:
# Save the per-document edge lists for the 0.00 threshold. The `with` block
# closes the file even if pickling fails.
# NOTE(review): the file name says "glove", but the vectors loaded in this
# notebook come from wiki-news-300d-1M.vec. Confirm the intended name before
# changing it, since other code may read this exact path.
with open("resources/amazon_musical_glove_nontrained_0.00.pickle","wb") as pickle_out:
    pickle.dump(docs_edges, pickle_out)

In [None]:
# Re-run the per-document edge extraction for a range of thresholds and save
# each result to its own pickle file.
for edges_threshold in [0.60, 0.50, 0.40, 0.30, 0.20, 0.10, 0.00]:
    # A fresh pool is created on every iteration, after `edges_threshold` has
    # been rebound, so the workers pick up the current value of the global
    # that get_edges_per_doc reads.
    # NOTE(review): this presumably relies on the fork start method; confirm.
    # The context manager tears the pool down even if a worker raises.
    with multiprocessing.Pool(n_cores) as pool:
        docs_edges_multi = pool.map(get_edges_per_doc, dataset['cleaned'].values)

    docs_edges = [i[0] for i in docs_edges_multi]

    # Not saved here; kept as a notebook global for inspection of the last run.
    docs_edges_all = [i[1] for i in docs_edges_multi]

    # str() of the float gives e.g. "0.6" and "0.0" (not "0.60" or "0.00").
    # The `with` block closes the file even if pickling fails.
    with open("resources/amazon_musical_fasttext_" + str(edges_threshold) + ".pickle","wb") as pickle_out:
        pickle.dump(docs_edges, pickle_out)

# Appendix

In [None]:
# %%time
# edges_threshold = 0.73
# docs_edges = []
# docs_edges_all = []
# for idx, doc in enumerate(dataset[10].values):
#     my_utils.print_if_mod(idx, 500)
#     edges, edges_all = [], []
#     for i in doc:
#         for j in doc:
#             if i != j:
#                 sim = edge_embeddings[(i, j)]
#                 if sim > edges_threshold and (vocabulary[i], vocabulary[j]) not in edges and (vocabulary[j], vocabulary[i]) not in edges:
#                     edges.append((vocabulary[i], vocabulary[j]))
#                     edges_all.append((i, j, sim))
#     docs_edges.append(edges)
#     docs_edges_all.append(edges_all)

In [None]:
# def loadGloveModel(gloveFile):
#     print("Loading Glove Model")
#     f = open(gloveFile,'r')
#     model = {}
#     for line in f:
#         splitLine = line.split()
#         word = splitLine[0]
#         embedding = np.array([float(val) for val in splitLine[1:]])
#         model[word] = embedding
#     print("Done.",len(model)," words loaded!")
#     return model

In [None]:
# %%time
# embeddings_index = loadGloveModel("nongit_resources/glove.42B.300d.txt")

In [None]:
# %%time
# embeddings_index = gensim.models.KeyedVectors.load_word2vec_format("nongit_resources/electronics.vec")

In [None]:
# %%time
# edges_threshold = 0.8
# docs_edges, ignored, taken, count = [], [], [], 0
# for idx, doc in enumerate(dataset[8].values):
#     edges = []
#     print(idx)
#     for i in doc:
#         for j in doc:
#             if i != j:
#                 try:
#                     a = embeddings_index[i]
#                     b = embeddings_index[j]
#                     if get_cosine(a, b) > edges_threshold and (vocabulary[i], vocabulary[j]) not in edges and (vocabulary[j], vocabulary[i]) not in edges:
#                         edges.append((vocabulary[i], vocabulary[j]))
#                 except:
#                     try:
#                         embeddings_index[i]
#                         taken.append(i)
#                     except:
#                         ignored.append(i)
#                     try:
#                         embeddings_index[j]
#                     except:
#                         ignored.append(j)
#                         taken.append(j)
#                     pass
#     docs_edges.append(edges)

In [None]:
# pickle_out = open("resources/docs_edges_" + dataset_name + "_5k_fasttext_trained.pickle","wb")
# pickle.dump(docs_edges, pickle_out)
# pickle_out.close()

In [None]:
## Sentence wise
# dataset = dataset[['asin', 'helpful', 'overall', 'reviewText']]
# dataset['n_words'] = dataset['reviewText'].apply(lambda x: len(w_tokenizer.tokenize(x)))
# dataset['sentences'] = dataset['reviewText'].apply(lambda x: [i.strip() for i in x.split(".")])
# dataset['sentence_word_density'] = dataset['reviewText'].apply(lambda x: len(w_tokenizer.tokenize(x))/ len(x.split(".")))
# dataset.to_csv("reviews_Musical_Instruments_5.csv")

In [None]:
# edge_dict, ignored, taken, count = {}, [], [], 0
# for idxi, i in enumerate(vocabulary.keys()):
#     print(idxi)
#     for idxj, j in enumerate(vocabulary.keys()):
#         if i != j:
#             try:
#                 a = embeddings_index[i]
#                 b = embeddings_index[j]
#                 if get_cosine(a, b) > edges_threshold:
#                     try:
#                         edge_dict[vocabulary[i]] += [vocabulary[j]]
#                         edge_dict[vocabulary[j]] += [vocabulary[i]]
#                     except:
#                         edge_dict[vocabulary[i]] = [vocabulary[j]]
#                         edge_dict[vocabulary[j]] = [vocabulary[i]]
#             except:
#                 try:
#                     embeddings_index[i]
#                     taken.append(i)
#                 except:
#                     ignored.append(i)
#                 try:
#                     embeddings_index[j]
#                 except:
#                     ignored.append(j)
#                     taken.append(j)
#                 pass

In [None]:
# df = {}

# for idx, i in enumerate(dataset[8].values):
#     print(idx)
#     for j in i:
#         try:
#             df[j] += [idx]
#         except:
#             df[j] = [idx]

# for i in df.keys():
#     df[i] = len(list(set(df[i])))

# df_vector = []
# for i in dataset[8].values:
#     d = [0]*len(vocabulary.keys())
#     for j in i:
#         if j in vocabulary.keys():
#             d[vocabulary[j]] = df[j]
#     df_vector.append(d)

# csr = sparse.csr_matrix(np.array(df_vector))
# scipy.sparse.save_npz('resources/df_stackoverflow_5kanswers.npz', csr)

In [None]:
# np.array(scipy.sparse.load_npz('resources/df_stackoverflow_5kanswers.npz').todense())

In [None]:
# dataset = parse("nongit_resources/reviews_Electronics_5.json.gz")
# dataset = pd.DataFrame(list(dataset))
# dataset = dataset.head(N_docs)
# dataset.to_pickle("resources/reviews_Electronics_5")