#### Imports

In [1]:
from collections import Counter

import copy
import nltk
import pickle
import gensim
import multiprocessing
from itertools import combinations

import numpy as np
import pandas as pd

In [2]:
import utils as my_utils

### Required Methods

In [3]:
# Load the review DataFrame (it carries the 'cleaned' and 'text' columns used below).
# NOTE: unpickling can execute arbitrary code; only load pickle files from a trusted source.
dataset = pd.read_pickle("datasets/datadf_amazon_musical")

In [4]:
# Preview the first two rows to check the columns that were loaded.
dataset.head(2)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,unixReviewTime,reviewText,sentiment,reviewTime,summary,cleaned,text
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...","[0, 0]",1393545600,"Not much to write about here, but it does exac...",5.0,"02 28, 2014",good,"[much, write, doe, exactly, supposed, filter, ...",much write doe exactly supposed filter pop sou...
1,A14VAT5EAX3D9S,1384719342,Jake,"[13, 14]",1363392000,The product does exactly as it should and is q...,5.0,"03 16, 2013",Jake,"[product, doe, exactly, quite, affordable, rea...",product doe exactly quite affordable realized ...


In [5]:
# Run the project helper over the cleaned review strings.
# In the cells below, `vocabulary` is indexed by token and `words` is iterated to pick
# the tokens that have embeddings; the second return value is discarded.
# NOTE(review): the exact types/shapes are defined in my_utils.processReviews - confirm there.
count_matrix, _, vocabulary, words = my_utils.processReviews(dataset['text'].values)

In [6]:
def loadGloveModel(gloveFile):
    """Read a GloVe text file into a ``{word: vector}`` dictionary.

    Each non-blank line is expected to hold a word followed by its
    whitespace-separated float components.

    Parameters
    ----------
    gloveFile : str
        Path of the GloVe text file.

    Returns
    -------
    dict
        Maps each word (first field of a line) to a 1-D ``numpy`` array built
        from the remaining fields.

    Raises
    ------
    ValueError
        If a component cannot be parsed as a float.
    """
    print("Loading Glove Model")
    model = {}
    # The context manager guarantees the (large) file is closed, even when a
    # malformed line makes float() raise half-way through.
    with open(gloveFile, 'r') as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # Blank line (e.g. trailing newline at end of file): nothing to parse.
                continue
            word = splitLine[0]
            embedding = np.array([float(val) for val in splitLine[1:]])
            model[word] = embedding
    print("Done.",len(model)," words loaded!")
    return model

In [7]:
%%time
# Load the GloVe vectors from disk (about 35 s for 400,000 words, see the output below).
embeddings_index = loadGloveModel("nongit_resources/glove.6B.300d.txt")

Loading Glove Model
('Done.', 400000, ' words loaded!')
CPU times: user 34.1 s, sys: 838 ms, total: 35 s
Wall time: 35 s


In [8]:
# %%time
# embeddings_index = gensim.models.KeyedVectors.load_word2vec_format("nongit_resources/wiki-news-300d-1M.vec")

In [9]:
# Keep only the words for which the embedding table has a vector.
words_embeddings = {}
for word in words:
    try:
        words_embeddings[word] = embeddings_index[word]
    except KeyError:
        # No vector for this word: skip it. Only the "missing key" case is
        # ignored, so interrupts and genuine errors are no longer swallowed.
        continue

In [10]:
# Drop the reference to the full embedding table so its memory can be reclaimed;
# the cells below only read words_embeddings.
embeddings_index = None

In [11]:
# Number of words that have an embedding vector.
len(words_embeddings)

1984

In [12]:
# The words that have a vector. This collection is iterated twice with
# combinations() (to build edge_embeds_multi and later edge_embeddings), and the
# second pass indexes results by position, so words_embeddings must not be
# modified between those two cells.
words_with_embeddings = words_embeddings.keys()

In [13]:
%%time
# One (vector, vector) tuple per unordered pair of words, in combinations() order.
edge_embeds_multi = [
    (words_embeddings[first], words_embeddings[second])
    for first, second in combinations(words_with_embeddings, 2)
]

CPU times: user 576 ms, sys: 47.6 ms, total: 624 ms
Wall time: 623 ms


In [14]:
# Number of unordered word pairs: n * (n - 1) / 2 for n = len(words_embeddings).
len(edge_embeds_multi)

1967136

In [15]:
# Number of worker processes for the multiprocessing pools below: at most 25,
# but never more than the CPUs this machine reports, to avoid oversubscription.
n_cores = min(25, multiprocessing.cpu_count())

In [16]:
%%time
# Compute one cosine per (vector, vector) pair in parallel; results keep the input order.
pool = multiprocessing.Pool(n_cores)
try:
    embeddings_cosines = pool.map(my_utils.get_cosine_multi, edge_embeds_multi)
finally:
    # close() only stops new submissions; join() waits for the workers to exit
    # so no processes are left behind, even if map() raised.
    pool.close()
    pool.join()

CPU times: user 4.69 s, sys: 1.12 s, total: 5.81 s
Wall time: 11.1 s


In [17]:
# Sanity check: there should be one cosine per word pair (same as len(edge_embeds_multi)).
len(embeddings_cosines)

1967136

In [18]:
%%time
# Symmetric lookup table: both (a, b) and (b, a) map to the cosine computed for
# that pair. Pairs are regenerated in the same combinations() order that was
# used to build edge_embeds_multi, so positions line up with embeddings_cosines.
edge_embeddings = {}
word_pairs = combinations(words_with_embeddings, 2)
for position, pair in enumerate(word_pairs):
    first, second = pair
    cosine = embeddings_cosines[position]
    edge_embeddings[(first, second)] = cosine
    edge_embeddings[(second, first)] = cosine

CPU times: user 1.88 s, sys: 296 ms, total: 2.17 s
Wall time: 2.17 s


In [19]:
def get_edges_per_doc(doc, threshold=None):
    """Build the similarity edges for one tokenised document.

    Every ordered pair of distinct tokens that both have an embedding is looked
    up in the global ``edge_embeddings`` table. A pair whose similarity is
    strictly greater than the threshold becomes an edge; each edge is recorded
    once, in whichever direction is met first.

    Parameters
    ----------
    doc : iterable
        Tokens of the document (hashable; used as keys of ``words_embeddings``
        and ``vocabulary``).
    threshold : float, optional
        Similarity cut-off. Defaults to the global ``edges_threshold`` so the
        function can still be called with a single argument (e.g. ``pool.map``).

    Returns
    -------
    tuple
        ``(edges, edges_all)`` where ``edges`` is a list of
        ``(vocabulary[i], vocabulary[j])`` pairs and ``edges_all`` is the
        parallel list of ``(i, j, similarity)`` triples.
    """
    if threshold is None:
        threshold = edges_threshold

    # A repeated token can never add a new edge (its pairs are already recorded),
    # so keep each eligible token once, in first-seen order. Membership is tested
    # against the words_embeddings dict (constant time) rather than its key list.
    tokens, known = [], set()
    for token in doc:
        if token in words_embeddings and token not in known:
            known.add(token)
            tokens.append(token)

    edges, edges_all = [], []
    recorded = set()  # same content as `edges`, for constant-time duplicate checks
    for i in tokens:
        for j in tokens:
            if i == j:
                continue
            sim = edge_embeddings[(i, j)]
            if sim > threshold:
                pair = (vocabulary[i], vocabulary[j])
                # Skip the edge if it was already stored in either direction.
                if pair not in recorded and (pair[1], pair[0]) not in recorded:
                    recorded.add(pair)
                    edges.append(pair)
                    edges_all.append((i, j, sim))
    return (edges, edges_all)

In [42]:
# Similarity cut-off read by get_edges_per_doc: only word pairs whose similarity
# is strictly greater than this value become edges.
edges_threshold = 0.40

In [43]:
%%time
# Extract the edges of every cleaned document in parallel; one (edges, edges_all)
# tuple per document, in dataset order.
pool = multiprocessing.Pool(n_cores)
try:
    docs_edges_multi = pool.map(get_edges_per_doc, dataset['cleaned'].values)
finally:
    # close() only stops new submissions; join() waits for the workers to exit
    # so no processes are left behind, even if map() raised.
    pool.close()
    pool.join()

CPU times: user 4.9 s, sys: 2.39 s, total: 7.29 s
Wall time: 19min 20s


In [44]:
# First element of each per-document result: the (vocabulary id, vocabulary id) edges.
docs_edges = [per_doc[0] for per_doc in docs_edges_multi]

In [45]:
# Second element of each per-document result: the (word, word, similarity) triples.
docs_edges_all = [per_doc[1] for per_doc in docs_edges_multi]

In [46]:
# Show the cleaned text of the second review, to compare against its edges below.
dataset['text'].values[1]

u'product doe exactly quite affordable realized wa double screened arrived wa even better expected added bonus one screen carry small hint smell old grape candy used buy reminiscent sake cannot stop putting pop filter next nose smelling recording dif needed pop filter work well expensive one may even come pleasing aroma like mine buy product'

In [47]:
# The (word, word, similarity) edges extracted for that same review.
docs_edges_all[1]

[('product', 'better', 0.4106991194869558),
 ('product', 'expensive', 0.40069318015930255),
 ('exactly', 'quite', 0.5669255403241442),
 ('exactly', 'realized', 0.5023455903217898),
 ('exactly', 'even', 0.5168126051843092),
 ('exactly', 'better', 0.47993142534283384),
 ('exactly', 'one', 0.455190065245602),
 ('exactly', 'cannot', 0.47477393208222185),
 ('exactly', 'next', 0.4074414185899642),
 ('exactly', 'needed', 0.4299004829874379),
 ('exactly', 'come', 0.5472758726519136),
 ('exactly', 'like', 0.45478411224838466),
 ('quite', 'realized', 0.5439656449934229),
 ('quite', 'even', 0.6250165124916807),
 ('quite', 'better', 0.5439010618787582),
 ('quite', 'one', 0.4234959319259448),
 ('quite', 'small', 0.42263571575013004),
 ('quite', 'cannot', 0.45586089370702265),
 ('quite', 'work', 0.4092047894123514),
 ('quite', 'well', 0.6196752969258256),
 ('quite', 'expensive', 0.4197722398623449),
 ('quite', 'come', 0.5154173387111795),
 ('quite', 'like', 0.5015904961541011),
 ('affordable', 'buy'

In [48]:
# Median number of edges per document.
np.median(list(map(len, docs_edges)))

36.0

In [49]:
# Mean number of edges per document.
np.mean(list(map(len, docs_edges)))

138.80407960199005

In [50]:
# How many documents ended up with no edges at all (shape of the index array).
np.where(np.array(list(map(len, docs_edges))) == 0)[0].shape

(37,)

In [51]:
# Distribution of edge counts: {number of edges: number of documents}.
Counter(map(len, docs_edges))

Counter({0: 37,
         1: 86,
         2: 127,
         3: 166,
         4: 151,
         5: 227,
         6: 215,
         7: 225,
         8: 215,
         9: 214,
         10: 245,
         11: 187,
         12: 201,
         13: 202,
         14: 166,
         15: 178,
         16: 156,
         17: 143,
         18: 128,
         19: 152,
         20: 127,
         21: 116,
         22: 116,
         23: 125,
         24: 118,
         25: 93,
         26: 87,
         27: 80,
         28: 101,
         29: 74,
         30: 95,
         31: 81,
         32: 80,
         33: 91,
         34: 81,
         35: 81,
         36: 71,
         37: 77,
         38: 49,
         39: 54,
         40: 53,
         41: 47,
         42: 51,
         43: 54,
         44: 67,
         45: 55,
         46: 65,
         47: 54,
         48: 38,
         49: 43,
         50: 54,
         51: 40,
         52: 44,
         53: 41,
         54: 30,
         55: 38,
         56: 48,
         57: 36,


In [52]:
# Persist the per-document edge lists. The context manager closes the file even
# if pickling fails part-way through.
with open("resources/edges_amazon_musical_glove_nontrained_0.40.pickle", "wb") as pickle_out:
    pickle.dump(docs_edges, pickle_out)

# Appendix

In [31]:
# %%time
# edges_threshold = 0.73
# docs_edges = []
# docs_edges_all = []
# for idx, doc in enumerate(dataset[10].values):
#     my_utils.print_if_mod(idx, 500)
#     edges, edges_all = [], []
#     for i in doc:
#         for j in doc:
#             if i != j:
#                 sim = edge_embeddings[(i, j)]
#                 if sim > edges_threshold and (vocabulary[i], vocabulary[j]) not in edges and (vocabulary[j], vocabulary[i]) not in edges:
#                     edges.append((vocabulary[i], vocabulary[j]))
#                     edges_all.append((i, j, sim))
#     docs_edges.append(edges)
#     docs_edges_all.append(edges_all)

In [32]:
# def loadGloveModel(gloveFile):
#     print("Loading Glove Model")
#     f = open(gloveFile,'r')
#     model = {}
#     for line in f:
#         splitLine = line.split()
#         word = splitLine[0]
#         embedding = np.array([float(val) for val in splitLine[1:]])
#         model[word] = embedding
#     print("Done.",len(model)," words loaded!")
#     return model

In [33]:
# %%time
# embeddings_index = loadGloveModel("nongit_resources/glove.42B.300d.txt")

In [34]:
# %%time
# embeddings_index = gensim.models.KeyedVectors.load_word2vec_format("nongit_resources/electronics.vec")

In [35]:
# %%time
# edges_threshold = 0.8
# docs_edges, ignored, taken, count = [], [], [], 0
# for idx, doc in enumerate(dataset[8].values):
#     edges = []
#     print(idx)
#     for i in doc:
#         for j in doc:
#             if i != j:
#                 try:
#                     a = embeddings_index[i]
#                     b = embeddings_index[j]
#                     if get_cosine(a, b) > edges_threshold and (vocabulary[i], vocabulary[j]) not in edges and (vocabulary[j], vocabulary[i]) not in edges:
#                         edges.append((vocabulary[i], vocabulary[j]))
#                 except:
#                     try:
#                         embeddings_index[i]
#                         taken.append(i)
#                     except:
#                         ignored.append(i)
#                     try:
#                         embeddings_index[j]
#                     except:
#                         ignored.append(j)
#                         taken.append(j)
#                     pass
#     docs_edges.append(edges)

In [36]:
# pickle_out = open("resources/docs_edges_" + dataset_name + "_5k_fasttext_trained.pickle","wb")
# pickle.dump(docs_edges, pickle_out)
# pickle_out.close()

In [37]:
## Sentence wise
# dataset = dataset[['asin', 'helpful', 'overall', 'reviewText']]
# dataset['n_words'] = dataset['reviewText'].apply(lambda x: len(w_tokenizer.tokenize(x)))
# dataset['sentences'] = dataset['reviewText'].apply(lambda x: [i.strip() for i in x.split(".")])
# dataset['sentence_word_density'] = dataset['reviewText'].apply(lambda x: len(w_tokenizer.tokenize(x))/ len(x.split(".")))
# dataset.to_csv("reviews_Musical_Instruments_5.csv")

In [38]:
# edge_dict, ignored, taken, count = {}, [], [], 0
# for idxi, i in enumerate(vocabulary.keys()):
#     print(idxi)
#     for idxj, j in enumerate(vocabulary.keys()):
#         if i != j:
#             try:
#                 a = embeddings_index[i]
#                 b = embeddings_index[j]
#                 if get_cosine(a, b) > edges_threshold:
#                     try:
#                         edge_dict[vocabulary[i]] += [vocabulary[j]]
#                         edge_dict[vocabulary[j]] += [vocabulary[i]]
#                     except:
#                         edge_dict[vocabulary[i]] = [vocabulary[j]]
#                         edge_dict[vocabulary[j]] = [vocabulary[i]]
#             except:
#                 try:
#                     embeddings_index[i]
#                     taken.append(i)
#                 except:
#                     ignored.append(i)
#                 try:
#                     embeddings_index[j]
#                 except:
#                     ignored.append(j)
#                     taken.append(j)
#                 pass

In [39]:
# df = {}

# for idx, i in enumerate(dataset[8].values):
#     print(idx)
#     for j in i:
#         try:
#             df[j] += [idx]
#         except:
#             df[j] = [idx]

# for i in df.keys():
#     df[i] = len(list(set(df[i])))

# df_vector = []
# for i in dataset[8].values:
#     d = [0]*len(vocabulary.keys())
#     for j in i:
#         if j in vocabulary.keys():
#             d[vocabulary[j]] = df[j]
#     df_vector.append(d)

# csr = sparse.csr_matrix(np.array(df_vector))
# scipy.sparse.save_npz('resources/df_stackoverflow_5kanswers.npz', csr)

In [40]:
# np.array(scipy.sparse.load_npz('resources/df_stackoverflow_5kanswers.npz').todense())

In [41]:
# dataset = parse("nongit_resources/reviews_Electronics_5.json.gz")
# dataset = pd.DataFrame(list(dataset))
# dataset = dataset.head(N_docs)
# dataset.to_pickle("resources/reviews_Electronics_5")