#### Imports

In [1]:
from collections import Counter

import copy
import nltk
import pickle
import gensim
import multiprocessing
from itertools import combinations

import numpy as np
import pandas as pd

In [2]:
import utils as my_utils

### Required Methods

In [3]:
# Load the review DataFrame; later cells read its `text` and `cleaned` columns.
# NOTE(review): read_pickle deserializes arbitrary pickled objects, so this path
# must only ever point at a trusted, locally produced file.
dataset = pd.read_pickle("datasets/datadf_amazon_patio")

In [4]:
dataset.head(2)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,sentiment,summary,unixReviewTime,reviewTime,cleaned,text
0,A1JZFGZEZVWQPY,B00002N674,"Carter H ""1amazonreviewer@gmail . com""","[4, 4]",Good USA company that stands behind their prod...,4.0,Great Hoses,1308614400,"06 21, 2011","[good, usa, company, stand, behind, product, w...",good usa company stand behind product warranty...
1,A32JCI4AK2JTTG,B00002N674,"Darryl Bennett ""Fuzzy342""","[0, 0]",This is a high quality 8 ply hose. I have had ...,5.0,Gilmour 10-58050 8-ply Flexogen Hose 5/8-Inch ...,1402272000,"06 9, 2014","[high, quality, ply, hose, good, luck, gilmour...",high quality ply hose good luck gilmour hose p...


In [5]:
# Project helper returning four values. Of these, the active cells below only use
# `words` (iterated to pick up embeddings) and `vocabulary` (indexed by word when
# building edges). NOTE(review): exact types/shapes come from utils.py -- confirm there.
count_matrix, tfidf_matrix, vocabulary, words = my_utils.processReviews(dataset['text'].values)

In [6]:
%%time
# Parse the pre-trained vectors from a word2vec-format text file (no `limit` is
# passed, so the whole file is read). The recorded run took roughly 4.5 minutes.
embeddings_index = gensim.models.KeyedVectors.load_word2vec_format("nongit_resources/wiki-news-300d-1M.vec")

CPU times: user 4min 36s, sys: 3.43 s, total: 4min 39s
Wall time: 4min 39s


In [7]:
# Keep only the vectors for words the pre-trained model actually knows.
# KeyedVectors raises KeyError for an out-of-vocabulary word; catching exactly
# that (rather than a bare `except`) still skips unknown words but lets genuine
# failures -- and Ctrl-C -- surface instead of being silently swallowed.
words_embeddings = {}
for word in words:
    try:
        words_embeddings[word] = embeddings_index[word]
    except KeyError:
        continue

In [8]:
# Drop the reference to the full embedding table; from here on only the subset
# copied into `words_embeddings` is used.
embeddings_index = None

In [9]:
len(words_embeddings)

1992

In [10]:
# The words that received a vector. This object is iterated by two separate
# `combinations(...)` passes (one builds the pair list, the other re-attaches the
# cosines by position), so it must not be modified between them.
# NOTE(review): the recorded outputs show u'...' strings, i.e. Python 2, where
# `.keys()` returns a list and every `x in words_with_embeddings` is a linear scan.
words_with_embeddings = words_embeddings.keys()

In [11]:
%%time
# One (vector, vector) tuple per unordered pair of embedded words, in the order
# that `combinations` yields them.
edge_embeds_multi = [
    (words_embeddings[first], words_embeddings[second])
    for first, second in combinations(words_with_embeddings, 2)
]

CPU times: user 618 ms, sys: 67.9 ms, total: 686 ms
Wall time: 684 ms


In [12]:
len(edge_embeds_multi)

1983036

In [13]:
%%time
# Score every vector pair in parallel; Pool() with no argument starts one worker
# per available CPU.
pool = multiprocessing.Pool()
try:
    embeddings_cosines = pool.map(my_utils.get_cosine_multi, edge_embeds_multi)
finally:
    # Always shut the workers down -- even when a task raised -- and wait for
    # them to exit so no worker processes are left behind in the kernel.
    pool.close()
    pool.join()

CPU times: user 6.51 s, sys: 4.5 s, total: 11 s
Wall time: 16.3 s


In [14]:
len(embeddings_cosines)

1983036

In [15]:
%%time
# Re-attach each cosine to its word pair by position: the pairs are regenerated
# in the same order used to build `edge_embeds_multi`. Both orientations of a
# pair are stored so lookups do not need to normalise the key order.
edge_embeddings = {}
for position, pair in enumerate(combinations(words_with_embeddings, 2)):
    cosine = embeddings_cosines[position]
    edge_embeddings[pair] = cosine
    edge_embeddings[pair[::-1]] = cosine

CPU times: user 1.91 s, sys: 396 ms, total: 2.31 s
Wall time: 2.31 s


In [16]:
def get_edges_per_doc(doc):
    """Collect the above-threshold similarity edges among one document's words.

    Reads three notebook-level globals: ``edge_embeddings`` (ordered word pair ->
    similarity, with both orientations of every pair stored), ``edges_threshold``
    (strict lower bound on the similarity) and ``vocabulary`` (indexed by word).

    Parameters
    ----------
    doc : sequence of words
        Iterated once per word for the inner loop, so it must be re-iterable.

    Returns
    -------
    tuple
        ``(edges, edges_all)`` where ``edges`` is a list of
        ``(vocabulary[a], vocabulary[b])`` pairs, each unordered pair appearing
        once in first-encounter order, and ``edges_all`` is the parallel list of
        ``(a, b, similarity)`` triples.
    """
    edges, edges_all = [], []
    # Set mirror of `edges`: O(1) duplicate checks instead of rescanning the list.
    seen = set()
    for first in doc:
        for second in doc:
            pair = (first, second)
            # `edge_embeddings` holds every ordered pair of distinct embedded
            # words, so one dict lookup replaces the two membership scans over
            # the word list.
            if first != second and pair in edge_embeddings:
                sim = edge_embeddings[pair]
                if sim > edges_threshold:
                    edge = (vocabulary[first], vocabulary[second])
                    # Treat (a, b) and (b, a) as the same undirected edge.
                    if edge not in seen and edge[::-1] not in seen:
                        seen.add(edge)
                        edges.append(edge)
                        edges_all.append((first, second, sim))
    return (edges, edges_all)

In [17]:
# Similarity cut-off: a word pair becomes an edge only when its similarity is
# strictly greater than this. Read as a global by get_edges_per_doc, so it has
# to be defined before that function is run.
edges_threshold = 0.45

In [18]:
%%time
# Build the per-document edge lists in parallel on 30 worker processes.
# NOTE(review): get_edges_per_doc reads notebook globals, so the workers must
# inherit them (fork start method) -- confirm on the target platform.
pool = multiprocessing.Pool(30)
try:
    docs_edges_multi = pool.map(get_edges_per_doc, dataset['cleaned'].values)
finally:
    # Always shut the workers down -- even when a task raised -- and wait for
    # them to exit so no worker processes are left behind in the kernel.
    pool.close()
    pool.join()

CPU times: user 10.3 s, sys: 3.97 s, total: 14.3 s
Wall time: 12min 8s


In [19]:
# First element of each worker result: the (vocabulary index, vocabulary index) edges.
docs_edges = [per_doc[0] for per_doc in docs_edges_multi]

In [20]:
# Second element of each worker result: the (word, word, similarity) triples.
docs_edges_all = [per_doc[1] for per_doc in docs_edges_multi]

In [21]:
dataset['text'].values[1]

u'high quality ply hose good luck gilmour hose past good choice hose'

In [22]:
docs_edges_all[1]

[(u'high', u'quality', 0.5195401906967163),
 (u'high', u'good', 0.5427482724189758),
 (u'quality', u'good', 0.5396069288253784),
 (u'quality', u'choice', 0.5316603779792786),
 (u'good', u'luck', 0.554854154586792),
 (u'luck', u'choice', 0.46588897705078125)]

In [23]:
# Median number of edges per document.
np.median(list(map(len, docs_edges)))

100.0

In [24]:
# Mean number of edges per document.
np.mean(list(map(len, docs_edges)))

241.0197631439994

In [25]:
# How many documents ended up with no edges at all (shape of the index array).
(np.array(list(map(len, docs_edges))) == 0).nonzero()[0].shape

(34,)

In [26]:
# Histogram: edges-per-document -> number of documents with that many edges.
Counter(map(len, docs_edges))

Counter({0: 34,
         1: 61,
         2: 70,
         3: 113,
         4: 104,
         5: 116,
         6: 121,
         7: 122,
         8: 135,
         9: 126,
         10: 138,
         11: 129,
         12: 126,
         13: 116,
         14: 124,
         15: 114,
         16: 123,
         17: 107,
         18: 99,
         19: 96,
         20: 94,
         21: 94,
         22: 98,
         23: 87,
         24: 90,
         25: 94,
         26: 96,
         27: 89,
         28: 80,
         29: 82,
         30: 73,
         31: 72,
         32: 68,
         33: 73,
         34: 80,
         35: 60,
         36: 78,
         37: 69,
         38: 69,
         39: 61,
         40: 74,
         41: 69,
         42: 59,
         43: 68,
         44: 68,
         45: 58,
         46: 53,
         47: 60,
         48: 60,
         49: 59,
         50: 65,
         51: 52,
         52: 66,
         53: 52,
         54: 46,
         55: 57,
         56: 54,
         57: 54,
         

In [27]:
# Persist the per-document edge lists. The context manager guarantees the file
# is flushed and closed even if pickling fails part-way through.
with open("resources/edges_amazon_patio_fasttext_nontrained.pickle", "wb") as pickle_out:
    pickle.dump(docs_edges, pickle_out)

# Appendix

In [28]:
# %%time
# edges_threshold = 0.73
# docs_edges = []
# docs_edges_all = []
# for idx, doc in enumerate(dataset[10].values):
#     my_utils.print_if_mod(idx, 500)
#     edges, edges_all = [], []
#     for i in doc:
#         for j in doc:
#             if i != j:
#                 sim = edge_embeddings[(i, j)]
#                 if sim > edges_threshold and (vocabulary[i], vocabulary[j]) not in edges and (vocabulary[j], vocabulary[i]) not in edges:
#                     edges.append((vocabulary[i], vocabulary[j]))
#                     edges_all.append((i, j, sim))
#     docs_edges.append(edges)
#     docs_edges_all.append(edges_all)

In [29]:
# def loadGloveModel(gloveFile):
#     print("Loading Glove Model")
#     f = open(gloveFile,'r')
#     model = {}
#     for line in f:
#         splitLine = line.split()
#         word = splitLine[0]
#         embedding = np.array([float(val) for val in splitLine[1:]])
#         model[word] = embedding
#     print("Done.",len(model)," words loaded!")
#     return model

In [30]:
# %%time
# embeddings_index = loadGloveModel("nongit_resources/glove.42B.300d.txt")

In [31]:
# %%time
# embeddings_index = gensim.models.KeyedVectors.load_word2vec_format("nongit_resources/electronics.vec")

In [32]:
# %%time
# edges_threshold = 0.8
# docs_edges, ignored, taken, count = [], [], [], 0
# for idx, doc in enumerate(dataset[8].values):
#     edges = []
#     print(idx)
#     for i in doc:
#         for j in doc:
#             if i != j:
#                 try:
#                     a = embeddings_index[i]
#                     b = embeddings_index[j]
#                     if get_cosine(a, b) > edges_threshold and (vocabulary[i], vocabulary[j]) not in edges and (vocabulary[j], vocabulary[i]) not in edges:
#                         edges.append((vocabulary[i], vocabulary[j]))
#                 except:
#                     try:
#                         embeddings_index[i]
#                         taken.append(i)
#                     except:
#                         ignored.append(i)
#                     try:
#                         embeddings_index[j]
#                     except:
#                         ignored.append(j)
#                         taken.append(j)
#                     pass
#     docs_edges.append(edges)

In [33]:
# pickle_out = open("resources/docs_edges_" + dataset_name + "_5k_fasttext_trained.pickle","wb")
# pickle.dump(docs_edges, pickle_out)
# pickle_out.close()

In [34]:
## Sentence wise
# dataset = dataset[['asin', 'helpful', 'overall', 'reviewText']]
# dataset['n_words'] = dataset['reviewText'].apply(lambda x: len(w_tokenizer.tokenize(x)))
# dataset['sentences'] = dataset['reviewText'].apply(lambda x: [i.strip() for i in x.split(".")])
# dataset['sentence_word_density'] = dataset['reviewText'].apply(lambda x: len(w_tokenizer.tokenize(x))/ len(x.split(".")))
# dataset.to_csv("reviews_Musical_Instruments_5.csv")

In [35]:
# edge_dict, ignored, taken, count = {}, [], [], 0
# for idxi, i in enumerate(vocabulary.keys()):
#     print(idxi)
#     for idxj, j in enumerate(vocabulary.keys()):
#         if i != j:
#             try:
#                 a = embeddings_index[i]
#                 b = embeddings_index[j]
#                 if get_cosine(a, b) > edges_threshold:
#                     try:
#                         edge_dict[vocabulary[i]] += [vocabulary[j]]
#                         edge_dict[vocabulary[j]] += [vocabulary[i]]
#                     except:
#                         edge_dict[vocabulary[i]] = [vocabulary[j]]
#                         edge_dict[vocabulary[j]] = [vocabulary[i]]
#             except:
#                 try:
#                     embeddings_index[i]
#                     taken.append(i)
#                 except:
#                     ignored.append(i)
#                 try:
#                     embeddings_index[j]
#                 except:
#                     ignored.append(j)
#                     taken.append(j)
#                 pass

In [36]:
# df = {}

# for idx, i in enumerate(dataset[8].values):
#     print(idx)
#     for j in i:
#         try:
#             df[j] += [idx]
#         except:
#             df[j] = [idx]

# for i in df.keys():
#     df[i] = len(list(set(df[i])))

# df_vector = []
# for i in dataset[8].values:
#     d = [0]*len(vocabulary.keys())
#     for j in i:
#         if j in vocabulary.keys():
#             d[vocabulary[j]] = df[j]
#     df_vector.append(d)

# csr = sparse.csr_matrix(np.array(df_vector))
# scipy.sparse.save_npz('resources/df_stackoverflow_5kanswers.npz', csr)

In [37]:
# np.array(scipy.sparse.load_npz('resources/df_stackoverflow_5kanswers.npz').todense())

In [38]:
# dataset = parse("nongit_resources/reviews_Electronics_5.json.gz")
# dataset = pd.DataFrame(list(dataset))
# dataset = dataset.head(N_docs)
# dataset.to_pickle("resources/reviews_Electronics_5")