### Imports

In [1]:
from tqdm import tqdm
from collections import Counter
from nltk.corpus import stopwords
from itertools import combinations
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import imp, gzip
import pickle, nltk
import gensim
import multiprocessing

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import utils as my_utils

### Definitions

In [2]:
def get_edges(i, similarity=None):
    """Link every word present in a document to the similar words it co-occurs with.

    Parameters
    ----------
    i : array-like
        One row of the document-term matrix; entries greater than zero mark
        the vocabulary indices that occur in the document.
    similarity : 2-D indexable, optional
        Square matrix in which a truthy ``similarity[p][q]`` means vocabulary
        entries ``p`` and ``q`` count as similar. When omitted, the
        notebook-level ``word_similarity`` matrix is used, so existing
        single-argument calls (e.g. ``pool.map(get_edges, rows)``) behave
        exactly as before.

    Returns
    -------
    dict
        Maps each present vocabulary index to the list of other present
        indices it is similar to. Indices with no similar partner map to an
        empty list. Every similar pair is recorded in both directions.
    """
    if similarity is None:
        # Fall back to the global built later in the notebook; looked up at
        # call time so the function can be defined before the matrix exists.
        similarity = word_similarity

    present = np.where(np.asarray(i) > 0)[0]
    embeds = {j: [] for j in present}

    # combinations() yields each unordered pair once, hence the symmetric append.
    for p, q in combinations(present, 2):
        if similarity[p][q]:
            embeds[p].append(q)
            embeds[q].append(p)
    return embeds

In [3]:
def parse(path):
    """Lazily yield one evaluated record per line of a gzip-compressed file.

    Parameters
    ----------
    path : str
        Location of the gzip archive; it is opened in binary mode.

    Yields
    ------
    object
        The result of evaluating each line as a Python expression.
    """
    # The context manager closes the archive when the generator is exhausted,
    # explicitly closed, or garbage collected (previously the handle leaked).
    with gzip.open(path, 'rb') as g:
        for l in g:
            # SECURITY: eval() executes arbitrary Python found in the file.
            # Only use this on trusted data; consider json.loads or
            # ast.literal_eval if the file format allows it.
            yield eval(l)

def getDF(path):
    """Collect every record yielded by ``parse(path)`` into a DataFrame.

    Records are keyed by their 0-based position in the stream, which becomes
    the row index; a tqdm progress counter is shown while reading.
    """
    indexed_records = dict(enumerate(tqdm(parse(path))))
    return pd.DataFrame.from_dict(indexed_records, orient='index')

In [4]:
def process_df(df):
    """Add a ``text`` column holding the preprocessed ``reviewText`` column.

    The frame passed in is modified in place (the ``text`` column is created
    or overwritten) and the same object is returned.
    """
    cleaned_reviews = my_utils.preprocess(df['reviewText'])
    df['text'] = cleaned_reviews
    return df

In [5]:
def loadGloveModel(gloveFile):
    """Read a whitespace-separated embedding text file into a dictionary.

    Each non-blank line is split on whitespace: the first token is the word,
    the remaining tokens are parsed as floats and stored as a numpy array.

    Parameters
    ----------
    gloveFile : str
        Path to the embedding file; it is read as UTF-8 text.

    Returns
    -------
    dict
        Maps each word to its embedding vector (``numpy.ndarray`` of floats).
        If a word appears more than once, the last occurrence wins.
    """
    print("Loading Glove Model")
    model = {}
    # ``with`` closes the file even if a malformed line raises part-way through.
    with open(gloveFile, 'r', encoding='utf8') as f:
        for line in tqdm(f):
            splitLine = line.split()
            if not splitLine:
                # Blank line (e.g. a trailing newline): nothing to parse.
                continue
            word = splitLine[0]
            model[word] = np.array([float(val) for val in splitLine[1:]])
    print("Done.",len(model)," words loaded!")
    return model

### Config

In [6]:
# Cosine-similarity threshold: word pairs scoring strictly above this value
# are treated as similar when the similarity matrix is binarised.
cutoff = .35
# Run label; used as the stem of the output file names written under resources/.
title = "electronics_glove_random_500"

# Vocabulary limits, passed to CountVectorizer as min_df / max_df / max_features.
min_df = 5
max_df = .5
max_features = 50000

# Number of worker processes given to multiprocessing.Pool.
n_cores = 30
# Number of reviews sampled from the full dataset.
n_docs = 500

### Start

In [None]:
# Load the full gzip-compressed review file into a DataFrame (one row per
# record). The sampling cell further down draws from `dataset_`.
dataset_ = getDF('datasets/reviews_Electronics_5.json.gz')
dataset_.shape

1689188it [01:28, 19176.09it/s]


In [None]:
# Length of the zero vector substituted for out-of-vocabulary words later on;
# presumably must equal the vector size stored in the file below ("300d") — confirm.
embedding_dim = 300
# word -> embedding vector dictionary.
embeddings_index = loadGloveModel("nongit_resources/glove.6B.300d.txt")

In [None]:
# NLTK English stop-word list. NOTE(review): not referenced by any later cell
# shown here — the CountVectorizer below uses stop_words="english" instead.
stop_words = stopwords.words('english')

In [None]:
# Draw n_docs reviews at random. A fixed seed makes the sample — and every
# artefact derived from it (vocabulary, edges, pickles) — reproducible across
# kernel restarts; change `sample_seed` to obtain a different draw.
sample_seed = 42
dataset = dataset_.sample(n_docs, random_state=sample_seed)

In [None]:
# Rows per chunk. Clamped to at least 1: with fewer rows than workers the
# integer division would give 0 and range() below would reject a zero step.
n = max(1, dataset.shape[0] // n_cores)
list_df = [dataset[i:i+n] for i in range(0, dataset.shape[0], n)]

# Preprocess the chunks in parallel; try/finally makes sure the worker
# processes are shut down and reaped even if a chunk raises.
pool = multiprocessing.Pool(n_cores)
try:
    processed_list_df = pool.map(process_df, list_df)
finally:
    pool.close()
    pool.join()

# pool.map preserves chunk order, so concatenation restores the row order.
dataset = pd.concat(processed_list_df)
dataset.shape

In [None]:
# Bag-of-words vectorizer: word-level analysis, scikit-learn's built-in
# English stop-word list, and the vocabulary limits from the config cell.
vectorizer = CountVectorizer(analyzer="word",tokenizer=None,preprocessor=None,
                             stop_words="english", max_features=max_features,
                             max_df=max_df, min_df=min_df)

In [None]:
# Dense document-term count matrix: one row per review, one column per word.
wordOccurenceMatrix = vectorizer.fit_transform(dataset.text.tolist()).toarray()

# Vocabulary in column order. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2; prefer its replacement when available
# and convert to a plain list so `words` keeps the type it had before.
if hasattr(vectorizer, "get_feature_names_out"):
    words = vectorizer.get_feature_names_out().tolist()
else:
    words = vectorizer.get_feature_names()

In [None]:
# One embedding row per vocabulary word, in vocabulary order. Words missing
# from the embedding index fall back to an all-zero vector of length
# embedding_dim.
missing_embedding = np.array([0]*embedding_dim)
word_embeddings = np.array([
    embeddings_index.get(token, missing_embedding).tolist()
    for token in tqdm(words)
])

In [None]:
# Pairwise cosine similarity between word vectors, binarised to 1 where the
# score exceeds `cutoff` and 0 elsewhere.
word_similarity = (cosine_similarity(word_embeddings) > cutoff).astype(int)
# A word is never counted as its own neighbour.
np.fill_diagonal(word_similarity, 0)

In [None]:
# Presence/absence version of the count matrix: every count above 1 is capped
# at 1, all other entries are kept. np.where builds a new array, so the
# original counts stay untouched.
wordOccuranceMatrixBinary = np.where(wordOccurenceMatrix > 1, 1, wordOccurenceMatrix)
wordOccuranceMatrixBinary.shape

### POS

In [None]:
# Part-of-speech mask over the vocabulary: 1 for adjectives (JJ, JJR, JJS) and
# nouns (NN, NNS, NNP, NNPS), 0 for every other tag. Built with an explicit
# membership test instead of writing ints into a string array and comparing
# against '1' afterwards.
kept_pos_tags = {'JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS'}
pp = np.array([tag in kept_pos_tags for _, tag in nltk.pos_tag(words)], dtype=int)

In [None]:
# Blank out (in place) every vocabulary column whose POS mask entry is not 1,
# so only the kept parts of speech remain marked as present.
masked_out_columns = pp != 1
wordOccuranceMatrixBinary[:, masked_out_columns] = 0

In [None]:
%%time
# Compute the per-document similar-word edges in parallel, one matrix row per
# task. get_edges reads the notebook-level `word_similarity` matrix.
# try/finally shuts the workers down even if a task raises.
pool = multiprocessing.Pool(n_cores)
try:
    similar_words = pool.map(get_edges, wordOccuranceMatrixBinary)
finally:
    pool.close()
    pool.join()

# Persist the edges; the context manager flushes and closes the file even if
# pickling fails part-way through.
with open("resources/"+ title +"_" + str(cutoff) + ".pickle","wb") as pickle_out:
    pickle.dump(similar_words, pickle_out)

In [None]:
# Save the sampled, preprocessed reviews alongside the edge pickle, using the
# same "<title>_<cutoff>" file-name stem.
dataset.to_pickle("resources/"+ title + "_" + str(cutoff) +"_dataset")

### Appendix

In [None]:
# wordOccuranceMatrixBinary[0].sum()

# np.sum(wordOccuranceMatrixBinary)

# Counter(np.array([i[1] for i in nltk.pos_tag(words)]))

# pp.sum()

# np.where(pp!=1)[0].shape

In [None]:
#     pd = pd.apply(lambda x: convert_numbers(x))

In [None]:
# def process_df(df):
#     df['text'] = preprocess(df['reviewText'])
    
# #     pool = multiprocessing.Pool(n_cores)
# #     df['cleaned'] = pool.map(process_l, df['text'].tolist())
# #     pool.close()
    
# #     df['text'] = df['cleaned'].apply(lambda x: " ".join(x))
#     return df

In [None]:
# p = [item for sublist in dataset['cleaned'].tolist() for item in sublist]

In [None]:
# sorted(Counter(p))

In [None]:
# def process_l(s):
#     return [i.lemma_ for i in sp(s) if i.lemma_ not in '-PRON-']

In [None]:
# l = dataset['text'].tolist()

In [None]:
# pool = multiprocessing.Pool(n_cores)
# processed_l = pool.map(process_l, l)
# pool.close()

In [None]:
# joblib.dump(sampler, "resources/sampler_20iter_0.5_1")

In [None]:
# pickle_out = open("resources/amazon_muiscal_glove_0.4.pickle","wb")
# pickle.dump(similar_words, pickle_out)
# pickle_out.close()