### Imports

In [1]:
from tqdm import tqdm
from collections import Counter
from nltk.corpus import stopwords
from itertools import combinations
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from bert_embedding import BertEmbedding
from allennlp.commands.elmo import ElmoEmbedder

import imp, gzip
import pickle, nltk
import gensim
import multiprocessing

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import utils as my_utils

### Definitions

In [2]:
def get_edges(i):
    """Return, for one row vector, the similar-word neighbours of each active column.

    Reads the module-level ``word_similarity`` matrix (indexed as
    ``word_similarity[p][q]``) to decide whether two columns are linked.

    Parameters
    ----------
    i : numpy array
        1-D vector; columns with a value > 0 are considered active.
        (Presumably one row of the binarised document-term matrix -- confirm
        against the caller.)

    Returns
    -------
    dict
        Maps every active column index to the list of other active column
        indices for which ``word_similarity`` is truthy.
    """
    active = np.where(i > 0)[0]
    neighbours = {column: [] for column in active}

    # Every unordered pair is visited once and recorded in both directions.
    for left, right in combinations(active, 2):
        if not word_similarity[left][right]:
            continue
        neighbours[left].append(right)
        neighbours[right].append(left)

    return neighbours

In [3]:
def parse(path):
    """Yield one evaluated record per line of a gzip-compressed file.

    Parameters
    ----------
    path : str
        Path of the gzip file; it is opened in binary mode.

    Yields
    ------
    object
        The result of ``eval`` applied to each raw line.
    """
    # The context manager closes the gzip handle once the generator is
    # exhausted, closed, or garbage-collected; previously it was never closed.
    with gzip.open(path, 'rb') as g:
        for l in g:
            # NOTE(review/security): eval() executes whatever Python expression
            # the line contains. Only use this on trusted dumps; if the file is
            # strict JSON or pure literals, json.loads / ast.literal_eval would
            # be the safe replacement -- confirm the file format before switching.
            yield eval(l)

def getDF(path):
    """Load every record produced by ``parse(path)`` into a DataFrame.

    Records are keyed by their 0-based position in the file, so the resulting
    frame has one row per record with that position as its index.

    Parameters
    ----------
    path : str
        Path handed to ``parse``.

    Returns
    -------
    pandas.DataFrame
    """
    records = {}
    for position, record in enumerate(tqdm(parse(path))):
        records[position] = record
    return pd.DataFrame.from_dict(records, orient='index')

In [4]:
def process_df(df):
    """Replace the ``text`` column of ``df`` with its preprocessed version.

    The column is overwritten on the frame that was passed in, and that same
    frame is returned. The cleaning itself is delegated to
    ``my_utils.preprocess``.
    """
    cleaned_text = my_utils.preprocess(df['text'])
    df['text'] = cleaned_text
    return df

In [5]:
def loadGloveModel(gloveFile):
    """Read a whitespace-separated embedding text file into a dict.

    Each non-blank line is split on whitespace; the first token is the word and
    the remaining tokens are parsed as floats.

    Parameters
    ----------
    gloveFile : str
        Path of the UTF-8 text file.

    Returns
    -------
    dict
        Maps each word to a 1-D numpy array of its float components. If a word
        occurs on several lines the last one wins.
    """
    print("Loading Glove Model")
    model = {}
    # `with` guarantees the handle is closed even if a line fails to parse.
    with open(gloveFile, 'r', encoding='utf8') as f:
        for line in tqdm(f):
            splitLine = line.split()
            if not splitLine:
                # A blank line (e.g. trailing newline) has no word token.
                continue
            word = splitLine[0]
            embedding = np.array([float(val) for val in splitLine[1:]])
            model[word] = embedding
    print("Done.",len(model)," words loaded!")
    return model

In [6]:
def get_edges_transformers(text):
    """Build a similarity graph over the vocabulary words of one document.

    The text is split on single spaces. Tokens that occur in the global
    vocabulary ``words`` and that NLTK tags as JJ, JJR, JJS, NN, NNS, NNP or
    NNPS are kept. Each kept word is represented by the contextual embedding of
    its first occurrence in the text, and two kept words are linked when their
    cosine similarity is greater than the global ``cutoff``. Pairs whose
    similarity is exactly 1 are not linked.

    Reads the module-level globals ``embedding_name``, ``bert_embedding``,
    ``elmo``, ``words`` and ``cutoff``.

    Parameters
    ----------
    text : str
        One document.

    Returns
    -------
    dict
        Maps the index in ``words`` of every kept word to the list of indices
        (also into ``words``) of the words it is linked to. Empty when no word
        survives the vocabulary/POS filter.
    """
    sentence = text.split(" ")

    # Vocabulary words present in this document (set order is arbitrary).
    candidates = list(set(sentence).intersection(words))

    noun_adj_tags = {'JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS'}
    kept = [
        word
        for word, (_, tag) in zip(candidates, nltk.pos_tag(candidates))
        if tag in noun_adj_tags
    ]

    # Nothing to link: previously this fell through to cosine_similarity on an
    # empty array and raised, aborting the surrounding pool.map. Returning
    # early also skips the expensive embedding call.
    if not kept:
        return {}

    if embedding_name == 'bert':
        results = bert_embedding(sentence)
        embed_vecs = np.array([i[1][0] for i in results])
    else:
        # Index 2 of the ELMo output -- presumably the top layer; confirm
        # against the ElmoEmbedder documentation.
        embed_vecs = elmo.embed_sentence(sentence)[2]

    # A word that occurs several times uses the vector of its first occurrence.
    word_embeddings = np.array([embed_vecs[sentence.index(word)] for word in kept])

    word_similarity = cosine_similarity(word_embeddings)

    # Zero every pair whose similarity is exactly 1, in both directions.
    exact = word_similarity == 1
    word_similarity[exact | exact.T] = 0

    linked = word_similarity > cutoff
    np.fill_diagonal(linked, False)

    # Resolve each kept word's vocabulary index once instead of once per edge.
    vocab_index = {word: words.index(word) for word in kept}
    embeds = {vocab_index[word]: [] for word in kept}

    rows, cols = np.where(linked)
    for i, j in zip(rows, cols):
        embeds[vocab_index[kept[i]]].append(vocab_index[kept[j]])

    return embeds

### Config

In [7]:
# Name embedded in the filename of every artefact written under resources/.
dataset_name = "amazon_electronics"

# Vocabulary limits passed to CountVectorizer.
min_df = 5
max_df = .5
max_features = 50000
# Cosine-similarity thresholds iterated over when building the word graphs.
cutoffs = [0.3, 0.6]

# Number of worker processes for every multiprocessing.Pool in this notebook.
n_cores = 40
# Size of the final document sample; 3x this many rows are drawn before filtering.
n_docs = 100000

### Start

In [8]:
# Load every record of the gzipped review dump into one DataFrame
# (the path is relative to the notebook's working directory).
dataset_ = getDF('datasets/reviews_Electronics_5.json.gz')
dataset_.shape

1689188it [01:33, 18153.27it/s]


(1689188, 9)

In [9]:
# Draw 3x the target size, presumably as headroom for the length filter that
# is applied later. The fixed seed makes the draw repeatable across
# Restart & Run All; change it to obtain a different sample.
dataset = dataset_.sample(n_docs*3, random_state=0)

In [10]:
# Metadata columns that are not used by the rest of the notebook.
unused_columns = [
    'reviewerID',
    'asin',
    'reviewerName',
    'helpful',
    'summary',
    'unixReviewTime',
    'reviewTime',
]
dataset = dataset.drop(columns=unused_columns)

In [11]:
# Switch to the column names the rest of the notebook works with.
column_names = {'reviewText': 'text', 'overall': 'sentiment'}
dataset = dataset.rename(columns=column_names)

In [12]:
# Rows per chunk. Clamp to at least 1: with fewer rows than cores the quotient
# is 0, and range() rejects a step of 0.
n = max(1, dataset.shape[0] // n_cores)
list_df = [dataset[i:i+n] for i in range(0, dataset.shape[0], n)]

# The context manager tears the workers down even when process_df raises, so a
# failed run does not leave n_cores idle processes behind.
with multiprocessing.Pool(n_cores) as pool:
    processed_list_df = pool.map(process_df, list_df)

dataset = pd.concat(processed_list_df)
dataset.shape

(300000, 2)

In [15]:
# Keep documents with more than 5 and fewer than 200 space-separated tokens,
# then draw the final n_docs rows. The fixed seed makes the draw repeatable
# across Restart & Run All; change it to obtain a different sample.
has_usable_length = dataset.text.apply(lambda x: 5 < len(x.split(" ")) < 200)
dataset = (
    dataset[has_usable_length]
    .sample(n_docs, random_state=0)
    .reset_index(drop=True)
)

In [16]:
# Persist the sampled, cleaned documents next to the other artefacts.
dataset_pickle_path = "resources/" + "_".join([dataset_name, str(n_docs), "dataset"])
dataset.to_pickle(dataset_pickle_path)

In [17]:
# Word-level bag-of-words vectorizer; the vocabulary limits come from the
# config cell.
vectorizer_options = dict(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words="english",
    max_features=max_features,
    max_df=max_df,
    min_df=min_df,
)
vectorizer = CountVectorizer(**vectorizer_options)

In [18]:
# Dense document-term count matrix: one row per document, one column per
# vocabulary term.
wordOccurenceMatrix = vectorizer.fit_transform(dataset.text.tolist()).toarray()

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# it exists. `words` must stay a plain list because get_edges_transformers
# calls words.index().
if hasattr(vectorizer, "get_feature_names_out"):
    words = vectorizer.get_feature_names_out().tolist()
else:
    words = vectorizer.get_feature_names()

In [19]:
# Vocabulary size after the min_df / max_df / max_features limits.
len(words)

11387

# Embeddings

### GloVe

In [None]:
# Length of the zero vector used for words missing from the GloVe file;
# presumably matches the 300-d vectors in the file loaded below.
glove_embedding_dim = 300
# Dict mapping each word in the file to its embedding vector.
glove_embeddings_index = loadGloveModel("nongit_resources/glove.6B.300d.txt")

In [None]:
# Number of vocabulary words that will be looked up in the embedding index.
len(words)

In [None]:
# One embedding row per vocabulary word, in vocabulary order; words missing
# from the GloVe index get an all-zero row.
glove_missing_vector = np.array([0]*glove_embedding_dim)

glove_word_embeddings = np.array([
    glove_embeddings_index.get(word, glove_missing_vector).tolist()
    for word in tqdm(words)
])

In [None]:
# [name, embedding matrix] pair consumed by the grid loop further down.
g = ['glove', glove_word_embeddings]

### fastText

In [None]:
%%time
# Dimensionality assumed for the fastText vectors loaded below -- confirm
# against the .vec file header.
fasttext_embedding_dim = 300
# Keyed vectors loaded from a word2vec-format text file.
fasttext_embeddings_index = gensim.models.KeyedVectors.load_word2vec_format("nongit_resources/wiki-news-300d-1M.vec")

In [None]:
# One embedding row per vocabulary word, in vocabulary order; words missing
# from the fastText index get an all-zero row.
fasttext_word_embeddings = []

for word in tqdm(words):
    try:
        emb = fasttext_embeddings_index[word]
    except KeyError:
        # Out-of-vocabulary word. The zero vector is sized with the fastText
        # dimension (it used to borrow glove_embedding_dim), and only the
        # missing-key error is handled so other failures are no longer hidden.
        emb = np.array([0]*fasttext_embedding_dim)
    fasttext_word_embeddings.append(emb.tolist())

fasttext_word_embeddings = np.array(fasttext_word_embeddings)

In [None]:
# [name, embedding matrix] pair consumed by the grid loop further down.
f = ['fasttext', fasttext_word_embeddings]

#### Grid

In [None]:
# --- Work that depends on neither the embedding nor the cutoff: do it once ---

# Presence/absence version of the document-term matrix.
wordOccuranceMatrixBinary = wordOccurenceMatrix.copy()
wordOccuranceMatrixBinary[wordOccuranceMatrixBinary > 1] = 1

### POS
# 1 for vocabulary words tagged as adjective or noun, 0 otherwise.
noun_adj_tags = {'JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS'}
pp = np.array([int(tag in noun_adj_tags) for _, tag in nltk.pos_tag(words)])

# Blank out the columns of every word that is not an adjective or noun.
wordOccuranceMatrixBinary[:, np.where(pp!=1)[0]] = 0

for embedding_name, word_embeddings in [g, f]:
    # The raw similarity matrix depends only on the embedding, not the cutoff.
    raw_similarity = cosine_similarity(word_embeddings)

    # Zero every pair whose similarity is exactly 1, in both directions.
    exact = raw_similarity == 1
    raw_similarity[exact | exact.T] = 0

    for cutoff in cutoffs:
        print(embedding_name, cutoff)

        # get_edges reads this global. It must be assigned before the pool is
        # created so the workers see the matrix for the current cutoff (this
        # relies on workers inheriting module globals at pool creation, as with
        # the fork start method -- confirm on this platform).
        word_similarity = (raw_similarity > cutoff).astype(int)
        np.fill_diagonal(word_similarity, 0)

        # Context managers release the workers and the file handle even when
        # an iteration fails.
        with multiprocessing.Pool(n_cores) as pool:
            similar_words = pool.map(get_edges, wordOccuranceMatrixBinary)

        with open("resources/"+ dataset_name + "_" + str(n_docs) +"_" + embedding_name + "_" + str(cutoff) + ".pickle","wb") as pickle_out:
            pickle.dump(similar_words, pickle_out)

## BERT & ELMo

In [20]:
# Contextual embedders built with their default arguments;
# get_edges_transformers reads both as module-level globals.
bert_embedding = BertEmbedding()
elmo = ElmoEmbedder()

In [None]:
%%time
# The document list is the same for every (embedding, cutoff) combination.
documents = dataset.text.tolist()

for embedding_name in ['bert', 'elmo']:
    for cutoff in cutoffs:
        print(embedding_name, cutoff)

        # get_edges_transformers reads the globals embedding_name and cutoff,
        # so the pool is created only after both loop variables are set.
        # Context managers release the workers and the file handle even when
        # an iteration fails.
        with multiprocessing.Pool(n_cores) as pool:
            similar_words = pool.map(get_edges_transformers, documents)

        with open("resources/"+ dataset_name + "_" + str(n_docs) + "_" + embedding_name + "_" + str(cutoff) + ".pickle","wb") as pickle_out:
            pickle.dump(similar_words, pickle_out)

In [27]:
# Manual setting of the global read by get_edges_transformers (used by the
# 100-document run below).
embedding_name = 'elmo'

In [28]:
# Manual setting of the similarity threshold read by get_edges_transformers.
cutoff = 0.6

In [None]:
%%time
# Trial run on the first 100 documents with the embedding_name / cutoff set
# above. The context manager releases the workers even if a document fails.
with multiprocessing.Pool(n_cores) as pool:
    similar_words = pool.map(get_edges_transformers, dataset.text.tolist()[:100])

### Appendix

In [None]:
# wordOccuranceMatrixBinary[0].sum()

# np.sum(wordOccuranceMatrixBinary)

# Counter(np.array([i[1] for i in nltk.pos_tag(words)]))

# pp.sum()

# np.where(pp!=1)[0].shape

In [None]:
#     pd = pd.apply(lambda x: convert_numbers(x))

In [None]:
# def process_df(df):
#     df['text'] = preprocess(df['reviewText'])
    
# #     pool = multiprocessing.Pool(n_cores)
# #     df['cleaned'] = pool.map(process_l, df['text'].tolist())
# #     pool.close()
    
# #     df['text'] = df['cleaned'].apply(lambda x: " ".join(x))
#     return df

In [None]:
# p = [item for sublist in dataset['cleaned'].tolist() for item in sublist]

In [None]:
# sorted(Counter(p))

In [None]:
# def process_l(s):
#     return [i.lemma_ for i in sp(s) if i.lemma_ not in '-PRON-']

In [None]:
# l = dataset['text'].tolist()

In [None]:
# pool = multiprocessing.Pool(n_cores)
# processed_l = pool.map(process_l, l)
# pool.close()

In [None]:
# joblib.dump(sampler, "resources/sampler_20iter_0.5_1")

In [None]:
# pickle_out = open("resources/amazon_muiscal_glove_0.4.pickle","wb")
# pickle.dump(similar_words, pickle_out)
# pickle_out.close()