#### Imports

In [1]:
from scipy import spatial, sparse
from scipy.stats import chi2
from collections import Counter
from num2words import num2words
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.externals import joblib 
from nltk.tokenize import word_tokenize
from sklearn.model_selection import KFold
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import os
import imp
import gzip
import copy
import nltk
import pickle
import scipy
import string
import gensim
import operator
import datetime

import numpy as np
import pandas as pd
import LDA_ILJST as lda
import matplotlib.pyplot as plt

In [2]:
import utils as my_utils

### Required Methods

In [55]:
# Tag identifying this embedding/dataset variant; it is spliced into the
# file name of the pickled edge list written near the end of the notebook.
dataset_name = "stackoverflow_fasttext_trained_with_stopwords"

In [4]:
# Load the pre-processed answers DataFrame from a pickle file.
# NOTE(review): unpickling executes arbitrary code, so this file must come
# from a trusted source.
dataset = pd.read_pickle("resources/data_stackoverflow_5kanswers_pd")

In [5]:
# Preview the first rows of the loaded answers.
dataset.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,ParentId,Score,Body,body_text,cleaned_body,clean,sent_score,sent_score_quant
13,783,189.0,2008-08-03T18:40:09Z,773,52,<p>Can you show us your code?</p>\n\n<p>The ex...,Can you show us your code? \n The example o...,"[show, code, example, python, doc, quite, stra...",show code example python doc quite straightfor...,0.030571,5
71,7286,207.0,2008-08-10T18:45:32Z,773,394,"<p>As Sebastjan said, <strong>you first have t...","As Sebastjan said, you first have to sort ...","[sebastjan, said, first, sort, data, important...",sebastjan said first sort data important part ...,0.024786,4
350,37252,3926.0,2008-08-31T23:27:16Z,773,24,<p>A neato trick with groupby is to run length...,A neato trick with groupby is to run length...,"[neato, trick, groupby, run, length, encoding,...",neato trick groupby run length encoding one li...,-0.039139,1
33091,1573195,83284.0,2009-10-15T15:41:51Z,773,5,"<p>@CaptSolo, I tried your example, but it did...","@CaptSolo, I tried your example, but it did...","[captsolo, tried, example, work, output, see, ...",captsolo tried example work output see two two...,0.039564,5
309519,14443477,650654.0,2013-01-21T16:54:08Z,773,15,"<p>Another example:</p>\n\n<pre><code>for key,...",Another example: \n \n results in \n \n N...,"[another, example, result, note, igroup, itera...",another example result note igroup iterator su...,0.020411,4


In [6]:
# Relabel the two text columns with integer names: the rest of the notebook
# addresses them as dataset[8] (token list) and dataset[9] (joined string).
dataset = dataset.rename(columns={'cleaned_body': 8, 'clean': 9})

In [7]:
# Check the rename: the two text columns should now be labelled 8 and 9.
dataset.head(2)

Unnamed: 0,Id,OwnerUserId,CreationDate,ParentId,Score,Body,body_text,8,9,sent_score,sent_score_quant
13,783,189.0,2008-08-03T18:40:09Z,773,52,<p>Can you show us your code?</p>\n\n<p>The ex...,Can you show us your code? \n The example o...,"[show, code, example, python, doc, quite, stra...",show code example python doc quite straightfor...,0.030571,5
71,7286,207.0,2008-08-10T18:45:32Z,773,394,"<p>As Sebastjan said, <strong>you first have t...","As Sebastjan said, you first have to sort ...","[sebastjan, said, first, sort, data, important...",sebastjan said first sort data important part ...,0.024786,4


In [8]:
# Vectorise the cleaned answer strings (column 9) with the project helper.
# NOTE(review): processReviews lives in utils and is not visible here. Later
# cells index `vocabulary` by word (vocabulary[word]) and iterate over `words`,
# so presumably `vocabulary` maps word -> column index and `words` is the
# collection of vocabulary terms -- confirm against utils.
count_matrix_, tfidf_matrix_, vocabulary, words = my_utils.processReviews(dataset[9].values)

In [9]:
# Number of entries in the vocabulary returned by processReviews.
len(vocabulary)

2000

In [10]:
# Restrict every document to vocabulary words.
#   dataset[10]: token list of column 8 with out-of-vocabulary tokens removed
#                (order and duplicates preserved)
#   dataset[11]: the same tokens joined into one space-separated string
# Membership is tested against a set so each lookup is O(1) instead of a scan
# over `words` for every token of every document.
vocab_words = set(words)
dataset[10] = dataset[8].apply(lambda tokens: [tok for tok in tokens if tok in vocab_words])
dataset[11] = dataset[10].apply(lambda tokens: " ".join(tokens))

In [11]:
# Check the new columns: 10 (vocabulary-filtered tokens) and 11 (joined string).
dataset.head(2)

Unnamed: 0,Id,OwnerUserId,CreationDate,ParentId,Score,Body,body_text,8,9,sent_score,sent_score_quant,10,11
13,783,189.0,2008-08-03T18:40:09Z,773,52,<p>Can you show us your code?</p>\n\n<p>The ex...,Can you show us your code? \n The example o...,"[show, code, example, python, doc, quite, stra...",show code example python doc quite straightfor...,0.030571,5,"[show, code, example, python, doc, quite, stra...",show code example python doc quite straightfor...
71,7286,207.0,2008-08-10T18:45:32Z,773,394,"<p>As Sebastjan said, <strong>you first have t...","As Sebastjan said, you first have to sort ...","[sebastjan, said, first, sort, data, important...",sebastjan said first sort data important part ...,0.024786,4,"[said, first, sort, data, important, part, get...",said first sort data important part get exampl...


In [12]:
%%time
# Load the pre-trained word vectors from the .vec file into a gensim
# KeyedVectors object. This is slow: the recorded run took about five minutes.
embeddings_index = gensim.models.KeyedVectors.load_word2vec_format("nongit_resources/wiki_news_stackoverflow_with_stopwords.vec")

CPU times: user 5min, sys: 3.24 s, total: 5min 3s
Wall time: 5min 3s


In [13]:
# Copy the vectors of the vocabulary words out of the full embedding model so
# the model itself is not needed afterwards.
words_embeddings = {}
for word in words:
    try:
        words_embeddings[word] = embeddings_index[word]
    except KeyError:
        # The word has no vector in the embedding model; leave it out.
        # Only KeyError is caught so that real failures (and kernel
        # interrupts) are no longer silently swallowed.
        continue

In [14]:
# Drop the reference to the full embedding model; the following cells only
# read `words_embeddings`. Presumably done to let the large model be
# garbage-collected.
embeddings_index = None

In [15]:
# Number of vocabulary words for which an embedding vector was found.
len(words_embeddings)

1999

In [16]:
%%time
# Pre-compute the pairwise similarity of all vocabulary words.
# edge_embeddings[(w1, w2)] holds the value returned by my_utils.get_cosine
# for the two word vectors, stored under both key orders so later lookups do
# not need to normalise the pair. Pairs where either word has no embedding
# get a similarity of 0.0.
edge_embeddings = {}
for idx, i in enumerate(words):
    my_utils.print_if_mod(idx, 500)
    vec_i = words_embeddings.get(i)
    for j in words:
        if (i, j) in edge_embeddings:
            # Already filled in when the mirrored pair (j, i) was processed.
            continue
        vec_j = words_embeddings.get(j)
        if vec_i is None or vec_j is None:
            # Explicit handling of missing vectors replaces the former bare
            # `except:`, which also swallowed kernel interrupts and any
            # genuine error raised inside get_cosine.
            sim = 0.0
        else:
            sim = my_utils.get_cosine(vec_i, vec_j)
        edge_embeddings[(i, j)] = sim
        edge_embeddings[(j, i)] = sim

0
500
1000
1500
CPU times: user 1min 29s, sys: 530 ms, total: 1min 29s
Wall time: 1min 29s


In [57]:
%%time
# Build, for every document, the list of word pairs whose pre-computed
# similarity is strictly above `edges_threshold`.
#   docs_edges[d]     -> [(vocabulary[w1], vocabulary[w2]), ...]
#   docs_edges_all[d] -> [(w1, w2, similarity), ...] in the same order,
#                        kept for inspection
# Each unordered pair appears at most once per document, in the orientation
# in which it was first encountered.
edges_threshold = 0.73
docs_edges = []
docs_edges_all = []
for idx, doc in enumerate(dataset[10].values):
    my_utils.print_if_mod(idx, 500)
    edges, edges_all = [], []
    # Holds both orientations of every accepted edge so the duplicate check
    # is a single O(1) set lookup instead of two scans of the `edges` list.
    seen_edges = set()
    for i in doc:
        for j in doc:
            if i == j:
                continue
            sim = edge_embeddings[(i, j)]
            if sim > edges_threshold:
                edge = (vocabulary[i], vocabulary[j])
                if edge not in seen_edges:
                    seen_edges.add(edge)
                    seen_edges.add((edge[1], edge[0]))
                    edges.append(edge)
                    edges_all.append((i, j, sim))
    docs_edges.append(edges)
    docs_edges_all.append(edges_all)

0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
CPU times: user 23.8 s, sys: 661 ms, total: 24.5 s
Wall time: 24.5 s


In [58]:
# Inspect the vocabulary-filtered tokens of the second document.
dataset[10].values[1]

[u'said',
 u'first',
 u'sort',
 u'data',
 u'important',
 u'part',
 u'get',
 u'example',
 u'current',
 u'key',
 u'iterator',
 u'use',
 u'iterate',
 u'group',
 u'defined',
 u'key',
 u'word',
 u'iterator',
 u'return',
 u'iterators',
 u'example',
 u'using',
 u'clearer',
 u'variable',
 u'name',
 u'give',
 u'output',
 u'br',
 u'duck',
 u'speed',
 u'br',
 u'example',
 u'list',
 u'tuples',
 u'first',
 u'item',
 u'tuple',
 u'group',
 u'second',
 u'item',
 u'function',
 u'take',
 u'two',
 u'argument',
 u'data',
 u'group',
 u'function',
 u'group',
 u'tell',
 u'use',
 u'first',
 u'item',
 u'tuple',
 u'key',
 u'statement',
 u'return',
 u'three',
 u'key',
 u'group',
 u'iterator',
 u'pair',
 u'unique',
 u'key',
 u'use',
 u'returned',
 u'iterator',
 u'iterate',
 u'individual',
 u'item',
 u'group',
 u'slightly',
 u'different',
 u'example',
 u'data',
 u'using',
 u'list',
 u'comprehension',
 u'give',
 u'output',
 u'duck',
 u'br',
 u'br',
 u'speed']

In [59]:
# Inspect the (word, word, similarity) edges kept for the second document.
docs_edges_all[1]

[(u'first', u'part', 0.7327466607093811),
 (u'first', u'get', 0.739512026309967),
 (u'first', u'use', 0.743014931678772),
 (u'first', u'return', 0.7347083687782288),
 (u'first', u'using', 0.7356998324394226),
 (u'first', u'second', 0.8564392328262329),
 (u'first', u'two', 0.7779945135116577),
 (u'first', u'three', 0.7909581661224365),
 (u'important', u'part', 0.7379785180091858),
 (u'get', u'example', 0.7321568727493286),
 (u'get', u'use', 0.7653287053108215),
 (u'get', u'return', 0.7760693430900574),
 (u'get', u'using', 0.7799131870269775),
 (u'get', u'give', 0.8506007194519043),
 (u'get', u'output', 0.7354133725166321),
 (u'get', u'returned', 0.7635136246681213),
 (u'example', u'use', 0.7419353723526001),
 (u'example', u'using', 0.7359979748725891),
 (u'iterator', u'iterate', 0.7447704076766968),
 (u'iterator', u'iterators', 0.8952702879905701),
 (u'use', u'using', 0.8878074884414673),
 (u'use', u'give', 0.7515801191329956),
 (u'iterate', u'list', 0.7436172962188721),
 (u'return', u'

In [60]:
# Median number of edges per document.
np.median(list(map(len, docs_edges)))

18.0

In [61]:
# Mean number of edges per document.
np.mean(list(map(len, docs_edges)))

42.40994408945687

In [62]:
# Shape of the index array of documents that ended up with no edges at all,
# i.e. a 1-tuple holding how many such documents there are.
np.flatnonzero(np.array(list(map(len, docs_edges))) == 0).shape

(65,)

In [63]:
# Distribution of edge counts: {number of edges: number of documents}.
Counter(len(doc_edges) for doc_edges in docs_edges)

Counter({0: 65,
         1: 121,
         2: 119,
         3: 159,
         4: 162,
         5: 158,
         6: 169,
         7: 126,
         8: 154,
         9: 108,
         10: 202,
         11: 173,
         12: 130,
         13: 147,
         14: 135,
         15: 133,
         16: 95,
         17: 107,
         18: 112,
         19: 90,
         20: 73,
         21: 75,
         22: 86,
         23: 57,
         24: 67,
         25: 65,
         26: 50,
         27: 66,
         28: 58,
         29: 55,
         30: 46,
         31: 40,
         32: 55,
         33: 40,
         34: 27,
         35: 32,
         36: 32,
         37: 32,
         38: 36,
         39: 33,
         40: 29,
         41: 33,
         42: 27,
         43: 31,
         44: 19,
         45: 24,
         46: 25,
         47: 24,
         48: 20,
         49: 17,
         50: 21,
         51: 16,
         52: 23,
         53: 15,
         54: 25,
         55: 15,
         56: 16,
         57: 13,
       

In [64]:
# Show the tag that will be embedded in the output file name below.
dataset_name

'stackoverflow_fasttext_trained_with_stopwords'

In [65]:
# Persist the per-document edge lists. The context manager guarantees the
# file is flushed and closed even if pickle.dump raises part-way through.
with open("resources/docs_edges_" + dataset_name + "_5kanswers.pickle", "wb") as pickle_out:
    pickle.dump(docs_edges, pickle_out)

# Appendix

In [None]:
# def loadGloveModel(gloveFile):
#     print("Loading Glove Model")
#     f = open(gloveFile,'r')
#     model = {}
#     for line in f:
#         splitLine = line.split()
#         word = splitLine[0]
#         embedding = np.array([float(val) for val in splitLine[1:]])
#         model[word] = embedding
#     print("Done.",len(model)," words loaded!")
#     return model

In [None]:
# %%time
# embeddings_index = loadGloveModel("nongit_resources/glove.42B.300d.txt")

In [None]:
# %%time
# embeddings_index = gensim.models.KeyedVectors.load_word2vec_format("nongit_resources/electronics.vec")

In [None]:
# %%time
# edges_threshold = 0.8
# docs_edges, ignored, taken, count = [], [], [], 0
# for idx, doc in enumerate(dataset[8].values):
#     edges = []
#     print(idx)
#     for i in doc:
#         for j in doc:
#             if i != j:
#                 try:
#                     a = embeddings_index[i]
#                     b = embeddings_index[j]
#                     if get_cosine(a, b) > edges_threshold and (vocabulary[i], vocabulary[j]) not in edges and (vocabulary[j], vocabulary[i]) not in edges:
#                         edges.append((vocabulary[i], vocabulary[j]))
#                 except:
#                     try:
#                         embeddings_index[i]
#                         taken.append(i)
#                     except:
#                         ignored.append(i)
#                     try:
#                         embeddings_index[j]
#                     except:
#                         ignored.append(j)
#                         taken.append(j)
#                     pass
#     docs_edges.append(edges)

In [None]:
# pickle_out = open("resources/docs_edges_" + dataset_name + "_5k_fasttext_trained.pickle","wb")
# pickle.dump(docs_edges, pickle_out)
# pickle_out.close()

In [None]:
## Sentence wise
# dataset = dataset[['asin', 'helpful', 'overall', 'reviewText']]
# dataset['n_words'] = dataset['reviewText'].apply(lambda x: len(w_tokenizer.tokenize(x)))
# dataset['sentences'] = dataset['reviewText'].apply(lambda x: [i.strip() for i in x.split(".")])
# dataset['sentence_word_density'] = dataset['reviewText'].apply(lambda x: len(w_tokenizer.tokenize(x))/ len(x.split(".")))
# dataset.to_csv("reviews_Musical_Instruments_5.csv")

In [None]:
# edge_dict, ignored, taken, count = {}, [], [], 0
# for idxi, i in enumerate(vocabulary.keys()):
#     print(idxi)
#     for idxj, j in enumerate(vocabulary.keys()):
#         if i != j:
#             try:
#                 a = embeddings_index[i]
#                 b = embeddings_index[j]
#                 if get_cosine(a, b) > edges_threshold:
#                     try:
#                         edge_dict[vocabulary[i]] += [vocabulary[j]]
#                         edge_dict[vocabulary[j]] += [vocabulary[i]]
#                     except:
#                         edge_dict[vocabulary[i]] = [vocabulary[j]]
#                         edge_dict[vocabulary[j]] = [vocabulary[i]]
#             except:
#                 try:
#                     embeddings_index[i]
#                     taken.append(i)
#                 except:
#                     ignored.append(i)
#                 try:
#                     embeddings_index[j]
#                 except:
#                     ignored.append(j)
#                     taken.append(j)
#                 pass

In [None]:
# Document frequency: df[token] = number of documents in column 8 that
# contain the token at least once. Each document contributes its distinct
# tokens once, which replaces the former per-token list of document indices
# (and its bare `except:`).
doc_freq = Counter()
for idx, doc in enumerate(dataset[8].values):
    # Progress every 500 documents, as in the other long-running cells,
    # instead of printing one line per document.
    my_utils.print_if_mod(idx, 500)
    doc_freq.update(set(doc))
df = dict(doc_freq)

# One row per document and one column per vocabulary entry; a cell holds the
# document frequency of the word if the word occurs in that document and is
# in the vocabulary, else 0.
n_terms = len(vocabulary)
df_vector = []
for doc in dataset[8].values:
    row = [0] * n_terms
    for token in doc:
        # Direct mapping membership avoids materialising and scanning
        # `vocabulary.keys()` for every token.
        if token in vocabulary:
            row[vocabulary[token]] = df[token]
    df_vector.append(row)

# Store the matrix in compressed sparse row form.
csr = sparse.csr_matrix(np.array(df_vector))
scipy.sparse.save_npz('resources/df_stackoverflow_5kanswers.npz', csr)

In [None]:
# Read the saved document-frequency matrix back and show it as a dense array.
scipy.sparse.load_npz('resources/df_stackoverflow_5kanswers.npz').toarray()

In [None]:
# dataset = parse("nongit_resources/reviews_Electronics_5.json.gz")
# dataset = pd.DataFrame(list(dataset))
# dataset = dataset.head(N_docs)
# dataset.to_pickle("resources/reviews_Electronics_5")