In [4]:
from gensim.models.keyedvectors import KeyedVectors
# should load in 2 min
w2v = KeyedVectors.load_word2vec_format('../../Tools/word2vec/GoogleNews-vectors-negative300.bin', binary=True)

In [6]:
import sys
sys.path.append("../../Code")
import opinion_lexicon as op

In [57]:
w2v.most_similar(positive=['wholeheartedly', 'negative'], topn = 5)

[('whole_heartedly', 0.6429222226142883),
 ('positive', 0.6418464779853821),
 ('postive', 0.5671218037605286),
 ('strongly', 0.5196532607078552),
 ('negativity', 0.5102083683013916)]

In [67]:
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
import pandas as pd
import os
import sys
sys.path.append("../../Code")
import utils
stopWords = set(stopwords.words('english'))

In [65]:
reviews = pd.read_csv(os.path.join('..','2','len_lang_restrict_reviews.csv'), index_col=0)

In [96]:
def pipeline(reviews, stopwords=[], vocab=None):
    reviews_encoded = {}
    vocab_counts = {}
    vocab_doc_count = {}

    is_ext_vocab = True
    if vocab is None:
        is_ext_vocab = False
        vocab = {'<OOV>': 0}
    for key in reviews.keys():
        r = []
        tokenized_r = utils.tokenize(reviews[key].lower())
        for token in tokenized_r:
            #token = ps.stem(token)
            if token in stopwords:
                continue
            if not is_ext_vocab and token not in vocab:
                vocab[token] = len(vocab)
                vocab_counts[token] = 1
            if token not in vocab:
                token_id = vocab['<OOV>']
                vocab_counts['<OOV>'] += 1
            else:
                token_id = vocab[token]
                vocab_counts[token] += 1
            r.append(token_id)
        reviews_encoded[key] = r
    idf_dict = []
    for key in vocab.keys():
        if key in vocab_doc_count.keys():
            idf_dict.append(np.log10(N/vocab_doc_count[key]))
        else:
            idf_dict.append(0)
    return reviews_encoded, vocab_counts, vocab, idf_dict

In [97]:
r, vc, v, idf = pipeline(reviews['comments'], stopWords)

In [98]:
def create_vocab_count_table(counts):
    return pd.DataFrame({'counts':[counts[k] for k in counts.keys()], 'word':[k for k in counts.keys()]})

In [99]:
vocab_table = create_vocab_count_table(vc)

In [115]:
vocab_table.sort_values(by='counts', ascending=False).tail(5)

Unnamed: 0,counts,word
34859,2,restorant
34858,2,studiu
34857,2,wellwas
34856,2,maintness
56858,2,quelques


In [102]:
vocab_set = set(v.keys())

In [103]:
len(op.negative_words_set)

4783

In [104]:
len(op.positive_words_set)

2007

In [105]:
op_words = op.negative_words_set.union(op.positive_words_set)

In [106]:
op_int_words =op.negative_words_set.intersection(op.positive_words_set)

In [107]:
op_int_words

{'envious', 'enviously', 'enviousness'}

In [108]:
op_words = op_words - op_int_words

In [109]:
len(op_words)

6784

In [110]:
print("How many of op words are in vocab:", format(len(vocab_set.intersection(op_words))*100/len(op_words), ".2f"), "%")

How many of op words are in vocab: 51.02 %


In [111]:
print("How many of vocab words are in opwordset:", format(len(vocab_set.intersection(op_words))*100/len(vocab_set), ".2f"), "%")

How many of vocab words are in opwordset: 6.09 %


In [119]:
def extract_known_words(words, w2v):
    known_words = []
    unknown_words = []

    for word in words:
        try:
            _ = w2v[word]
            known_words.append(word)
        except:
            unknown_words.append(word)
    return known_words, unknown_words

In [120]:
known_pos, unknown_pos = extract_known_words(op.positive_words_set, w2v)

In [124]:
uk_new = set(x.replace('-', "_") for x in unknown_pos)

In [126]:
uk_k , uk_uk = extract_known_words(uk_new, w2v)

In [128]:
uk_k

['user_replaceable',
 'light_hearted',
 'user_friendly',
 'fast_paced',
 'jaw_dropping',
 'eye_catching',
 'fastest_growing',
 'pre_eminent',
 'problem_solver',
 'self_sufficiency',
 'razor_sharp',
 'well_wishers',
 'law_abiding',
 'well_intentioned']

In [137]:
new_op_pos = set()
for w in known_pos+uk_k:
    new_op_pos = new_op_pos.union(set(s[0] for s in w2v.most_similar(positive=[w, 'excellent'], topn = 5) ))

In [145]:
new_op_pos = set(x.replace('_', "-") for x in new_op_pos)

In [146]:
len(new_op_pos.union(op.positive_words_set))

2680

In [148]:
known_neg, unknown_neg = extract_known_words(op.negative_words_set, w2v)

In [149]:
len(unknown_neg)

339

In [150]:
uk_neg_new = set(x.replace('-', "_") for x in unknown_neg)

In [151]:
uk_neg_k , uk_neg_uk = extract_known_words(uk_neg_new, w2v)

In [152]:
len(uk_neg_uk)

317

In [153]:
new_op_neg = set()
for w in known_neg+uk_neg_k:
    new_op_neg = new_op_neg.union(set(s[0] for s in w2v.most_similar(positive=[w, 'bad'], topn = 5) ))

In [154]:
new_op_neg = set(x.replace('_', "-") for x in new_op_neg)

In [155]:
len(new_op_neg.union(op.negative_words_set))

7668

In [156]:
new_op_neg

{'Overweight',
 'verbal-altercation',
 'precedent-Kasdan',
 'PG.L-Top-price',
 'slavery',
 'outlaws',
 'gloomier',
 'shame',
 'bumpy',
 'misfits',
 'prolonged-drought',
 'misjudge',
 'faux-pas',
 'Teodor-Manolache',
 'mistreat',
 'surprising',
 'denunciation',
 'prudish',
 'Unhealthy',
 'fatcats',
 'jokingly',
 'shortage',
 'protest',
 'spook',
 'miscommunication',
 'sharp',
 'disabling',
 'wrongdoing',
 'dire',
 'fear-mongering',
 'pickets',
 'discriminates',
 'oppress',
 'Corruption',
 'undermines',
 'unattractive',
 'interruption',
 'headcase',
 'unnecessary',
 'whiney',
 'moan',
 'machete',
 'neener-neener-neener',
 'sarcastically',
 'overturning',
 'fickle-mistress',
 'turny',
 'defective',
 'explosive-device',
 'Corrupt',
 'tampered',
 'fad',
 'underdo',
 'sensationalist',
 'villains',
 'stereotyping',
 'loser',
 'surveyed-disapproved',
 'unilateralist',
 'anomaly',
 'frauds',
 'tepid',
 'tragic',
 'hamstrung',
 'picket-lines',
 'danger',
 'worrying',
 'destroyers',
 'hampered',


In [157]:
new_op_pos = (new_op_pos - op.negative_words_set).union(op.positive_words_set)

In [158]:
new_op_neg = (new_op_neg - op.positive_words_set).union(op.negative_words_set)

In [160]:
new_op_pos.intersection(new_op_neg)

{'adequately',
 'beatable',
 'crucial',
 'cunning',
 'domination',
 'envious',
 'enviously',
 'enviousness',
 'heavier',
 'panache',
 'pretty-darn',
 'unconcerned',
 'unimpressed'}

In [163]:
new_op_pos = new_op_pos - set(['crucial', 'domination','unimpressed', 'unconcerned', 'heavier'])

In [164]:
new_op_neg = new_op_neg - set(['adequately', 'beatable','cunning', 'envious', 'enviously', 'enviousness', 'panache','pretty-darn'])

In [165]:
new_op_words = new_op_pos.union(new_op_neg)

In [166]:
print("How many of op words are in vocab:", format(len(vocab_set.intersection(new_op_words))*100/len(new_op_words), ".2f"), "%")

How many of op words are in vocab: 41.97 %


In [167]:
print("How many of vocab words are in opwordset:", format(len(vocab_set.intersection(new_op_words))*100/len(vocab_set), ".2f"), "%")

How many of vocab words are in opwordset: 7.57 %


In [173]:
with open('new-positive-words.txt','w') as file:
    for word in new_op_pos:
        file.write(word+"\n")
    file.close()
    

In [175]:
with open('new-negative-words.txt','w') as file:
    for word in new_op_neg:
        file.write(word+"\n")
    file.close()
    