In [36]:
import numpy as np
import pandas as pd

### Loading seed dicts

In [37]:
# Read the first 200 rows of the positive seed lexicon (tab-separated fields).
# The encoding is pinned to UTF-8 so the Cyrillic text decodes the same way on
# every platform (the result files of this notebook are written as UTF-8 too;
# assumes the seed file is UTF-8 as well - TODO confirm).
# The trailing newline is stripped so it cannot end up inside the last field.
with open('positive_seed_BG_ALL_CORR.txt', encoding='utf-8') as f:
    pos_bg = [line.rstrip('\n').split('\t') for line in f][:200]

In [38]:
# Read the first 200 rows of the negative seed lexicon (tab-separated fields).
# The encoding is pinned to UTF-8 so the Cyrillic text decodes the same way on
# every platform (the result files of this notebook are written as UTF-8 too;
# assumes the seed file is UTF-8 as well - TODO confirm).
# The trailing newline is stripped so it cannot end up inside the last field.
with open('negative_seed_BG_ALL_CORR.txt', encoding='utf-8') as f:
    neg_bg = [line.rstrip('\n').split('\t') for line in f][:200]

In [39]:
# First field of every seed row is the word, second field is its POS tag.
words_pos = [fields[0] for fields in pos_bg]
postags_pos = [fields[1] for fields in pos_bg]

# Seed keys in "word-POS" form; these are looked up in the GloVe dict later.
pos_list = [f"{seed_word}-{seed_tag}" for seed_word, seed_tag in zip(words_pos, postags_pos)]

In [40]:
# First field of every seed row is the word, second field is its POS tag.
words_neg = [fields[0] for fields in neg_bg]
postags_neg = [fields[1] for fields in neg_bg]

# Seed keys in "word-POS" form; these are looked up in the GloVe dict later.
neg_list = [f"{seed_word}-{seed_tag}" for seed_word, seed_tag in zip(words_neg, postags_neg)]

### Loading GloVe embeddings

In [41]:
# Read the GloVe vector file. Each element of `glove` is a one-item list
# holding the raw text of one line (newline removed); that shape is what the
# dict-building cell expects.
# rstrip is used instead of split('\n')[:-1] so that a final line without a
# trailing newline is kept instead of becoming an empty list.
# The encoding is explicit so the Cyrillic tokens do not depend on the
# platform default (assumes the file is UTF-8 - TODO confirm).
with open('glove/vectors_bg.txt', 'r', encoding='utf-8') as f:
    glove = [[line.rstrip('\n')] for line in f]

In [42]:
# Map every token (first space-separated field of a line) to its embedding,
# i.e. the remaining fields parsed as floats.
glovedict = {
    fields[0]: [float(component) for component in fields[1:]]
    for fields in (entry[0].split(" ") for entry in glove)
}

#### Frequency dictionary for word embeddings

In [43]:
# Read the GloVe vocabulary file. Each element of `vocab` is a one-item list
# holding the raw text of one line (newline removed); that shape is what the
# next cell expects.
# rstrip is used instead of split('\n')[:-1] so that a final line without a
# trailing newline is kept instead of becoming an empty list.
# The encoding is explicit so the Cyrillic tokens do not depend on the
# platform default (assumes the file is UTF-8 - TODO confirm).
with open('glove/vocab_bg.txt', 'r', encoding='utf-8') as f:
    vocab = [[line.rstrip('\n')] for line in f]

In [44]:
# Split every vocabulary line once into its space-separated fields:
# field 0 is the token, field 1 its frequency (kept as a string here).
vocab_fields = [entry[0].split(" ") for entry in vocab]
vocab_words = [fields[0] for fields in vocab_fields]
vocab_freqs = [fields[1] for fields in vocab_fields]

# token -> frequency string
vocab_dict = dict(zip(vocab_words, vocab_freqs))

#### Check if seed words are in vocab - manual correction

In [45]:
# Report every positive seed key that has no GloVe embedding.
# Membership is tested on the dict itself (hash lookup) rather than on a
# freshly built list of all keys for every seed.
for word in pos_list:
    if word not in glovedict:
        print(word, "MISSING")

In [46]:
# Report every negative seed key that has no GloVe embedding.
# Membership is tested on the dict itself (hash lookup) rather than on a
# freshly built list of all keys for every seed.
for word in neg_list:
    if word not in glovedict:
        print(word, "MISSING")

#### Create embedding arrays for positive and negative seed lists 

`.pop()` also removes the seed words from the dict, so that they are not scored against themselves later (which would produce perfect matches).

In [47]:
# Take every positive seed's vector out of glovedict (pop removes the entry,
# so this cell raises KeyError if it is run a second time on the same dict).
embedding_list_pos = [glovedict.pop(seed_key) for seed_key in pos_list]

# One row per positive seed.
pos_array = np.array(embedding_list_pos)

In [48]:
# Take every negative seed's vector out of glovedict (pop removes the entry,
# so this cell raises KeyError if it is run a second time on the same dict).
embedding_list_neg = [glovedict.pop(seed_key) for seed_key in neg_list]

# One row per negative seed.
neg_array = np.array(embedding_list_neg)

In [49]:
# Remove the '<unk>' entry so it is not scored as a candidate word.
# A default is passed so the cell can be re-run (or run on a vector file
# without '<unk>') without raising KeyError.
glovedict.pop('<unk>', None);

#### Create GloVe array

In [50]:
# Stack the remaining (non-seed) embeddings row by row, in dict order.
glove_array = np.array([*glovedict.values()])

In [51]:
# Display the dimensions of the candidate matrix: (rows = remaining words, columns = vector length).
glove_array.shape

(34762, 150)

### Compute word scores

In [52]:
# Keep at most this many seed vectors (leading rows) on each side.
num_seed_words = 200
pos_array, neg_array = pos_array[:num_seed_words], neg_array[:num_seed_words]

In [53]:
# Dot product of every candidate vector with every seed vector:
# row i, column j = <candidate i, seed j>.
M_p = np.matmul(glove_array, pos_array.T)
M_n = np.matmul(glove_array, neg_array.T)

In [54]:
# Dimensions of the candidate matrix, reused when reshaping the norms.
shape = np.shape(glove_array)

In [55]:
# Products of vector lengths: row i, column j = |candidate i| * |seed j|.
# The candidate norms are kept as a column so they broadcast against the
# 1-D seed norms.
candidate_norms = np.linalg.norm(glove_array, axis=1, keepdims=True)
L_p = candidate_norms * np.linalg.norm(pos_array, axis=1)
L_n = candidate_norms * np.linalg.norm(neg_array, axis=1)

In [56]:
# Cosine similarity = dot product / product of norms; sum it over all seeds
# to get one value per candidate word.
cos_pos = M_p / L_p
cos_neg = M_n / L_n
P = cos_pos.sum(axis=1)
Q = cos_neg.sum(axis=1)

In [57]:
# Raw score per candidate: summed similarity to the positive seeds minus
# summed similarity to the negative seeds.
S_i = np.subtract(P, Q)

### Find top and bottom words

In [58]:
# Rescale the raw scores linearly to the interval [-1, 1].
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(-1, 1))
# The scaler works on 2-D input, so present the scores as a single column
# and flatten the result back to 1-D afterwards.
score_column = S_i[:, np.newaxis]
norm_score = scaler.fit_transform(score_column).flatten()

In [59]:
# Create word and frequency lists from the keys still present in glovedict.
# NOTE(review): the row order of norm_score matches these keys only if
# glovedict has not been modified since glove_array was built - confirm.
words = np.array(list(glovedict))
freqs = [int(vocab_dict[key]) for key in words]

df = pd.DataFrame({'word': words, 'score': norm_score, 'freq': freqs})

In [60]:
# Frequency limits: keep only words with 100 < freq < 15000.
in_freq_band = (df['freq'] < 15000) & (df['freq'] > 100)
df = df[in_freq_band]

In [61]:
# Split the "lemma-POS" key into a lemma column and a pos column.
# The key is cut at its LAST hyphen only, so lemmas that themselves contain
# hyphens (including ones followed by a capital Latin letter) stay intact and
# the split always yields at most two parts.
# assign() returns a new frame instead of writing into a filtered slice,
# which avoids pandas' SettingWithCopyWarning.
lemma_pos = df['word'].str.rsplit('-', n=1, expand=True)
df = df.assign(lemma=lemma_pos[0], pos=lemma_pos[1])

In [62]:
# Filter out numbers, proper nouns and punctuation (by POS tag), plus any
# row whose lemma consists of numeric characters only.
excluded_tags = ['NUM', 'PROPN', 'PUNCT']
tag_allowed = ~df['pos'].isin(excluded_tags)
lemma_not_numeric = df['lemma'].str.isnumeric() == False
df = df[tag_allowed & lemma_not_numeric]

In [63]:
# Order rows by ascending score: most negative first, most positive last.
sorted_df = df.sort_values(by='score')

### Positive expansion

In [64]:
# The 500 highest-scoring words, reversed so the most positive comes first.
pos_expansion = sorted_df.tail(500).iloc[::-1]

In [65]:
# Print the whole positive expansion: no row/column truncation, 3 decimals.
full_table_options = (
    'display.max_rows', None,
    'display.max_columns', None,
    'display.precision', 3,
)
with pd.option_context(*full_table_options):
    print(pos_expansion)

                           word  score   freq                  lemma    pos
596                осигуря-VERB  1.000   5405                осигуря   VERB
632             гарантирам-VERB  0.967   5086             гарантирам   VERB
601                  заедно-ADV  0.893   5369                 заедно    ADV
324               подкрепа-NOUN  0.891  10529               подкрепа   NOUN
2966               пожелая-VERB  0.875    673                пожелая   VERB
317                   нека-PART  0.833  10676                   нека   PART
1489           гарантиране-NOUN  0.815   1817            гарантиране   NOUN
711                 усилие-NOUN  0.800   4525                 усилие   NOUN
1728              разчитам-VERB  0.794   1473               разчитам   VERB
241             давам-(се)-VERB  0.790  14123             давам-(се)   VERB
724              приоритет-NOUN  0.781   4429              приоритет   NOUN
1390           запазя-(се)-VERB  0.780   1996            запазя-(се)   VERB
931         

In [66]:
# Save the positive expansion table as CSV (the index is written as well).
pos_expansion.to_csv(path_or_buf='positive_expansion_BG.csv')

### Negative expansion

In [67]:
# The 500 lowest-scoring words, most negative first.
neg_expansion = sorted_df.head(500)

In [68]:
# Print the whole negative expansion: no row/column truncation, 3 decimals.
full_table_options = (
    'display.max_rows', None,
    'display.max_columns', None,
    'display.precision', 3,
)
with pd.option_context(*full_table_options):
    print(neg_expansion)

                         word  score  freq               lemma   pos
7891          причинявам-VERB -0.827   115          причинявам  VERB
4441             очевиден-ADJ -0.791   337            очевиден   ADJ
7527     закононарушение-NOUN -0.725   127     закононарушение  NOUN
7887            пораждам-VERB -0.715   115            пораждам  VERB
7507          инфлационен-ADJ -0.696   128         инфлационен   ADJ
7242          предизвикам-ADJ -0.695   137         предизвикам   ADJ
5217          вследствие-NOUN -0.687   253          вследствие  NOUN
7905          безхаберие-NOUN -0.679   114          безхаберие  NOUN
6852              повред-NOUN -0.678   153              повред  NOUN
3178              авария-NOUN -0.677   595              авария  NOUN
4168             умишлено-ADV -0.677   373            умишлено   ADV
4848       пораждам-(се)-VERB -0.674   290       пораждам-(се)  VERB
8361          форсмажорен-ADJ -0.662   103         форсмажорен   ADJ
2414      вследствие-(на)-ADP -0.6

In [69]:
# Save the negative expansion table as CSV (the index is written as well).
neg_expansion.to_csv(path_or_buf='negative_expansion_BG.csv')

### Save to file - POSITIVE

#### Combine seed and expansion lemmas and POS

In [70]:
# Put the expansion lemmas / POS tags directly after the seed entries.
# Slice assignment replaces whatever currently follows the seeds, so running
# this cell more than once does not append the expansion a second time
# (on the first run it is equivalent to appending).
n_pos_seeds = len(pos_list)
words_pos[n_pos_seeds:] = list(pos_expansion['lemma'])
postags_pos[n_pos_seeds:] = list(pos_expansion['pos'])

#### Combine seed and expansion frequency-weighted scores

In [71]:
# Every seed word gets the fixed score 1; the expansion scores follow.
pos_seed_score = [1] * len(pos_list)
pos_seed_score.extend(list(pos_expansion['score']))

#### Write to file

In [72]:
# Write one "word<TAB>POS<TAB>score" line per entry. zip stops at the
# shortest of the three lists.
with open('sentdict_pos_BG.txt', 'w', encoding = 'utf-8') as f:
    for entry_word, entry_tag, entry_score in zip(words_pos, postags_pos, pos_seed_score):
        f.write('\t'.join((entry_word, entry_tag, str(entry_score))) + '\n')

### Save to file - NEGATIVE

#### Combine seed and expansion lemmas and POS

In [73]:
# Put the expansion lemmas / POS tags directly after the seed entries.
# Slice assignment replaces whatever currently follows the seeds, so running
# this cell more than once does not append the expansion a second time
# (on the first run it is equivalent to appending).
n_neg_seeds = len(neg_list)
words_neg[n_neg_seeds:] = list(neg_expansion['lemma'])
postags_neg[n_neg_seeds:] = list(neg_expansion['pos'])

#### Combine seed and expansion frequency-weighted scores

In [74]:
# Every seed word gets the fixed score -1; the expansion scores follow.
neg_seed_score = [-1] * len(neg_list)
neg_seed_score.extend(list(neg_expansion['score']))

#### Write to file

In [75]:
# Write one "word<TAB>POS<TAB>score" line per entry. zip stops at the
# shortest of the three lists.
with open('sentdict_neg_BG.txt', 'w', encoding = 'utf-8') as f:
    for entry_word, entry_tag, entry_score in zip(words_neg, postags_neg, neg_seed_score):
        f.write('\t'.join((entry_word, entry_tag, str(entry_score))) + '\n')