In [307]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

### Loading seed dicts

In [308]:
# Read the first 200 rows of the positive seed file as lists of tab-separated fields.
# The file is decoded as UTF-8 (the same encoding the result files are written in)
# and the line terminator is stripped so it cannot end up inside the last field.
with open('positive_seed_DK_ALL_CORR.txt', encoding='utf-8') as f:
    pos_dk = [line.rstrip('\n').split('\t') for line in f][:200]

In [309]:
# Read the first 200 rows of the negative seed file as lists of tab-separated fields.
# The file is decoded as UTF-8 (the same encoding the result files are written in)
# and the line terminator is stripped so it cannot end up inside the last field.
with open('negative_seed_DK_ALL.txt', encoding='utf-8') as f:
    neg_dk = [line.rstrip('\n').split('\t') for line in f][:200]

In [310]:
# The first two fields of every positive seed row: the word and its POS tag.
words_pos = [fields[0] for fields in pos_dk]
postags_pos = [fields[1] for fields in pos_dk]

# Seed entries joined as "word-POSTAG".
pos_list = [f"{word}-{tag}" for word, tag in zip(words_pos, postags_pos)]

In [311]:
# The first two fields of every negative seed row: the word and its POS tag.
words_neg = [fields[0] for fields in neg_dk]
postags_neg = [fields[1] for fields in neg_dk]

# Seed entries joined as "word-POSTAG".
neg_list = [f"{word}-{tag}" for word, tag in zip(words_neg, postags_neg)]

### Loading GloVe embeddings

In [312]:
# Load the raw embedding file. Every element of `glove` is a one-item list holding
# one line of the file without its line terminator.
# rstrip('\n') (instead of split('\n')[:-1]) keeps a final line that has no trailing
# newline; the old form turned such a line into an empty list.
with open('glove/vectors.txt', 'r', encoding='utf-8') as f:
    glove = [[line.rstrip('\n')] for line in f]

In [313]:
# Map the first space-separated token of each embedding line to the list of
# floats parsed from the remaining tokens.
glovedict = {}

for row in glove:
    token, *components = row[0].split(" ")
    glovedict[token] = [float(component) for component in components]

#### Frequency dictionary for word embeddings

In [314]:
# Load the raw vocabulary file. Every element of `vocab` is a one-item list holding
# one line of the file without its line terminator.
# rstrip('\n') (instead of split('\n')[:-1]) keeps a final line that has no trailing
# newline; the old form turned such a line into an empty list.
with open('glove/vocab.txt', 'r', encoding='utf-8') as f:
    vocab = [[line.rstrip('\n')] for line in f]

In [315]:
# Split every vocabulary line once on spaces; field 0 is the word, field 1 its
# frequency (kept as a string here).
vocab_fields = [entry[0].split(" ") for entry in vocab]
vocab_words = [fields[0] for fields in vocab_fields]
vocab_freqs = [fields[1] for fields in vocab_fields]

In [316]:
# Lookup table from vocabulary word to its frequency string.
vocab_dict = {word: freq for word, freq in zip(vocab_words, vocab_freqs)}

#### Check if seed words are in vocab - manual correction

In [317]:
# Text before the first hyphen of every vocabulary entry (a hyphenated word is
# therefore cut at its first hyphen as well).
vocab_list = [entry.partition('-')[0] for entry in vocab_words]

In [318]:
# Manual check: show the full vocabulary entry whose prefix is 'takke'.
takke_index = vocab_list.index('takke')
vocab_words[takke_index]

'takke-VERB'

In [319]:
# Report every positive seed entry that has no embedding.
# Membership is tested on the dict itself (a hash lookup) instead of rebuilding
# a list of all keys on every iteration.
for word in pos_list:
    if word not in glovedict:
        print(word, "MISSING")

In [320]:
# Report every negative seed entry that has no embedding.
# Membership is tested on the dict itself (a hash lookup) instead of rebuilding
# a list of all keys on every iteration.
for word in neg_list:
    if word not in glovedict:
        print(word, "MISSING")

#### Create embedding arrays for positive and negative seed lists 

`.pop()` also removes the seed words from `glovedict`, so that the seeds are not scored against themselves (which would produce perfect matches).

In [321]:
# dict.pop returns each positive seed's vector and deletes that seed from
# glovedict; a seed that is not in the dict raises KeyError.
embedding_list_pos = [glovedict.pop(seed) for seed in pos_list]

pos_array = np.array(embedding_list_pos)

In [322]:
# dict.pop returns each negative seed's vector and deletes that seed from
# glovedict; a seed that is not in the dict raises KeyError.
embedding_list_neg = [glovedict.pop(seed) for seed in neg_list]

neg_array = np.array(embedding_list_neg)

In [323]:
# Drop the '<unk>' entry. The default makes the cell safe to re-run and
# tolerant of embedding files that do not contain this token.
glovedict.pop('<unk>', None);

#### Create GloVe array

In [324]:
# Stack the vectors still present in glovedict, one row per remaining entry,
# in the dict's iteration order.
remaining_vectors = list(glovedict.values())
glove_array = np.array(remaining_vectors)

In [325]:
# Display the dimensions of the embedding matrix.
np.shape(glove_array)

(55810, 150)

### Compute word scores

In [326]:
# Cap both seed matrices at the same number of rows.
num_seed_words = 200
seed_rows = slice(0, num_seed_words)
pos_array = pos_array[seed_rows]
neg_array = neg_array[seed_rows]

In [327]:
# Dot product of every remaining vocabulary vector with every seed vector.
M_p = np.matmul(glove_array, pos_array.T)
M_n = np.matmul(glove_array, neg_array.T)

In [328]:
# Dimensions of the embedding matrix, kept for reshaping the norm vector.
shape = np.shape(glove_array)

In [329]:
# Products of vector norms: a column of vocabulary norms broadcast against the
# row of seed norms, giving one entry per (vocabulary word, seed word) pair.
glove_norms = np.linalg.norm(glove_array, axis=1).reshape((shape[0], 1))
L_p = glove_norms * np.linalg.norm(pos_array, axis=1)
L_n = glove_norms * np.linalg.norm(neg_array, axis=1)

In [330]:
# Dot product divided by norm product is the cosine similarity; sum it over
# all positive (P) and all negative (Q) seeds for each vocabulary word.
P = (M_p / L_p).sum(axis=1)
Q = (M_n / L_n).sum(axis=1)

In [331]:
# Raw score per word: summed positive similarity minus summed negative similarity.
S_i = np.subtract(P, Q)

### Find top and bottom words

In [332]:
# Rescale the raw scores linearly into the range [-1, 1].
# MinMaxScaler expects a 2-D input, hence the reshape to a single column and
# the flatten back to 1-D afterwards.
scaler = MinMaxScaler((-1, 1))
norm_score = scaler.fit_transform(S_i.reshape(-1, 1)).flatten()

In [333]:
# Words still present in glovedict, in the dict's iteration order, with the
# frequency of each looked up in vocab_dict and converted to int.
words = np.array(list(glovedict))
freqs = [int(vocab_dict[token]) for token in words]

df = pd.DataFrame({'word': words, 'score': norm_score, 'freq': freqs})

In [334]:
# Frequency limits: keep only words with 100 < freq < 15000.
freq = df['freq']
df = df[(freq > 100) & (freq < 15000)]

In [335]:
# Split the word column into lemma and pos columns.
# The split point is the run of hyphens that is followed only by capital letters
# up to the end of the string, i.e. the POS tag. Anchoring on the end guarantees
# at most one split per word, so a lemma that itself contains a hyphen followed
# by a capital letter no longer produces a third column (which made the
# two-column assignment raise).
lemma_pos = (
    df['word']
    .str.split(r'-+(?=[A-Z]+$)', regex=True, expand=True)
    .reindex(columns=[0, 1])  # always expose exactly two columns
)
# assign() returns a new frame, avoiding writes into a filtered slice.
df = df.assign(lemma=lemma_pos[0], pos=lemma_pos[1])

In [336]:
# Filter out numbers, proper nouns and punctuation, plus any row whose lemma is
# purely numeric. Rows whose lemma is missing are dropped as well, because the
# comparison with False is not satisfied for them.
excluded_tags = ['NUM', 'PROPN', 'PUNCT']
keep_tag = ~df['pos'].isin(excluded_tags)
keep_lemma = df['lemma'].str.isnumeric().eq(False)
df = df[keep_tag & keep_lemma]

In [337]:
# Sort ascending by score, so the lowest scores come first.
sorted_df = df.sort_values(by='score')

### Positive expansion

In [338]:
# The 500 highest-scoring rows, reversed so the best score comes first.
pos_expansion = sorted_df.tail(500).iloc[::-1]

In [339]:
# Temporarily lift the row/column display limits and print scores with three
# decimals, so the complete expansion table is shown.
display_options = (
    'display.max_rows', None,
    'display.max_columns', None,
    'display.precision', 3,
)
with pd.option_context(*display_options):
    print(pos_expansion)

                             word  score   freq                   lemma    pos
474                    takke-VERB  1.000   7866                   takke   VERB
341               samarbejde-NOUN  0.972  11703              samarbejde   NOUN
457                    glæde-VERB  0.964   8086                   glæde   VERB
4925          repræsentation-NOUN  0.928    285          repræsentation   NOUN
674                   levere-VERB  0.915   5013                  levere   VERB
2019                     flot-ADJ  0.914   1143                    flot    ADJ
1122                kvittere-VERB  0.891   2606                kvittere   VERB
629                opbakning-NOUN  0.885   5429               opbakning   NOUN
1204                   dejlig-ADJ  0.877   2359                  dejlig    ADJ
1130                 tilbyde-VERB  0.863   2567                 tilbyde   VERB
347                    sikker-ADJ  0.840  11622                  sikker    ADJ
2695                tillykke-INTJ  0.839    739     

In [340]:
# Persist the positive expansion table (index included) as CSV.
pos_expansion.to_csv(path_or_buf='positive_expansion_DK.csv')

### Negative expansion

In [341]:
# The 500 lowest-scoring rows, most negative first.
neg_expansion = sorted_df.head(500)

In [342]:
# Temporarily lift the row/column display limits and print scores with three
# decimals, so the complete expansion table is shown.
display_options = (
    'display.max_rows', None,
    'display.max_columns', None,
    'display.precision', 3,
)
with pd.option_context(*display_options):
    print(neg_expansion)

                                  word  score  freq  \
5968                    forårsage-VERB -0.886   207   
4310                    utilsigtet-ADJ -0.855   352   
7152                     eskalere-VERB -0.855   153   
5955                  spekulation-NOUN -0.788   208   
8488                      afværge-VERB -0.779   115   
7688                       fyring-NOUN -0.769   136   
7618                 komplikation-NOUN -0.760   138   
4646                      hærværk-NOUN -0.750   312   
7153                         fatal-ADJ -0.738   153   
3901                oversvømmelse-NOUN -0.736   413   
8344                 teknikalitet-NOUN -0.731   119   
5469                     overfald-NOUN -0.720   241   
3070                        unødig-ADJ -0.719   598   
7086                   forældelse-NOUN -0.716   155   
5673                      episode-NOUN -0.712   225   
8005                    verserende-ADJ -0.700   127   
8228                helbredsmæssig-ADJ -0.697   121   
8429      

In [343]:
# Persist the negative expansion table (index included) as CSV.
neg_expansion.to_csv(path_or_buf='negative_expansion_DK.csv')

### Save to file - POSITIVE

#### Combine seed and expansion lemmas and POS

In [344]:
# Append the expansion lemmas and POS tags to the seed lists. Both lists are
# mutated in place, so running this cell twice appends the expansion twice.
words_pos.extend(pos_expansion['lemma'].tolist())
postags_pos.extend(pos_expansion['pos'].tolist())

#### Combine seed and expansion frequency-weighted scores

In [345]:
# Every seed word gets the fixed score 1; the expansion words follow with
# their computed scores.
pos_seed_score = [1] * len(pos_list)
pos_seed_score.extend(pos_expansion['score'].tolist())

#### Write to file

In [346]:
# Write one tab-separated record (word, POS tag, score) per line, UTF-8 encoded.
with open('sentdict_pos_DK.txt', 'w', encoding='utf-8') as out_file:
    for lemma, tag, value in zip(words_pos, postags_pos, pos_seed_score):
        out_file.write('\t'.join((lemma, tag, str(value))) + '\n')

### Save to file - NEGATIVE

#### Combine seed and expansion lemmas and POS

In [347]:
# Append the expansion lemmas and POS tags to the seed lists. Both lists are
# mutated in place, so running this cell twice appends the expansion twice.
words_neg.extend(neg_expansion['lemma'].tolist())
postags_neg.extend(neg_expansion['pos'].tolist())

#### Combine seed and expansion frequency-weighted scores

In [348]:
#create scores for seed list
neg_seed_score = [-1 for word in neg_list]
for score in list(neg_expansion['score']):
    neg_seed_score.append(score)

#### Write to file

In [349]:
# Write one tab-separated record (word, POS tag, score) per line, UTF-8 encoded.
with open('sentdict_neg_DK.txt', 'w', encoding='utf-8') as out_file:
    for lemma, tag, value in zip(words_neg, postags_neg, neg_seed_score):
        out_file.write('\t'.join((lemma, tag, str(value))) + '\n')