## Predicting the polarity of a polar word

* use a polarity lexicon with positive and negative words
* take half of it to train a feedforward net and have to test
* use the embedding of a word as input representation


* a real setting would be to predict 3 classes: neutral, positive, negative


In [1]:
# code from our textbook

import torch
import numpy as np

def load_emb_from_file(filepath):
    """
    Load  embeddings 
    
    Args:
        filepath (str): path to the embeddings file 
    Returns:
        word_to_index (dict), embeddings (numpy.ndarary)
    """
  

    word_to_index = {}
    embeddings = []
    with open(filepath, "r") as fp:
        for index, line in enumerate(fp):
            line = line.split(" ") # each line: word num1 num2 ...
            word_to_index[line[0]] = index # word = line[0] 
            embedding_i = np.array([float(val) for val in line[1:]])
            embeddings.append(embedding_i)
    return word_to_index, np.stack(embeddings)

In [2]:
widx,emb=load_emb_from_file("/home/klenner/applications/gensim-0.13.4/vectors.txt")

In [3]:
# access the embedding of a word via its index

index=widx['freude']
word_embedding=emb[index]
index, word_embedding

(7737,
 array([-0.09773 , -0.073794, -0.052194, -0.499479, -0.319778,  0.157181,
         0.269456,  0.610306,  0.09075 ,  0.361076, -0.495249,  0.22715 ,
         0.155042,  0.316588,  0.479143, -0.076718, -0.102654, -0.714726,
         0.155304, -0.565182, -0.829133,  0.245589,  0.778199,  0.019168,
        -0.461478, -0.024124, -0.552235,  0.386096, -0.191506,  0.622527,
        -0.096904,  0.669733,  0.477398,  0.317568,  0.165939, -0.138258,
        -0.785988,  0.142101,  0.549931, -0.345421, -0.532162,  0.016093,
        -0.057987,  0.13559 ,  0.214938,  0.078067,  0.113646, -0.10188 ,
        -0.036438, -0.085324, -0.193315,  0.511736,  0.142473, -0.223453,
        -0.069988,  0.621426, -0.454478, -0.097701,  0.174274, -0.199256,
        -0.343834,  0.010616,  0.294296,  0.183556,  0.301568, -0.03067 ,
         0.723429, -0.305649,  0.408184, -0.204259, -0.268337, -0.153939,
         0.613482,  0.185519, -0.992607, -0.028342, -0.106007,  0.286124,
        -0.16511 , -0.546778,  

In [7]:
import pandas as pd
from sklearn.utils import shuffle

# return lemma,polarity pairs
polex=pd.read_csv("/home/klenner/python/python/data_lexicon/polex_de_clean",header=None,
                  index_col=[0,1],usecols=[0,1],names=['lemma', 'pol'])

ids=[]     # gather all embedding indices
index={}   # map index to polarity

for (l,p),_ in polex.iterrows():  # (lemma,polarity) pairs
    try:
        id=widx[l.lower()]  # lowercase, since word2vec version requests this
        if p=='POS':
            index[id]=1     # id is the word2vec index of lemma l
            ids.append(id)  # all ids for data split below
        elif p=='NEG':
            index[id]=0
            ids.append(id)
    except:
        pass

np.random.shuffle(ids)                    # random modifies ids directly
noun=[(id,index[id]) for id in ids]       # create input pairs: (word2vecID,polarity)

# split in train and test
train=noun[:2000]
test=noun[2001:]

# what is the baseline in a majority voting setting
pos=[1 for l,p in train if p == 1]
neg=[1 for l,p in train if p == 0]

a,b=len(pos),len(neg)   # well negative words are the majority class

baseline = b/(a+b)
baseline

(0.619, 1238, 762)

In [8]:
# would it work for the test set?

pos=[1 for l,p in test if p == 1]
neg=[1 for l,p in test if p == 0]

a,b=len(pos),len(neg)   # well negative words are the majority class

b/(a+b)

0.6081025217031831

In [9]:
len(emb)                       # vocabulary size

853624

In [12]:
import torch.nn as nn

embedding_dim=300

class Net(nn.Module):   
    def __init__(self,):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(embedding_dim,1)  
    
    def forward(self, x):
        x = torch.sigmoid(self.fc1(x))  
        return x
    
net=Net()    

In [13]:
import torch.optim as optim

optimizer = optim.SGD(net.parameters(), lr=0.01)

loss_func = nn.BCELoss()

for epoch in range(2):
    for id, label in  train:
        input=torch.tensor(emb[id], dtype=torch.float32, requires_grad=True)
        optimizer.zero_grad()       
        outputs = net(input)
        label=torch.tensor([label],dtype=torch.float)
        loss = loss_func(outputs, label )
        loss.backward()
        optimizer.step()

In [14]:
def step(x):
    if x < 0.5:
        return 0
    else:
        return 1
    
tp=0
for wid ,label in test:
    input=emb[wid]
    input=torch.tensor(input, dtype=torch.float32)  
    output = net(input)
    
    if step(output)==label:
        tp+=1

acc=tp/len(test)
acc

0.8515915667631253

In [None]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
Y = np.array([[2,-1],[-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])

nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(X)

distances, indices = nbrs.kneighbors(Y)

distances,indices         

indices: e.g. for the first element of the Y, 4 from X is the closest neighbor

array([[4],
        [1],
        [2],
        [3],
        [4],
        [5]]))

In [15]:
from sklearn.neighbors import NearestNeighbors

X_train = [emb[index] for (index,_) in train]
pols = [pol for (_,pol) in train]
neigh = NearestNeighbors(n_neighbors=1)
neigh.fit(X_train) 

tp=0
all=0
for (e_index,label) in test:
    embedding=emb[e_index]
    _,nn_index=neigh.kneighbors(embedding.reshape(1, -1))
    train_index=nn_index[0][0]
    knn_label=pols[train_index]
    if label==knn_label:
        tp+=1
    all+=1

    
print(tp/all)

0.7329474989665151
