## Predicting the polarity of a  word

* use a polarity lexicon with positive, negative and neutral words
* take 60% for training
* use word embeddings as features

* test traditional ML and MLP


* result SVM (non-linear support vector machine) and MLP are on par

In [1]:
# get the fasttext embeddings 

import torch
import numpy as np

def load_emb_from_file(filepath):

    word_to_index = {}
    embeddings = []
    with open(filepath, "r") as fp:
        for index, line in enumerate(fp):
            line = line.split(" ") # each line: word num1 num2 ...
            word_to_index[line[0]] = index # word = line[0] 
            embedding_i = np.array([float(val) for val in line[1:]])
            embeddings.append(embedding_i)
    return word_to_index, embeddings

In [2]:
# index, array of word embeddings

widx,emb=load_emb_from_file("/home/klenner/Lehre/ml20/cc.de.300.vec")

In [4]:
import pandas as pd
from sklearn.utils import shuffle

# data of learning task: polarity lexicon for German
polex=pd.read_csv("/home/klenner/Lehre/ml20/polexNeutNomen",header=None,
                  index_col=[0,1],usecols=[0,1],names=['lemma', 'pol'])

#polex=pd.read_csv("/home/klenner/applications/jupyter/rnn-notebook/word_majority_label_ag.tsv",header=None,
#                  index_col=[0,1],usecols=[0,1],names=['lemma', 'pol'])

ids=[]     # gather all embedding indices
index={}   # map index to polarity
oov=0      # out of vocabolary counter

for (l,p),_ in polex.iterrows():  # (lemma,polarity) pairs
    try:
        id=widx[l]  
        if p=='POS' or p=='PRO':
            index[id]=1     # id is the word2vec index of lemma l
            ids.append(id)  # all ids for data split below
        elif p=='NEG' or p=='CON':
            index[id]=0
            ids.append(id)
        else:
            index[id]=2
            ids.append(id)
    except:
        oov+=1
        pass

np.random.shuffle(ids)                    # random modifies ids directly
noun=[(id,index[id]) for id in ids]       # create input pairs: (word2vecID,polarity)

corpus_len=len(ids)
trainsplit= int(corpus_len*0.6)
                
# split in train and test
train=noun[:trainsplit]
test=noun[trainsplit+1:]


# what is the baseline in a majority voting setting
pos=[1 for l,p in train if p == 1]
neg=[1 for l,p in train if p == 0]
neu=[1 for l,p in train if p == 2]

a,b,c =len(pos),len(neg),len(neu)   # 

baseline = b/(a+b+c)    # Polex b, Anne c

len(noun),a,b,c,baseline,oov

(3656, 649, 1171, 373, 0.5339717282261742, 319)

In [5]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score 
from sklearn.metrics import accuracy_score

from sklearn import tree

# create input for sklearn algorithms
X_train = [emb[index] for (index,_) in train]
y_train = [label for (_,label) in train]

X_test = [emb[index] for (index,_) in test]
y_test =  [label for (_,label) in test]

clf = tree.DecisionTreeClassifier(criterion="entropy")
clf = clf.fit(X_train,y_train)

y_test_predict=clf.predict(X_test)

print(accuracy_score(y_test,y_test_predict))
precision_score(y_test,y_test_predict,average=None), recall_score(y_test,y_test_predict,average=None), f1_score(y_test,y_test_predict,average=None)


0.7113543091655267


(array([0.78516624, 0.62632696, 0.62679426]),
 array([0.77820025, 0.65121413, 0.59545455]),
 array([0.78166773, 0.63852814, 0.61072261]))


             neg     pos        neut
prec: (array([0.75745785, 0.58719647, 0.60504202]),

~rec: array([0.7664042 , 0.59375   , 0.57142857]),

~ ~f1: array([0.76190476, 0.59045505, 0.5877551 ]))

In [6]:
from sklearn.linear_model import Perceptron

clf = Perceptron(tol=1e-3)

clf.fit(X_train, y_train)

y_test_predict=clf.predict(X_test)

print(accuracy_score(y_test,y_test_predict))
precision_score(y_test,y_test_predict,average=None), recall_score(y_test,y_test_predict,average=None), f1_score(y_test,y_test_predict,average=None)


0.9151846785225718


(array([0.91716687, 0.91606715, 0.90566038]),
 array([0.96831432, 0.84326711, 0.87272727]),
 array([0.94204686, 0.87816092, 0.88888889]))

In [10]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(solver= 'lbfgs', max_iter=1000,random_state=0,multi_class='multinomial')
clf.fit(X_train, y_train)

y_test_predict=clf.predict(X_test)

print(accuracy_score(y_test,y_test_predict))
precision_score(y_test,y_test_predict,average=None), recall_score(y_test,y_test_predict,average=None), f1_score(y_test,y_test_predict,average=None)


0.9138166894664843


(array([0.92597087, 0.87692308, 0.95081967]),
 array([0.96704689, 0.8807947 , 0.79090909]),
 array([0.94606324, 0.87885463, 0.86352357]))

**The only difference between Perceptron and Multinomial Logistic Regression: softmax versus step function!**

In [7]:
from sklearn.neighbors import KNeighborsClassifier

neigh = KNeighborsClassifier(n_neighbors=5)
neigh.fit(X_train,y_train)

y_test_predict=neigh.predict(X_test)

print(accuracy_score(y_test,y_test_predict))
precision_score(y_test,y_test_predict,average=None), recall_score(y_test,y_test_predict,average=None), f1_score(y_test,y_test_predict,average=None)


0.8454172366621067


(array([0.882494  , 0.76254826, 0.95454545]),
 array([0.93282636, 0.87196468, 0.47727273]),
 array([0.90696242, 0.81359423, 0.63636364]))

In [8]:
from sklearn import svm

clf2 = svm.SVC(kernel='linear', C=1.0, random_state=0)

clf2.fit(X_train,y_train)

y_test_predict=clf2.predict(X_test)

print(accuracy_score(y_test, y_test_predict))
precision_score(y_test,y_test_predict,average=None), recall_score(y_test,y_test_predict,average=None), f1_score(y_test,y_test_predict,average=None)


0.9076607387140903


(array([0.93014706, 0.85327314, 0.93596059]),
 array([0.95712484, 0.8852459 , 0.78512397]),
 array([0.94344313, 0.86896552, 0.85393258]))

In [9]:
from sklearn import svm

clf4 = svm.SVC(kernel='rbf', random_state=0, gamma=0.10, C=10.0)

clf4.fit(X_train,y_train)

y_test_predict=clf4.predict(X_test)

print(accuracy_score(y_test, y_test_predict))
precision_score(y_test,y_test_predict,average=None), recall_score(y_test,y_test_predict,average=None), f1_score(y_test,y_test_predict,average=None)


0.908344733242134


(array([0.92892157, 0.86175115, 0.9245283 ]),
 array([0.95586381, 0.87587822, 0.80991736]),
 array([0.94220012, 0.86875726, 0.86343612]))

In [11]:
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(solver='adam', alpha=1e-5, activation='relu',
                    hidden_layer_sizes=(300,10), random_state=1,validation_fraction=0.2,early_stopping=False)
                  
clf.fit(X_train, y_train)

y_test_predict=clf.predict(X_test)

print(precision_score(y_test,y_test_predict,average=None), recall_score(y_test,y_test_predict,average=None), f1_score(y_test,y_test_predict,average=None))

accuracy_score(y_test,y_test_predict)

[0.94132653 0.85487528 0.89451477] [0.93064313 0.88290398 0.87603306] [0.93595434 0.86866359 0.88517745]


0.9076607387140903

In [None]:
from sklearn.linear_model import SGDClassifier
#sgd=SGDClassifier(random_state=42, max_iter=5)
sgd=SGDClassifier(random_state=42,loss='log', alpha=0.01,max_iter=5)

sgd.fit(X_train,y_train)

y_test_predict=sgd.predict(X_test)

print(precision_score(y_test,y_test_predict,average=None), recall_score(y_test,y_test_predict,average=None), f1_score(y_test,y_test_predict,average=None))
accuracy_score(y_test,y_test_predict)

In [None]:
from sklearn.model_selection import GridSearchCV

def svc_param_selection(X, y, nfolds):
    loss= ['hinge', 'log', 'modified_huber','squared_hinge', 'perceptron']
    alpha=[0.5,0.1,0.01,0.001,0.0001]
    maxiter=[1,4,5,10,11,12,15,20]
    param_grid = {'loss': loss,'alpha':alpha,'max_iter':maxiter}
    grid_search = GridSearchCV(sgd, param_grid, cv=nfolds, n_jobs=-1)
    grid_search.fit(X, y)
    grid_search.best_params_
    return grid_search.best_params_

p=svc_param_selection(X_train,y_train,5)
p

In [None]:
sgd=SGDClassifier(random_state=42,loss='hinge', alpha=0.0001,max_iter=10)

sgd.fit(X_train,y_train)

y_test_predict=sgd.predict(X_test)

accuracy_score(y_test,y_test_predict)

In [None]:
from sklearn.model_selection import GridSearchCV

def svc_param_selection(X, y, nfolds):
    activation= ['tanh','relu']
    alpha=[0.5,0.1,0.01,0.001,0.0001]
    solver=['lbfgs', 'sgd', 'adam']
    early_stopping=[False,True]
    param_grid = {'activation': activation,'alpha':alpha,'solver':solver,'early_stopping':early_stopping}
    grid_search = GridSearchCV(clf, param_grid, cv=nfolds, n_jobs=-1)
    grid_search.fit(X, y)
    grid_search.best_params_
    return grid_search.best_params_

p=svc_param_selection(X_train,y_train,5)
p

In [None]:
def svc_param_selection(X, y, nfolds):
    hidden_layer_sizes=[(200,10),(300,10),(200,50),(300,100),(200,100)]
    activation= ['relu']
    solver=['adam']
    alpha=[0.1]
    early_stopping=[True]
    param_grid = {'hidden_layer_sizes':hidden_layer_sizes,'activation': activation,'alpha':alpha,'solver':solver,'early_stopping':early_stopping}
    grid_search = GridSearchCV(clf, param_grid, cv=nfolds, n_jobs=-1)
    grid_search.fit(X, y)
    grid_search.best_params_
    return grid_search.best_params_

p=svc_param_selection(X_train,y_train,5)
p

In [12]:
# and the Winner is MLP (learn_rate of 0.1 is worse)

clf = MLPClassifier(solver='adam', alpha=1e-5, activation='relu',
                    hidden_layer_sizes=(300,10), random_state=1,validation_fraction=0.2,early_stopping=True)
                  
clf.fit(X_train, y_train)

y_test_predict=clf.predict(X_test)

print(precision_score(y_test,y_test_predict,average=None), recall_score(y_test,y_test_predict,average=None), f1_score(y_test,y_test_predict,average=None))

accuracy_score(y_test,y_test_predict)

[0.93316832 0.86111111 0.92342342] [0.95081967 0.87119438 0.84710744] [0.94191131 0.8661234  0.88362069]


0.9103967168262654