# Stance Detection System

In [None]:
import numpy as np
import gensim
import unicodedata
import string
from nltk.corpus import stopwords
from sklearn import preprocessing, linear_model
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score

class MeanEmbeddingVectorizer(object):
    """Turn each tokenised text into the mean of its known word vectors.

    Exposes ``fit`` / ``transform`` so it can be used like a transformer.
    """

    def __init__(self, word2vec, dim=None):
        """Store the lookup and work out the embedding dimensionality.

        Args:
            word2vec: token -> vector lookup; must support ``in`` and ``[]``.
            dim: length of the zero vector returned for texts without any
                known token. When omitted it is taken from the first vector
                of ``word2vec``; if that is impossible (empty lookup, or an
                object without ``values()``) the historical default of 150
                is used.
        """
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = self._infer_dim(word2vec) if dim is None else dim

    @staticmethod
    def _infer_dim(word2vec, default=150):
        """Return the length of the first vector in ``word2vec`` or ``default``."""
        try:
            return len(next(iter(word2vec.values())))
        except (AttributeError, StopIteration, TypeError):
            return default

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self`` so calls can be chained."""
        return self

    def transform(self, X):
        """Average the vectors of the known tokens of every text in ``X``.

        Args:
            X: iterable of token sequences.

        Returns:
            ``numpy`` array with one row per text. A text with no token
            present in the lookup yields a row of ``self.dim`` zeros.
        """
        rows = []
        for words in X:
            known = [self.word2vec[w] for w in words if w in self.word2vec]
            if known:
                rows.append(np.mean(known, axis=0))
            else:
                rows.append(np.zeros(self.dim))
        return np.array(rows)

Building the stop-word list for Catalan and Spanish

In [None]:
# Translation table that deletes every character in string.punctuation.
punctuation = str.maketrans('', '', string.punctuation)

# Catalan stop words are read from a local file, one entry per line, with
# trailing whitespace (including the newline) removed. The `with` block
# closes the handle, and the explicit encoding avoids depending on the
# platform locale.
# NOTE(review): assumes stopwords-ca.txt is UTF-8 encoded -- confirm.
with open("stopwords-ca.txt", "r", encoding="utf-8") as stopword_file:
    stoplist = [entry.rstrip() for entry in stopword_file]

# Append NLTK's Spanish stop words so one list covers both languages.
stoplist += stopwords.words('spanish')

Loading and preprocessing the data

In [None]:
def _strip_accents_and_punctuation(text):
    """Return ``text`` without combining marks and punctuation, as UTF-8 bytes.

    The text is NFD-normalised so that accents become separate combining
    characters (category ``Mn``), which are then dropped; afterwards every
    character covered by the ``punctuation`` table is deleted.
    """
    decomposed = unicodedata.normalize('NFD', text)
    unaccented = ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')
    return unaccented.translate(punctuation).encode('utf-8')


def _load_tweets(path):
    """Read ``path`` and return the cleaned text of every line.

    Only the part after the first ``:::`` separator is kept. A line without
    the separator raises ``IndexError``; it is not skipped silently, so a
    malformed line cannot go unnoticed.
    """
    # Explicit encoding so the result does not depend on the platform locale.
    with open(path, "r", encoding="utf-8") as lines:
        return [_strip_accents_and_punctuation(line.split(':::', 1)[1])
                for line in lines]


def _tokenize(cleaned_tweets, stop_words):
    """Split each cleaned tweet on whitespace and drop stop words.

    Tokens stay UTF-8 ``bytes`` and are lower-cased; the stop-word test is
    done on the decoded, lower-cased token.
    """
    return [
        [word.lower() for word in tweet.split()
         if word.decode("utf-8").lower() not in stop_words]
        for tweet in cleaned_tweets
    ]


# Constant-time membership test instead of scanning the list for every token.
# NOTE(review): accents are stripped from the tweets but not from the
# stop-word entries, so accented stop words can never match -- confirm
# whether that is intended before normalising the list as well.
stop_words = frozenset(stoplist)

w2vES = _load_tweets("Stance-IberEval2017-training-20170320/training_tweets_es.txt")
print(len(w2vES))
print("Start")
XES = _tokenize(w2vES, stop_words)
print(XES[0])
print(len(XES))

w2vCA = _load_tweets("Stance-IberEval2017-training-20170320/training_tweets_ca.txt")
print(len(w2vCA))
print("Start")
XCA = _tokenize(w2vCA, stop_words)
print(XCA[0])
print(len(XCA))

Word2vec Embedding

In [None]:
def _embed_corpus(sentences, banner):
    """Train Word2Vec on ``sentences`` and mean-embed those same sentences.

    Args:
        sentences: list of token lists.
        banner: message printed right before the sentences are vectorised.

    Returns:
        Tuple ``(model, lookup, vectorizer, features)``: the trained model,
        a plain ``dict`` from token to vector, the vectoriser built on that
        dict, and the array it produces for ``sentences``.
    """
    # NOTE(review): `size`, `index2word` and `syn0` are the gensim 3.x
    # spellings; gensim 4 renamed them (`vector_size`, `index_to_key`,
    # `vectors`) -- confirm the installed version.
    model = gensim.models.Word2Vec(sentences, size=150, min_count=5)
    vectors = model.wv
    lookup = dict(zip(vectors.index2word, vectors.syn0))

    print(banner)
    vectorizer = MeanEmbeddingVectorizer(lookup)
    return model, lookup, vectorizer, vectorizer.transform(sentences)


# Spanish first, then Catalan; `Z` ends up bound to the Catalan vectoriser.
modelES, w2vES, Z, Z1ES = _embed_corpus(XES, "Embedding data ES")
modelCA, w2vCA, Z, Z1CA = _embed_corpus(XCA, "Embedding data CA")

Loading the labels of the dataset

In [None]:
def _read_labels(path):
    """Return the second ``:::``-separated field of every line, as UTF-8 bytes.

    Each line is split on at most two separators and the field at index 1
    is kept. A line without any separator raises ``IndexError``.
    """
    # NOTE(review): if a line has only two fields, the kept one still ends
    # with the newline -- confirm the truth files have three fields per line.
    # Explicit encoding so the result does not depend on the platform locale.
    with open(path, encoding="utf-8") as lines:
        return [line.split(':::', 2)[1].encode('utf-8') for line in lines]


labelsES = _read_labels("Stance-IberEval2017-training-20170320/training_truth_es.txt")
labelsCA = _read_labels("Stance-IberEval2017-training-20170320/training_truth_ca.txt")

SVM Classifier

In [None]:
def _report_kfold_f1(features, labels, penalty, title, n_splits=10):
    """Print the macro-F1 of an SVC on every fold of a K-fold split.

    Args:
        features: 2-D array with one row per sample.
        labels: array of encoded labels aligned with ``features``.
        penalty: value passed to ``SVC`` as ``C``.
        title: line printed before the per-fold scores.
        n_splits: number of folds.

    Returns:
        Tuple ``(splitter, classifier, scores)``: the ``KFold`` object, the
        classifier as fitted on the last fold, and the list of fold scores.
    """
    # NOTE(review): KFold is used with its defaults, i.e. without shuffling;
    # if the data files are ordered by label the folds will be unbalanced.
    splitter = KFold(n_splits=n_splits)
    classifier = SVC(C=penalty)
    print(title)

    scores = []
    for train_index, test_index in splitter.split(features):
        classifier.fit(features[train_index], labels[train_index])
        predicted = classifier.predict(features[test_index])
        fold_score = f1_score(labels[test_index], predicted, average='macro')
        print(fold_score)
        scores.append(fold_score)
    return splitter, classifier, scores


le = preprocessing.LabelEncoder()

labelsES = le.fit_transform(labelsES)
kf, clf, f1ES = _report_kfold_f1(Z1ES, labelsES, 2.9e5, 'Spanish:')

# The encoder is refitted here, so afterwards it only holds the Catalan
# label mapping.
labelsCA = le.fit_transform(labelsCA)
kf, clf, f1CA = _report_kfold_f1(Z1CA, labelsCA, 2.7e5, 'Catalan:')