In [2]:
import os
os.listdir('.')

['.config',
 'TASS2017_T1_development.xml',
 'TASS2017_T1_training.xml',
 'ElhPolar_esV1.lex',
 'TASS2017_T1_test.xml',
 'sample_data']

In [3]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Exercice 2. Sentiment Analysis

## Modelo 1

In [11]:
from sklearn import svm
from sklearn.metrics import classification_report
from nltk.tokenize import TweetTokenizer
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
import re
from sklearn import svm
from sklearn.metrics import classification_report
import scipy
import re
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline


train_path = "TASS2017_T1_training.xml"
dev_path = "TASS2017_T1_development.xml"
test_path = "TASS2017_T1_test.xml"

def preprocess_xml(path, is_test=False):
    # Loading data from xml and tokenizing
    x=[]
    y=[]
    with open(path, "r") as f:
        soup = BeautifulSoup(f, "xml")
    tokenizer = TweetTokenizer(strip_handles=False, reduce_len=True, preserve_case=False)

    for tweet in soup.find_all("tweet"):
        content = tweet.content.string
        x.append(" ".join(tokenizer.tokenize(content)))
        if not is_test:
          sentiment = tweet.sentiment.polarity.value.string
          y.append(sentiment)

    return x,y


def mi_tokenizador(s):
    expresion = ['([0-9]+)(\sde\s)(\w+)(\sde\s)([0-9]+)', '(http(s)*://)([^\s]*)', '([0-9]{1,2})(\/|-|:)([0-9]{1,2})(\/|-|:)([0-9]{4}|[0-9]{2})',
                  '([^\s]+@[^\s]+)', '([@|#]([^\s]+))', '([À-úA-Z]+\.)+', '([0-9])+(\.|,|\-|\:|\/)([0-9]+)',  '([À-úa-zA-Z])+(\-)([À-úa-zA-Z]+)',
                  '([^\w\s.:;,!¡/@¿?%~\"\'\#-])', '\(', '\)', '\.\.\.', '\.', '\,', '\'', '\"', '\?', '\¿', '\!','\¡', '\;', '\:','\%','\w+']
    reg_exp = "|".join(expresion)
    compilador= re.compile(reg_exp, re.U)
    return [i.group(0)  for i in re.finditer(compilador, s)]


def load_lexicon(path):
    polarities= {}
    with open(path, 'r') as f:
        line = f.readline()
        while line:
          if not line.startswith('#') and line!="\n":
              terms = line.split()
              term = terms[0]
              if terms[-1] == 'positive':
                  polarities[term] = "POS"
              elif terms[-1]=='negative':
                  polarities[term] = "NEG"
              else:
                  polarities[term] = "NEU"
          line = f.readline()
    return polarities


def get_polarities(lexicon, data):
  result=[]
  for line in data:
    positive = 0
    negative = 0
    for word in line.split():
      polarity = lexicon.get(word, "NEU")
      if polarity == "POS":
        positive +=1
      elif polarity == "NEG":
        negative +=1
    result.append([positive, negative])
  return result


# Data preprocessing
train_x, train_y = preprocess_xml(train_path)
dev_x, dev_y = preprocess_xml(dev_path)
test_x, test_y = preprocess_xml(test_path, is_test=True)

# Vectorize the data
vectorizer = CountVectorizer(tokenizer=mi_tokenizador)
train_vectors = vectorizer.fit_transform(train_x)
dev_vectors = vectorizer.transform(dev_x)

# External resources
lexicon = load_lexicon('ElhPolar_esV1.lex')

# Get polarities of train data and append them to the train vectors
train_polaridades = get_polarities(lexicon, train_x)
#train_vectors=scipy.sparse.hstack([train_vectors, train_polaridades])

# Train classifier
classifier_liblinear = svm.LinearSVC(C=0.1)
classifier_liblinear.fit(train_vectors, train_y)

# Get polarities of the dev data and append them to the dev vectors
dev_polaridades = get_polarities(lexicon, dev_x)
#dev_vectors=scipy.sparse.hstack([dev_vectors, dev_polaridades])

# Eval classifier with dev
prediction_liblinear = classifier_liblinear.predict(dev_vectors)
print(classification_report(dev_y, prediction_liblinear))

              precision    recall  f1-score   support

           N       0.60      0.79      0.68       219
         NEU       0.27      0.10      0.15        69
        NONE       0.26      0.11      0.16        62
           P       0.60      0.62      0.61       156

    accuracy                           0.56       506
   macro avg       0.43      0.41      0.40       506
weighted avg       0.51      0.56      0.52       506



## Modelo 2

In [10]:
from sklearn import svm
from sklearn.metrics import classification_report
from nltk.tokenize import TweetTokenizer
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
import re
from sklearn import svm
from sklearn.metrics import classification_report
import scipy
import re
import tensorflow as tf
import keras
import tensorflow_hub as hub


train_path = "TASS2017_T1_training.xml"
dev_path = "TASS2017_T1_development.xml"
test_path = "TASS2017_T1_test.xml"
dev_labels = []

def preprocess_xml(path, is_test=False, is_dev=False):
    # Loading data from xml and tokenizing
    x=[]
    y=[]
    with open(path, "r") as f:
        soup = BeautifulSoup(f, "xml")
    tokenizer = TweetTokenizer(strip_handles=False, reduce_len=True, preserve_case=False)

    for tweet in soup.find_all("tweet"):
        content = tweet.content.string
        x.append(" ".join(tokenizer.tokenize(content)))
        if not is_test:
          sentiment = tweet.sentiment.polarity.value.string
          if sentiment=="N":
            sentiment=0
          elif sentiment=="P":
            sentiment=1
          elif sentiment=="NEU":
            sentiment=2
          else:
            sentiment=3
          y.append(sentiment)
          if is_dev:
            dev_labels.append(sentiment)
    x = tf.data.Dataset.from_tensor_slices(x)
    y = keras.utils.np_utils.to_categorical(y, 4)
    y = tf.data.Dataset.from_tensor_slices(y)
    
    return x,y


# Data preprocessing
train_x, train_y = preprocess_xml(train_path)
train_dataset = tf.data.Dataset.zip((train_x, train_y))

dev_x, dev_y = preprocess_xml(dev_path, is_dev=True)
dev_dataset = tf.data.Dataset.zip((dev_x, dev_y))

test_x, test_y = preprocess_xml(test_path, is_test=True)
test_dataset = tf.data.Dataset.zip((test_x, test_y))

train_dataset = train_dataset.batch(128)
dev_dataset = dev_dataset.batch(128)
test_dataset = test_dataset.batch(128)



hub_layer = hub.KerasLayer("https://tfhub.dev/google/tf2-preview/nnlm-es-dim128/1", output_shape=[128],
                           input_shape=[], dtype=tf.string)

model = keras.Sequential()
model.add(hub_layer)
model.add(keras.layers.Dense(16, activation='relu'))
model.add(keras.layers.Dense(4, activation='softmax'))

model.summary()

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

history = model.fit(train_dataset, epochs=20)
test_loss, test_acc = model.evaluate(dev_dataset)
print('\nDev loss: {:.3f}, Dev accuracy: {:.3f}'.format(test_loss, test_acc))

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 keras_layer_1 (KerasLayer)  (None, 128)               125009920 
                                                                 
 dense_2 (Dense)             (None, 16)                2064      
                                                                 
 dense_3 (Dense)             (None, 4)                 68        
                                                                 
Total params: 125,012,052
Trainable params: 2,132
Non-trainable params: 125,009,920
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Dev loss: 1.109, Dev accuracy: 0.571


In [6]:
import numpy as np
from sklearn.metrics import classification_report

y_hat = model.predict(dev_dataset)
y_hat = [np.argmax(i) for i in y_hat]
print(len(y_hat))
print(len(dev_labels))

print(classification_report(dev_labels, y_hat, target_names=["NEG","POS","NEU","NONE"]))


506
506
              precision    recall  f1-score   support

         NEG       0.54      0.83      0.66       219
         POS       0.54      0.58      0.56       156
         NEU       0.00      0.00      0.00        69
        NONE       0.67      0.03      0.06        62

    accuracy                           0.54       506
   macro avg       0.44      0.36      0.32       506
weighted avg       0.48      0.54      0.46       506



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


##Modelo 3

In [14]:
from sklearn import svm
from sklearn.metrics import classification_report
from nltk.tokenize import TweetTokenizer
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
import re
from sklearn import svm
from sklearn.metrics import classification_report
import scipy
import re
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline


train_path = "TASS2017_T1_training.xml"
dev_path = "TASS2017_T1_development.xml"
test_path = "TASS2017_T1_test.xml"

def preprocess_xml(path, is_test=False):
    # Loading data from xml and tokenizing
    x=[]
    y=[]
    with open(path, "r") as f:
        soup = BeautifulSoup(f, "xml")
    tokenizer = TweetTokenizer(strip_handles=False, reduce_len=True, preserve_case=False)

    for tweet in soup.find_all("tweet"):
        content = tweet.content.string
        x.append(" ".join(tokenizer.tokenize(content)))
        if not is_test:
          sentiment = tweet.sentiment.polarity.value.string
          y.append(sentiment)

    return x,y


def mi_tokenizador(s):
    expresion = ['([0-9]+)(\sde\s)(\w+)(\sde\s)([0-9]+)', '(http(s)*://)([^\s]*)', '([0-9]{1,2})(\/|-|:)([0-9]{1,2})(\/|-|:)([0-9]{4}|[0-9]{2})',
                  '([^\s]+@[^\s]+)', '([@|#]([^\s]+))', '([À-úA-Z]+\.)+', '([0-9])+(\.|,|\-|\:|\/)([0-9]+)',  '([À-úa-zA-Z])+(\-)([À-úa-zA-Z]+)',
                  '([^\w\s.:;,!¡/@¿?%~\"\'\#-])', '\(', '\)', '\.\.\.', '\.', '\,', '\'', '\"', '\?', '\¿', '\!','\¡', '\;', '\:','\%','\w+']
    reg_exp = "|".join(expresion)
    compilador= re.compile(reg_exp, re.U)
    return [i.group(0)  for i in re.finditer(compilador, s)]


def load_lexicon(path):
    polarities= {}
    with open(path, 'r') as f:
        line = f.readline()
        while line:
          if not line.startswith('#') and line!="\n":
              terms = line.split()
              term = terms[0]
              if terms[-1] == 'positive':
                  polarities[term] = "POS"
              elif terms[-1]=='negative':
                  polarities[term] = "NEG"
              else:
                  polarities[term] = "NEU"
          line = f.readline()
    return polarities


def get_polarities(lexicon, data):
  result=[]
  for line in data:
    positive = 0
    negative = 0
    for word in line.split():
      polarity = lexicon.get(word, "NEU")
      if polarity == "POS":
        positive +=1
      elif polarity == "NEG":
        negative +=1
    result.append([positive, negative])
  return result


# Data preprocessing
train_x, train_y = preprocess_xml(train_path)
dev_x, dev_y = preprocess_xml(dev_path)
test_x, test_y = preprocess_xml(test_path, is_test=True)

# Vectorize the data
vectorizer = CountVectorizer(tokenizer=mi_tokenizador)
train_vectors2 = vectorizer.fit_transform(train_x)
dev_vectors2 = vectorizer.transform(dev_x)

# External resources
lexicon = load_lexicon('ElhPolar_esV1.lex')

# Get polarities of train data and append them to the train vectors
train_polaridades = get_polarities(lexicon, train_x)
train_vectors2=scipy.sparse.hstack([train_vectors2, train_polaridades])

# Train classifier
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

classifier_liblinear2 = make_pipeline(StandardScaler(with_mean=False), SGDClassifier(max_iter=1000, tol=1e-3))
classifier_liblinear2.fit(train_vectors2, train_y)

# Get polarities of the dev data and append them to the dev vectors
dev_polaridades = get_polarities(lexicon, dev_x)
dev_vectors2=scipy.sparse.hstack([dev_vectors2, dev_polaridades])

# Eval classifier with dev
prediction_liblinear = classifier_liblinear2.predict(dev_vectors2)
print(classification_report(dev_y, prediction_liblinear))

              precision    recall  f1-score   support

           N       0.57      0.63      0.59       219
         NEU       0.21      0.17      0.19        69
        NONE       0.23      0.21      0.22        62
           P       0.51      0.50      0.51       156

    accuracy                           0.47       506
   macro avg       0.38      0.38      0.38       506
weighted avg       0.46      0.47      0.47       506



# Modelo 4: combinación por votación del modelo 1, 2 y 3

In [16]:
m1_ypred = classifier_liblinear.predict(dev_vectors)
m2_ypred = model.predict(dev_dataset)
target_names=["N","P","NEU","NONE"]
m2_ypred = [target_names[np.argmax(i)] for i in y_hat]
m3_ypred = classifier_liblinear2.predict(dev_vectors2)

y_pred = []
for i in range(len(m1_ypred)):
  votes = np.array([m1_ypred[i], m2_ypred[i], m3_ypred[i]])
  values, counts = np.unique(votes, return_counts=True)
  idx = np.argmax(counts)
  y_pred.append(values[idx])


print(classification_report(dev_y, y_pred))

              precision    recall  f1-score   support

           N       0.51      0.90      0.65       219
         NEU       0.27      0.04      0.08        69
        NONE       0.46      0.10      0.16        62
           P       0.65      0.39      0.49       156

    accuracy                           0.53       506
   macro avg       0.47      0.36      0.34       506
weighted avg       0.51      0.53      0.46       506

