In [1]:
# Unpack the training and test archives. Both stdout and stderr are redirected
# to /dev/null, so a failed extraction produces no visible message here.
!unzip /content/pan22-author-profiling-training-2022-03-29.zip > /dev/null 2>&1
!unzip /content/pan22-author-profiling-test-2022-04-22-without_truth.zip > /dev/null 2>&1

In [2]:
import os
import re
import pickle
import numpy as np
from tqdm import tqdm

import xml.etree.ElementTree as ET

import tensorflow as tf

from sklearn.metrics import classification_report

In [3]:
def tokenize_phrase(s, as_list=True):
  """Normalize a tweet and split it into whitespace-delimited tokens.

  Every token is lower-cased, after which the placeholder markers are
  rewritten: ``#user#`` becomes ``@USER``, ``#url#`` becomes ``HTTPURL`` and
  ``#hashtag#`` is removed.

  Args:
    s: Raw tweet text. ``None`` (for example the ``.text`` of an empty XML
      element) or an empty string is treated as a tweet with no tokens
      instead of raising ``AttributeError``.
    as_list: When true (the default) return the list of tokens; otherwise
      return the tokens joined by single spaces.

  Returns:
    The list of normalized tokens, or one space-joined string when
    ``as_list`` is false. Tokens that end up empty once the hashtag marker
    has been stripped are dropped, so the list never contains ``''`` and the
    joined string never contains doubled spaces.
  """
  if not s:
    return [] if as_list else ""

  tokens = []
  for raw_token in s.split():
    token = raw_token.lower()
    # The markers are plain literals, so str.replace is equivalent to the
    # regex substitution and cannot be tripped up by special characters.
    token = token.replace('#user#', "@USER")
    token = token.replace('#url#', "HTTPURL")
    token = token.replace('#hashtag#', "")

    # A token that consisted only of the hashtag marker is now empty.
    if token:
      tokens.append(token)

  if not as_list:
    return " ".join(tokens)
  return tokens

In [4]:
# Directories that are scanned below for per-user ``.xml`` files. Both point at
# the ``en`` subfolder of the unzipped archives (presumably the English subset
# -- confirm against the dataset documentation).
train_directory = '/content/pan22-author-profiling-training-2022-03-29/en/'
test_directory = '/content/pan22-author-profiling-test-2022-04-22-without_truth/en/'

def process_truth(path):
  """Read a ground-truth file into a ``{user_id: label}`` dictionary.

  Every non-blank line is expected to have the form ``<user_id>:::<label>``.
  Blank lines are skipped and surrounding whitespace (including a trailing
  ``\\r`` from Windows line endings) is removed from both fields.

  Args:
    path: Location of the truth file.

  Returns:
    dict mapping the first ``:::``-separated field of each line to the second.

  Raises:
    ValueError: If a non-blank line contains no ``:::`` separator.
  """
  ground_truth = dict()
  with open(path, 'r', encoding='utf-8') as f:
    for line_number, line in enumerate(f, start=1):
      line = line.strip()
      if not line:
        # Tolerate empty lines, e.g. a trailing newline at the end of the file.
        continue
      elems = line.split(":::")
      if len(elems) < 2:
        raise ValueError(
            f"{path}:{line_number}: expected '<user_id>:::<label>', got {line!r}")
      user_id = elems[0].strip()
      user_class = elems[1].strip()
      ground_truth[user_id] = user_class
  return ground_truth

def get_data_from_xml_file(path):
  """Extract the user id and the tweet texts from one user's XML file.

  The user id is the file name up to its first ``.``. The tweets are the text
  contents of every child of the root element's first child.

  Args:
    path: Path to the user's XML file.

  Returns:
    A ``(user_id, user_tweets)`` tuple where ``user_tweets`` is a list of
    strings in document order. An element without text yields ``""`` rather
    than ``None`` so that callers can always treat entries as strings.
  """
  tree = ET.parse(path)
  # The first child of the root is the container whose children are the tweets.
  documents = tree.getroot()[0]

  # os.path.basename copes with the platform's path separator, unlike a
  # hard-coded split on "/".
  user_id = os.path.basename(path).split(".")[0]
  user_tweets = [(tweet_xml.text or "") for tweet_xml in documents]

  return user_id, user_tweets


def _load_users_tweets(directory):
  """Return ``{user_id: [normalized tweet, ...]}`` for every ``.xml`` file in ``directory``.

  Files are visited in sorted name order so that the resulting dictionary (and
  everything derived from its key order) is the same on every run.
  """
  users_tweets = dict()
  for filename in sorted(os.listdir(directory)):
    f = os.path.join(directory, filename)
    if os.path.isfile(f) and filename.endswith('.xml'):
      user_id, user_tweets = get_data_from_xml_file(f)
      users_tweets[user_id] = [
          tokenize_phrase(tweet, as_list=False) for tweet in user_tweets
      ]
  return users_tweets


train_users_tweets = _load_users_tweets(train_directory)

# As before, any regular non-XML file in the training directory is parsed as
# the truth file. Starting from None guarantees that a missing file is
# reported here instead of silently reusing a value left over in the kernel.
ground_truth = None
for filename in sorted(os.listdir(train_directory)):
  f = os.path.join(train_directory, filename)
  if os.path.isfile(f) and not filename.endswith('.xml'):
    ground_truth = process_truth(f)

if ground_truth is None:
  raise FileNotFoundError(f"No truth file found in {train_directory}")

# Every training user needs a label and vice versa; comparing the key sets
# (not just the sizes) catches mismatched ids before training starts.
assert set(train_users_tweets) == set(ground_truth), \
    "training users and ground-truth labels do not cover the same user ids"

n_train_users = len(train_users_tweets)


test_users_tweets = _load_users_tweets(test_directory)

n_test_users = len(test_users_tweets)

In [5]:
# Show how many users were loaded for the training and the test set.
print(n_train_users, n_test_users)

420 180


In [6]:
# Precomputed per-user embeddings; the next cell treats each dictionary value
# as a sequence of torch tensors (one per tweet, presumably -- the code that
# produced these pickles is not part of this notebook).
# NOTE(security): pickle.load can execute arbitrary code, so only load files
# that were produced by a trusted source.
with open("/content/TweetBERT_users_dict.p", "rb") as train_pickle:
  bert_train_user_vectors = pickle.load(train_pickle)

with open("/content/TweetBERT_test_users_dict.p", "rb") as test_pickle:
  bert_test_user_vectors = pickle.load(test_pickle)

In [7]:
import torch

def _mean_user_embeddings(user_vectors):
  """Collapse each user's embedding tensors into a single mean vector.

  Args:
    user_vectors: Mapping from user id to a sequence of equally shaped torch
      tensors (assumed to be of shape ``(1, dim)`` each -- TODO confirm).

  Returns:
    dict mapping each user id (inserted in sorted order) to the first row of
    the element-wise mean over that user's tensors, as a plain Python list.
  """
  mean_embeddings = {}
  for user, vectors in sorted(user_vectors.items()):
    stacked = torch.stack(vectors)
    mean = torch.mean(stacked, dim=0)
    # Index 0 drops the leading axis that is left after averaging.
    mean_embeddings[user] = mean.tolist()[0]
  return mean_embeddings


train_user_mean_embeddings = _mean_user_embeddings(bert_train_user_vectors)
test_user_mean_embeddings = _mean_user_embeddings(bert_test_user_vectors)

In [8]:
import keras
from keras.utils import np_utils
from keras.models import Sequential, Model
from keras.layers import Dense, Input
from keras.callbacks import Callback, ReduceLROnPlateau, ModelCheckpoint, LearningRateScheduler
from tensorflow.keras.optimizers import Adam, SGD, Adadelta
from keras.preprocessing.image import ImageDataGenerator

DEBUG = True

### MODEL DEFINITION
# Feed-forward classifier: one 768-dimensional vector per user goes through
# three ReLU layers and ends in a softmax over the two classes.
num_classes = 2
input_layer = Input(shape=(768,))

layer_1 = Dense(512, activation='relu')(input_layer)
layer_2 = Dense(256, activation='relu')(layer_1)
layer_3 = Dense(64, activation='relu')(layer_2)

output_layer = Dense(num_classes, activation='softmax')(layer_3)

### TRAINING DATA
train_user_ids = list(train_users_tweets.keys())

CLASS_LABEL_TO_INT = { "NI": 0, "I": 1 }


train_data = list()
train_labels = list()

# Features and labels are collected in the same user order so that row i of
# train_data belongs to entry i of train_labels.
for k in train_user_ids:
    train_data.append( train_user_mean_embeddings[k] )
    train_labels.append( CLASS_LABEL_TO_INT[ground_truth[k]] )

# Convert straight to float32; the previous detour through an object-dtype
# array added nothing.
train_data = np.asarray(train_data, dtype=np.float32)

# One-hot encode the integer labels through the public tf.keras helper rather
# than the private keras.utils.np_utils module.
train_labels = tf.keras.utils.to_categorical(train_labels, num_classes)
train_labels = np.asarray(train_labels).astype(np.float32)


### TRAINING SETUP
model = Model(inputs=input_layer, outputs=output_layer)
epochs = 100
batch_size = 32

bestModelPath = './irony_profiling_model.hdf5'

# No validation data is passed to fit(), so both callbacks watch the training
# loss. The checkpoint only writes the best weights to disk; the in-memory
# `model` keeps the weights of the last epoch.
save_best_model = ModelCheckpoint(
    filepath=bestModelPath,
    monitor='loss',
    verbose=0,
    save_best_only=True
)

# Lower the learning rate when the training loss has not improved for 10
# epochs, but never below 1e-9.
dynamic_lr = ReduceLROnPlateau(monitor='loss', patience=10, min_lr=1e-9)

adam = Adam(learning_rate=1e-3)

model.compile(
    loss='categorical_crossentropy',
    optimizer=adam,
    metrics=['accuracy']
)

print("Start training")
## TRAINING
# For in-memory arrays Keras derives the number of steps per epoch from
# batch_size. Passing batch_size explicitly replaces the former
# steps_per_epoch=len(train_data) / batch_size, which was a non-integer float
# and only matched because fit() happens to default to a batch size of 32.
history = model.fit(
      train_data,
      train_labels,
      batch_size=batch_size,
      epochs=epochs,
      callbacks=[dynamic_lr, save_best_model],
      verbose=0
)

Start training


In [9]:
# Split the per-user mean embeddings into two parallel sequences: the user ids
# and, in the same order, their feature vectors.
test_users = list(test_user_mean_embeddings.keys())
test_data = [test_user_mean_embeddings[user_id] for user_id in test_users]

# Build the float32 matrix expected by the network.
test_data = np.asarray(np.array(test_data, dtype=object)).astype(np.float32)

# Row i of bert_preds holds the class probabilities for test_users[i].
bert_preds = model.predict(test_data)

In [11]:
INT_TO_CLASS_LABEL = ["NI", "I"]

# Human-readable dump of both class probabilities for every test user. The
# context manager closes the file even if a write fails.
with open("test_truth_conf.txt", "w") as out_file:
  for index in range(len(test_users)):
    out_file.write(f"{test_users[index]}\t->\tNI: {bert_preds[index][0]}\t|\tI: {bert_preds[index][1]}\n")

# test_counts[c] is the number of users predicted as class index c.
test_counts = [0, 0]

# One "<user_id>:::<label>" line per test user.
with open("test_truth.txt", "w") as out_file:
  for index in range(len(test_users)):
    # Compute the winning class once and reuse it for the counter and the label.
    predicted_class = np.argmax(bert_preds[index])
    test_counts[predicted_class] += 1
    out_file.write(f"{test_users[index]}:::{INT_TO_CLASS_LABEL[predicted_class]}\n")

print(f"Hemos predecido un total de {test_counts[0]} usuarios no irónicos y {test_counts[1]} irónicos.")

Hemos predecido un total de 88 usuarios no irónicos y 92 irónicos.


In [19]:
import os
# xml.etree.cElementTree was removed in Python 3.9; ElementTree is the
# supported module and keeps the ET alias consistent with the import at the
# top of the notebook.
import xml.etree.ElementTree as ET

INT_TO_CLASS_LABEL = ["NI", "I"]

# Map every test user to the label with the highest predicted probability.
test_users_predicts = dict()
for index in range(len(test_users)):
  test_users_predicts[test_users[index]] = INT_TO_CLASS_LABEL[np.argmax(bert_preds[index])]

test_result_directory = "./truth_final_bert"
# exist_ok avoids the separate existence check and the race it implies.
os.makedirs(test_result_directory, exist_ok=True)

# One "<user_id>.xml" file per user holding a single self-closing <author>
# element with the id, the language and the predicted type.
for test_user, classif_class in test_users_predicts.items():
  with open(os.path.join(test_result_directory, f"{test_user}.xml"), "w") as out_user:
    out_user.write(f'<author id="{test_user}"\nlang="en"\ntype="{classif_class}"\n/>')

In [20]:
# Count the entries in the output directory.
!ls ./truth_final_bert | wc -l

180


In [21]:
# Recursively pack the output directory into truth_final_bert.zip.
!zip -r truth_final_bert.zip ./truth_final_bert

  adding: truth_final_bert/ (stored 0%)
  adding: truth_final_bert/676ac6f0f8dc64239691d8052409a54c.xml (stored 0%)
  adding: truth_final_bert/edaecf3e231cbefe0a7af13a6b92a21f.xml (stored 0%)
  adding: truth_final_bert/ffc409cdad92bf2d15f4a9e18b250b31.xml (stored 0%)
  adding: truth_final_bert/bfa500f8b9ccbe67ad0665ce4d542684.xml (stored 0%)
  adding: truth_final_bert/ba5c2b0d20b249842b210ccdb6b4c3e4.xml (stored 0%)
  adding: truth_final_bert/8158fecf0e49ac5f3cc14b821340a9e7.xml (stored 0%)
  adding: truth_final_bert/3f016b5c991cdf30d875bdd7169cf345.xml (stored 0%)
  adding: truth_final_bert/d6006abecde419c59ba33532dd533e2b.xml (stored 0%)
  adding: truth_final_bert/740c19c02e8aefb4ee35febedd3777a2.xml (stored 0%)
  adding: truth_final_bert/c554db6744829b2a0a04d8b774a5edcb.xml (stored 0%)
  adding: truth_final_bert/a59360a9459f133a101abd69393b29eb.xml (stored 0%)
  adding: truth_final_bert/e997fcccff44cb6744640e8b9d187a15.xml (stored 0%)
  adding: truth_final_bert/2550b4037ef6eef0b337d