In [1]:
# Extract the training and test archives into the current working directory.
# stdout and stderr are redirected to /dev/null, so a failed extraction is silent here.
!unzip /content/pan22-author-profiling-training-2022-03-29.zip > /dev/null 2>&1
!unzip /content/pan22-author-profiling-test-2022-04-22-without_truth.zip > /dev/null 2>&1

In [2]:
import os
import re
import pickle
import numpy as np
from tqdm import tqdm

import xml.etree.ElementTree as ET

import tensorflow as tf

from sklearn.metrics import classification_report

In [3]:
def tokenize_phrase(s, as_list=True):
  """Normalise a tweet token by token.

  The text is split on whitespace and every token is lower-cased; afterwards
  the placeholders are rewritten: ``#user#`` becomes ``@USER``, ``#url#``
  becomes ``HTTPURL`` and ``#hashtag#`` is removed. A token that consisted
  only of ``#hashtag#`` is kept as an empty string.

  Args:
    s: Text to normalise.
    as_list: When true (default) return the list of tokens, otherwise return
      the tokens joined with single spaces.

  Returns:
    list of str, or a single str when ``as_list`` is false.
  """
  # Applied in this order to each lower-cased token.
  placeholder_rules = (
    ('#user#', "@USER"),
    ('#url#', "HTTPURL"),
    ('#hashtag#', ""),
  )

  tokens = []
  for raw_token in s.split():
    token = raw_token.lower()
    for pattern, replacement in placeholder_rules:
      token = re.sub( pattern, replacement, token )
    tokens.append(token)

  return tokens if as_list else " ".join(tokens)

In [4]:
# "en" sub-directories of the extracted training and test archives; the loading
# code below lists these two directories to find the per-user files.
train_directory = '/content/pan22-author-profiling-training-2022-03-29/en/'
test_directory = '/content/pan22-author-profiling-test-2022-04-22-without_truth/en/'

def process_truth(path):
  """Parse a truth file into a ``{user_id: class_label}`` dictionary.

  Every non-blank line is expected to look like ``<user_id>:::<class_label>``.
  Blank lines are skipped and surrounding whitespace (including the line
  ending) is stripped from both fields.

  Args:
    path: Location of the truth file.

  Returns:
    dict mapping each user id found in the file to its class label string.

  Raises:
    ValueError: If a non-blank line does not contain the ``:::`` separator.
  """
  ground_truth = dict()
  with open(path, 'r', encoding='utf-8') as f:
    for line_number, line in enumerate(f, start=1):
      line = line.strip()
      if not line:
        # A trailing empty line must not abort the whole parse.
        continue
      elems = line.split(":::")
      if len(elems) < 2:
        raise ValueError(
          f"{path}: line {line_number} is not of the form '<user_id>:::<class>': {line!r}"
        )
      ground_truth[elems[0].strip()] = elems[1].strip()
  return ground_truth

def get_data_from_xml_file(path):
  """Read one user's XML file and return the user id with the tweet texts.

  The user id is the file name up to its first dot. The tweets are the text
  of every child element of the XML root's first child.

  Args:
    path: Path of the XML file.

  Returns:
    Tuple ``(user_id, user_tweets)`` where ``user_tweets`` is a list of str in
    document order. Elements without text contribute an empty string.
  """
  root = ET.parse(path).getroot()[0]

  # os.path.basename honours the platform's path separator, so paths built
  # with os.path.join are handled even where that separator is not "/".
  user_id = os.path.basename(path).split(".")[0]

  # ElementTree reports the text of an empty element as None; normalise it to
  # "" so callers can always treat every tweet as a string.
  user_tweets = [tweet_xml.text or "" for tweet_xml in root]

  return user_id, user_tweets


def load_users_tweets(directory):
    """Load and normalise the tweets of every user file in ``directory``.

    Each regular file ending in ``.xml`` is read with
    ``get_data_from_xml_file`` and every tweet is normalised with
    ``tokenize_phrase(tweet, as_list=False)``.

    Args:
      directory: Directory to scan (not recursive).

    Returns:
      Tuple ``(users_tweets, other_files)``: ``users_tweets`` maps user id to
      the list of normalised tweet strings; ``other_files`` holds the paths of
      the regular files that do not end in ``.xml``, in sorted name order.
    """
    users_tweets = dict()
    other_files = list()
    # os.listdir returns entries in arbitrary order; sorting makes the dict
    # order (and everything later derived from it) identical on every run.
    for filename in sorted(os.listdir(directory)):
        f = os.path.join(directory, filename)
        if not os.path.isfile(f):
            continue
        if filename.endswith('.xml'):
            user_id, user_tweets = get_data_from_xml_file(f)
            users_tweets[user_id] = [
                tokenize_phrase(tweet, as_list=False) for tweet in user_tweets
            ]
        else:
            other_files.append(f)
    return users_tweets, other_files


train_users_tweets, train_other_files = load_users_tweets(train_directory)

# Every non-XML file in the training directory is parsed as a truth file; if
# there is more than one, the last one in sorted order wins.
# Starting from an empty dict means a missing truth file trips the assertion
# below instead of raising a NameError.
ground_truth = dict()
for truth_path in train_other_files:
    ground_truth = process_truth(truth_path)

# Compare the ids themselves, not only the counts: every loaded user needs a
# label and every label needs a loaded user.
assert set(train_users_tweets) == set(ground_truth), (
    f"user files ({len(train_users_tweets)}) and truth entries "
    f"({len(ground_truth)}) do not describe the same users"
)

n_train_users = len(train_users_tweets)


# The test directory is loaded the same way; non-XML files there are ignored.
test_users_tweets = load_users_tweets(test_directory)[0]

n_test_users = len(test_users_tweets)

In [5]:
# Sanity check: number of users loaded from the training and the test directory.
print(n_train_users, n_test_users)

420 180


# BERT Model

In [6]:
# Pre-computed per-user vectors, keyed by user; each value is later stacked
# with torch.stack. (presumably one tensor per tweet; confirm against the
# code that produced these pickles).
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load these files if they come from a trusted source.
# The `with` blocks close each file handle as soon as loading finishes.
with open("/content/TweetBERT_users_dict.p", "rb") as pickle_file:
    bert_train_user_vectors = pickle.load(pickle_file)

with open("/content/TweetBERT_test_users_dict.p", "rb") as pickle_file:
    bert_test_user_vectors = pickle.load(pickle_file)

In [7]:
import torch

def _mean_embeddings(user_vectors):
    """Average each user's vectors into one embedding.

    For every user (visited in sorted key order) the vectors are stacked along
    a new leading dimension and averaged over it; the first row of the
    resulting tensor is stored as a plain Python list.

    Args:
      user_vectors: Mapping from user id to a list of tensors.

    Returns:
      dict mapping user id to the averaged embedding as a list.
    """
    return {
        user: torch.mean(torch.stack(vectors), dim=0).tolist()[0]
        for user, vectors in sorted(user_vectors.items())
    }


train_user_mean_embeddings = _mean_embeddings(bert_train_user_vectors)
test_user_mean_embeddings = _mean_embeddings(bert_test_user_vectors)

In [8]:
import keras
from keras.utils import np_utils
from keras.models import Sequential, Model
from keras.layers import Dense, Input
from keras.callbacks import Callback, ReduceLROnPlateau, ModelCheckpoint, LearningRateScheduler
from tensorflow.keras.optimizers import Adam, SGD, Adadelta
from keras.preprocessing.image import ImageDataGenerator

DEBUG = True

# Seed NumPy and TensorFlow before any layer is created so that weight
# initialisation and batch shuffling start from the same state on every run.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

### MODEL DEFINITION
# Feed-forward classifier on the mean user embedding:
# 768 inputs -> 512 -> 256 -> 64 (ReLU) -> softmax over `num_classes`.
num_classes = 2
input_layer = Input(shape=(768,))

layer_1 = Dense(512, activation='relu')(input_layer)
layer_2 = Dense(256, activation='relu')(layer_1)
layer_3 = Dense(64, activation='relu')(layer_2)

output_layer = Dense(num_classes, activation='softmax')(layer_3)

### TRAINING DATA
# Features and labels are gathered in the same user order so rows stay aligned.
train_user_ids = list(train_users_tweets.keys())

CLASS_LABEL_TO_INT = { "NI": 0, "I": 1 }

# Build the float32 matrix directly; no intermediate object array is needed.
train_data = np.asarray(
    [train_user_mean_embeddings[k] for k in train_user_ids],
    dtype=np.float32
)

# One-hot encode the integer labels to match the softmax output and the
# categorical cross-entropy loss.
train_labels = np.asarray(
    keras.utils.to_categorical(
        [CLASS_LABEL_TO_INT[ground_truth[k]] for k in train_user_ids],
        num_classes
    )
).astype(np.float32)


model = Model(inputs=input_layer, outputs=output_layer)
epochs = 100
batch_size = 32

bestModelPath = './irony_profiling_model.hdf5'

# Keep on disk the weights of the epoch with the lowest training loss.
# NOTE(review): the prediction cell further down calls `model.predict` on the
# in-memory model (last-epoch weights); this checkpoint is not reloaded in the
# visible notebook.
save_best_model = ModelCheckpoint(
    filepath=bestModelPath,
    monitor='loss',
    verbose=0,
    save_best_only=True
)

# Reduce the learning rate when the training loss has not improved for 10
# epochs, never going below 1e-9.
dynamic_lr = ReduceLROnPlateau(monitor='loss', patience=10, min_lr=1e-9)

adam = Adam(learning_rate=1e-3)

model.compile(
    loss='categorical_crossentropy',
    optimizer=adam,
    metrics=['accuracy']
)

print("Start training")
## TRAINING
# For in-memory arrays the batch size is what has to be given; Keras derives
# the number of steps per epoch from it. The previous
# `steps_per_epoch=len(train_data) / batch_size` passed a float (true
# division) where an integer step count is expected and left `batch_size`
# unused.
history = model.fit(
      train_data,
      train_labels,
      batch_size=batch_size,
      epochs=epochs,
      callbacks=[dynamic_lr, save_best_model],
      verbose=0
)

Start training


In [9]:
# Split the per-user mean embeddings into two parallel sequences: the user ids
# and the float32 feature matrix (row i belongs to test_users[i]).
test_users = list(test_user_mean_embeddings.keys())
test_data = np.array(
    list(test_user_mean_embeddings.values()), dtype=object
).astype(np.float32)

# One row of class probabilities per test user; the predicted class is the
# index of the largest probability in the row.
bert_preds = model.predict(test_data)
bert_int_preds = [
  np.argmax(bert_preds[row]) for row in range(len(test_users))
]


# Vectorizer Models

In [11]:
# Turn every user into a single document (their tweets joined by newlines).

# Training users follow the dictionary's own key order; the class labels are
# collected in that same order so documents and labels stay aligned.
train_data = ["\n".join(tweets) for tweets in train_users_tweets.values()]
train_labels = [ground_truth[user] for user in train_users_tweets]

# Test users are taken in sorted id order; test_users[i] owns test_data[i].
test_users = sorted(test_users_tweets.keys())
test_data = ["\n".join(test_users_tweets[user]) for user in test_users]

In [12]:
import re

def mi_tokenizador(s, as_list=True):
  """Tokenise a text for the bag-of-words vectorizers.

  The text is split on whitespace and every token is lower-cased. The
  placeholders ``#user#``, ``#url#`` and ``#hashtag#`` are then removed (a
  token made only of a placeholder is kept as an empty string), numeric dates
  such as ``12/05/2020`` are replaced by ``date`` and every remaining number
  by ``num``.

  Args:
    s: Text to tokenise.
    as_list: When true (default) return the list of tokens, otherwise return
      the tokens joined with single spaces.

  Returns:
    list of str, or a single str when ``as_list`` is false.
  """
  xx = []
  for t in s.split():
    t = t.lower()

    # Strip the anonymisation placeholders.
    t = re.sub( '#user#', "", t )
    t = re.sub( '#url#', "", t )
    t = re.sub( '#hashtag#', "", t )

    # Dates go first: once rewritten to "date" they contain no digits, so the
    # number rule below leaves them alone.
    t = re.sub( r"[0-9]{1,2}[-/]{1}[0-9]{1,2}[-/]{1}[0-9]{2,4}", "date", t ) # numeric dates
    # Raw string: "\-" inside a normal string literal is an invalid escape
    # sequence and makes Python emit a warning; the regex itself is unchanged.
    t = re.sub( r"[0-9]+[.,/\-:]{1}[0-9]+|[0-9]+" , "num", t ) # all numbers
    xx.append(t)

  if not as_list:
    return " ".join(xx)
  return xx

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

from sklearn import svm

# Bag-of-words counts built with the custom tokenizer.
# token_pattern=None states explicitly that the default token pattern is not
# used, which avoids scikit-learn's "The parameter 'token_pattern' will not be
# used" warning.
vectorizer = CountVectorizer(tokenizer=mi_tokenizador, token_pattern=None)

# The vocabulary is learned on the training documents only and then applied
# unchanged to the test documents.
train_vectors = vectorizer.fit_transform(train_data)
test_vectors = vectorizer.transform(test_data)

classifier_liblinear = svm.LinearSVC(C=100000, max_iter=1000000)

classifier_liblinear.fit(train_vectors, train_labels)
# Predictions are the class label strings seen in train_labels.
count_preds = classifier_liblinear.predict(test_vectors)

In [14]:
from sklearn.feature_extraction.text import HashingVectorizer

from sklearn import svm

# Hashed bag-of-words features built with the custom tokenizer.
# token_pattern=None states explicitly that the default token pattern is not
# used, which avoids scikit-learn's "The parameter 'token_pattern' will not be
# used" warning.
vectorizer = HashingVectorizer(tokenizer=mi_tokenizador, token_pattern=None)

train_vectors = vectorizer.fit_transform(train_data)
test_vectors = vectorizer.transform(test_data)

classifier_liblinear = svm.LinearSVC(C=1000, max_iter=1000000)

classifier_liblinear.fit(train_vectors, train_labels)
# Predictions are the class label strings seen in train_labels.
hashing_preds = classifier_liblinear.predict(test_vectors)

  "The parameter 'token_pattern' will not be used"


In [15]:
# Raw outputs of the three models: the softmax probabilities from the neural
# network and the predicted label arrays from the two SVMs.
print(bert_preds, count_preds, hashing_preds)

[[9.88271534e-01 1.17284954e-02]
 [9.85161185e-01 1.48387924e-02]
 [9.89165366e-01 1.08345849e-02]
 [9.56241131e-01 4.37588617e-02]
 [9.45613980e-01 5.43860011e-02]
 [4.72614765e-02 9.52738464e-01]
 [9.98275518e-01 1.72447297e-03]
 [1.80474634e-03 9.98195231e-01]
 [2.63419356e-02 9.73658144e-01]
 [1.09286755e-02 9.89071369e-01]
 [9.34101105e-01 6.58988804e-02]
 [8.52912385e-03 9.91470873e-01]
 [9.82231200e-01 1.77687611e-02]
 [1.20689208e-03 9.98793125e-01]
 [5.49547095e-03 9.94504511e-01]
 [9.89153326e-01 1.08466009e-02]
 [9.96900916e-01 3.09905317e-03]
 [9.76565421e-01 2.34346185e-02]
 [1.58158988e-02 9.84184146e-01]
 [9.17168558e-02 9.08283174e-01]
 [7.25807130e-01 2.74192870e-01]
 [9.95548844e-01 4.45115939e-03]
 [9.86495912e-01 1.35040488e-02]
 [6.04401249e-03 9.93956029e-01]
 [9.97602880e-01 2.39718589e-03]
 [1.51755027e-02 9.84824479e-01]
 [9.81608570e-01 1.83914732e-02]
 [2.93688639e-03 9.97063100e-01]
 [5.11316862e-03 9.94886816e-01]
 [9.85731423e-01 1.42685818e-02]
 [3.463950

In [17]:
# Label encoding shared with the neural-network branch: "NI" -> 0, "I" -> 1.
CLASS_LABEL_TO_INT = {"NI": 0, "I": 1}

# Convert the SVM label predictions to integers, one entry per test user, so
# they can be added to the integer predictions of the neural network.
count_int_preds = [
  CLASS_LABEL_TO_INT[count_preds[position]]
  for position in range(len(test_users))
]
hashing_int_preds = [
  CLASS_LABEL_TO_INT[hashing_preds[position]]
  for position in range(len(test_users))
]

print(bert_int_preds, count_int_preds, hashing_int_preds)

[0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1] [0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,

In [20]:
# Element-wise sum of the three 0/1 predictions: each entry is the number of
# models (0 to 3) that predicted class 1 for that test user.
final_preds = np.asarray(bert_int_preds) + count_int_preds + hashing_int_preds

print(final_preds)

[0 0 0 0 0 3 0 3 3 3 0 3 0 3 3 0 0 0 3 3 2 0 0 3 2 3 0 3 3 0 3 0 2 0 0 0 0
 3 1 0 3 3 3 0 0 3 2 0 3 3 3 3 3 3 3 0 3 3 0 3 0 3 0 0 0 3 3 3 3 0 0 3 2 3
 3 0 3 0 3 0 0 3 0 3 3 3 0 0 3 3 3 3 3 3 0 0 0 0 0 0 0 0 3 0 0 3 3 0 1 0 0
 3 3 3 3 0 3 0 3 0 3 0 3 3 3 0 0 0 3 0 3 3 0 0 0 0 3 3 3 0 3 0 3 2 0 0 3 3
 3 3 3 1 0 2 0 0 3 0 0 3 3 3 3 0 0 0 2 0 0 3 3 0 0 0 3 3 2 0 3 3]


In [23]:
import os
import xml.etree.cElementTree as ET

INT_TO_CLASS_LABEL = ["NI", "I"]

# A silent length mismatch would attach votes to the wrong users.
assert len(final_preds) == len(test_users), "one vote total per test user expected"

# Majority vote over the three models: at least 2 votes for class 1 -> "I",
# otherwise "NI".
test_users_predicts = dict()
for test_user, votes in zip(test_users, final_preds):
  test_users_predicts[test_user] = INT_TO_CLASS_LABEL[int(votes >= 2)]

test_result_directory = "./truth_final_voting"
# Creates the directory when missing and is a no-op when it already exists.
os.makedirs(test_result_directory, exist_ok=True)

# One XML file per test user, named after the user id.
for test_user, classif_class in test_users_predicts.items():
  out_path = os.path.join(test_result_directory, f"{test_user}.xml")
  # `with` guarantees the file is flushed and closed even if the write fails.
  with open(out_path, "w", encoding="utf-8") as out_user:
    out_user.write(f'<author id="{test_user}"\nlang="en"\ntype="{classif_class}"\n/>')

In [22]:
# Count the entries in the result directory; the export cell writes one file per test user.
!ls ./truth_final_voting | wc -l

180


In [24]:
# Pack the result directory recursively into truth_final_voting.zip.
!zip -r truth_final_voting.zip ./truth_final_voting

  adding: truth_final_voting/ (stored 0%)
  adding: truth_final_voting/676ac6f0f8dc64239691d8052409a54c.xml (stored 0%)
  adding: truth_final_voting/edaecf3e231cbefe0a7af13a6b92a21f.xml (stored 0%)
  adding: truth_final_voting/ffc409cdad92bf2d15f4a9e18b250b31.xml (stored 0%)
  adding: truth_final_voting/bfa500f8b9ccbe67ad0665ce4d542684.xml (stored 0%)
  adding: truth_final_voting/ba5c2b0d20b249842b210ccdb6b4c3e4.xml (stored 0%)
  adding: truth_final_voting/8158fecf0e49ac5f3cc14b821340a9e7.xml (stored 0%)
  adding: truth_final_voting/3f016b5c991cdf30d875bdd7169cf345.xml (stored 0%)
  adding: truth_final_voting/d6006abecde419c59ba33532dd533e2b.xml (stored 0%)
  adding: truth_final_voting/740c19c02e8aefb4ee35febedd3777a2.xml (stored 0%)
  adding: truth_final_voting/c554db6744829b2a0a04d8b774a5edcb.xml (stored 0%)
  adding: truth_final_voting/a59360a9459f133a101abd69393b29eb.xml (stored 0%)
  adding: truth_final_voting/e997fcccff44cb6744640e8b9d187a15.xml (stored 0%)
  adding: truth_final_