In [1]:
# Extract the training and test archives. Both stdout and stderr are sent to
# /dev/null, so a failed unzip (e.g. missing archive) produces no message here.
!unzip /content/pan22-author-profiling-training-2022-03-29.zip > /dev/null 2>&1
!unzip /content/pan22-author-profiling-test-2022-04-22-without_truth.zip > /dev/null 2>&1

In [2]:
import os
import re
import pickle
import numpy as np
from tqdm import tqdm

import xml.etree.ElementTree as ET

from sklearn.metrics import classification_report

In [3]:
def tokenize_phrase(s, as_list=True):
  xx=[]
  x=s.split()
  for t in x:
    t = t.lower()
    t = re.sub( '#user#', "@USER", t )
    t = re.sub( '#url#', "HTTPURL", t )
    t = re.sub( '#hashtag#', "", t )

    xx.append(t)

  if not as_list:
    return " ".join(xx)
  return (xx)

In [4]:
# Directories (the 'en' subfolder of each extracted archive) that are scanned
# below for per-user .xml files.
train_directory = '/content/pan22-author-profiling-training-2022-03-29/en/'
test_directory = '/content/pan22-author-profiling-test-2022-04-22-without_truth/en/'

def process_truth(path):
  """Parse a ground-truth file into a ``{user_id: user_class}`` dict.

  Every non-blank line is split on ':::'; the first field is used as the user
  id and the second (with its newline removed) as the class label. Blank lines
  (for example a trailing empty line) are skipped instead of raising.

  Args:
    path: Path of the truth file to read.

  Returns:
    dict mapping each user id to its class label.

  Raises:
    IndexError: If a non-blank line does not contain the ':::' separator.
  """
  ground_truth = dict()
  # Explicit encoding so the result does not depend on the platform default.
  with open(path, 'r', encoding='utf-8') as f:
    for line in f:
      if not line.strip():
        continue  # tolerate empty lines
      elems = line.split(":::")
      user_id = elems[0]
      user_class = elems[1].replace("\n", "")
      ground_truth[user_id] = user_class
  return ground_truth

def get_data_from_xml_file(path):
  """Read one user's XML file and return its id and the text of its entries.

  The user id is the file's base name up to the first '.'. The entries are the
  child elements of the first child of the XML root; the text of each one is
  collected in document order.

  Args:
    path: Path of the XML file.

  Returns:
    Tuple ``(user_id, user_tweets)`` where ``user_tweets`` is a list of
    strings. An element with no text contributes an empty string rather than
    ``None``, so callers can always treat the entries as text.
  """
  tree = ET.parse(path)
  root = tree.getroot()[0]

  # basename() instead of splitting on "/" so other path separators work too.
  user_id = os.path.basename(path).split(".")[0]

  # Element.text is None for an empty element; normalise that to "".
  user_tweets = [tweet_xml.text or "" for tweet_xml in root]

  return user_id, user_tweets


def _load_users_tweets(directory):
  """Load and pre-process every per-user ``.xml`` file found in ``directory``.

  Args:
    directory: Directory to scan (not recursive).

  Returns:
    Tuple ``(users_tweets, other_files)``:
      * ``users_tweets`` maps each user id to the list of that user's entries,
        each already passed through ``tokenize_phrase(..., as_list=False)``.
      * ``other_files`` lists the paths of the regular files in ``directory``
        that do not end in '.xml'.
  """
  users_tweets = dict()
  other_files = list()
  # os.listdir() returns entries in arbitrary order; sort so that the dict
  # insertion order (and everything derived from it) is reproducible.
  for filename in sorted(os.listdir(directory)):
    f = os.path.join(directory, filename)
    if not os.path.isfile(f):
      continue
    if filename.endswith('.xml'):
      user_id, user_tweets = get_data_from_xml_file(f)
      users_tweets[user_id] = [
        tokenize_phrase(tweet, as_list=False) for tweet in user_tweets
      ]
    else:
      other_files.append(f)
  return users_tweets, other_files


train_users_tweets, train_other_files = _load_users_tweets(train_directory)

# Every non-XML file in the training directory is parsed as a truth file; if
# there are several, the last one (in sorted order) wins.
ground_truth = dict()
for truth_path in train_other_files:
  ground_truth = process_truth(truth_path)

assert len(train_users_tweets) == len(ground_truth), (
  f"{len(train_users_tweets)} training users but {len(ground_truth)} truth entries"
)

n_train_users = len(train_users_tweets)


# Non-XML files in the test directory are ignored.
test_users_tweets = _load_users_tweets(test_directory)[0]

n_test_users = len(test_users_tweets)

In [5]:
# Sanity check: number of users loaded from the training and test directories.
print(n_train_users, n_test_users)

420 180


In [13]:
# Flatten the data so that every user becomes a single string (that user's
# entries joined by newlines), with the class label kept in a parallel list
# for the training users.
#
# Users are visited in sorted id order for BOTH splits, so the row order of
# the training data no longer depends on directory-listing order.

train_users = sorted(train_users_tweets.keys())
train_data = ["\n".join(train_users_tweets[k]) for k in train_users]
train_labels = [ground_truth[k] for k in train_users]

test_users = sorted(test_users_tweets.keys())
test_data = ["\n".join(test_users_tweets[k]) for k in test_users]

In [14]:
import re

def mi_tokenizador(s, as_list=True):
  """Whitespace tokenizer that normalises placeholders, dates and numbers.

  Each whitespace-separated token is lower-cased, then:
    * '#user#', '#url#' and '#hashtag#' are removed (replaced by ''),
    * numeric dates such as 12/05/2020 or 1-2-99 are replaced by 'date',
    * any remaining number (optionally two digit runs joined by one of
      ``. , / - :``) is replaced by 'num'.
  Tokens that end up empty are kept, so there is one output token per input
  token.

  Args:
    s: Text to tokenize.
    as_list: When true (default) return the list of tokens; otherwise return
      the tokens joined by single spaces.

  Returns:
    A list of tokens, or a single space-joined string.
  """
  xx = []
  for t in s.split():
    t = t.lower()

    t = re.sub( '#user#', "", t )
    t = re.sub( '#url#', "", t )
    t = re.sub( '#hashtag#', "", t )

    # Raw strings: the second pattern contains '\-', which is an invalid escape
    # in a normal string literal (DeprecationWarning / SyntaxWarning). The
    # compiled regular expressions are unchanged.
    t = re.sub( r"[0-9]{1,2}[-/]{1}[0-9]{1,2}[-/]{1}[0-9]{2,4}", "date", t ) # numeric dates
    t = re.sub( r"[0-9]+[.,/\-:]{1}[0-9]+|[0-9]+" , "num", t ) # all numbers
    xx.append(t)

  if not as_list:
    return " ".join(xx)
  return xx

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

from sklearn import svm

# Fixed seed so that repeated runs train the same model.
RANDOM_SEED = 42

# Bag-of-words counts using the custom tokenizer. token_pattern=None states
# explicitly that the default token regex is not used, which avoids the
# scikit-learn warning emitted when both a tokenizer and a pattern are set.
vectorizer = CountVectorizer(tokenizer=mi_tokenizador, token_pattern=None)

# The vocabulary is learned from the training users only and then reused,
# unchanged, to vectorize the test users.
train_vectors = vectorizer.fit_transform(train_data)
test_vectors = vectorizer.transform(test_data)

# NOTE(review): C=100000 means almost no regularisation — confirm it is intended.
classifier_liblinear = svm.LinearSVC(C=100000, max_iter=1000000, random_state=RANDOM_SEED)

classifier_liblinear.fit(train_vectors, train_labels)
preds = classifier_liblinear.predict(test_vectors)

In [16]:
# Show the predicted class label for every test user (same order as test_users).
print(preds)

['NI' 'NI' 'NI' 'NI' 'NI' 'I' 'NI' 'I' 'I' 'I' 'NI' 'I' 'NI' 'I' 'I' 'NI'
 'NI' 'NI' 'I' 'I' 'I' 'NI' 'NI' 'I' 'I' 'I' 'NI' 'I' 'I' 'NI' 'I' 'NI'
 'I' 'NI' 'NI' 'NI' 'NI' 'I' 'NI' 'NI' 'I' 'I' 'I' 'NI' 'NI' 'I' 'I' 'NI'
 'I' 'I' 'I' 'I' 'I' 'I' 'I' 'NI' 'I' 'I' 'NI' 'I' 'NI' 'I' 'NI' 'NI' 'NI'
 'I' 'I' 'I' 'I' 'NI' 'NI' 'I' 'I' 'I' 'I' 'NI' 'I' 'NI' 'I' 'NI' 'NI' 'I'
 'NI' 'I' 'I' 'I' 'NI' 'NI' 'I' 'I' 'I' 'I' 'I' 'I' 'NI' 'NI' 'NI' 'NI'
 'NI' 'NI' 'NI' 'NI' 'I' 'NI' 'NI' 'I' 'I' 'NI' 'I' 'NI' 'NI' 'I' 'I' 'I'
 'I' 'NI' 'I' 'NI' 'I' 'NI' 'I' 'NI' 'I' 'I' 'I' 'NI' 'NI' 'NI' 'I' 'NI'
 'I' 'I' 'NI' 'NI' 'NI' 'NI' 'I' 'I' 'I' 'NI' 'I' 'NI' 'I' 'NI' 'NI' 'NI'
 'I' 'I' 'I' 'I' 'I' 'NI' 'NI' 'I' 'NI' 'NI' 'I' 'NI' 'NI' 'I' 'I' 'I' 'I'
 'NI' 'NI' 'NI' 'I' 'NI' 'NI' 'I' 'I' 'NI' 'NI' 'NI' 'I' 'I' 'I' 'NI' 'I'
 'I']


In [22]:
INT_TO_CLASS_LABEL = ["NI", "I"]
CLASS_LABEL_TO_INT = {"NI": 0, "I": 1}

# test_counts[i] is the number of test users predicted as INT_TO_CLASS_LABEL[i].
test_counts = [0, 0]

assert len(test_users) == len(preds), "one prediction is expected per test user"

# Write one '<user_id>:::<predicted_class>' line per test user. The context
# manager guarantees the file is closed even if a label lookup fails.
with open("test_truth_final.txt", "w", encoding="utf-8") as out_file:
  for test_user, pred in zip(test_users, preds):
    test_counts[ CLASS_LABEL_TO_INT[pred] ] += 1
    out_file.write(f"{test_user}:::{pred}\n")

print(f"Hemos predecido un total de {test_counts[0]} usuarios no irónicos y {test_counts[1]} irónicos.")

Hemos predecido un total de 85 usuarios no irónicos y 95 irónicos.


In [23]:
import os
# xml.etree.cElementTree was removed in Python 3.9; ElementTree is the
# supported module and exposes the same API under the same alias.
import xml.etree.ElementTree as ET

INT_TO_CLASS_LABEL = ["NI", "I"]

# Map every test user id to its predicted class label.
test_users_predicts = dict(zip(test_users, preds))

# Output directory for the per-user result files; created if missing.
test_result_directory = "./truth_final"
os.makedirs(test_result_directory, exist_ok=True)

# One '<user_id>.xml' file per test user holding a single <author .../> element.
for test_user, classif_class in test_users_predicts.items():
  out_path = os.path.join(test_result_directory, f"{test_user}.xml")
  # Context manager so each file is flushed and closed even on error.
  with open(out_path, "w", encoding="utf-8") as out_user:
    out_user.write(f'<author id="{test_user}"\nlang="en"\ntype="{classif_class}"\n/>')

In [24]:
# Count the entries in ./truth_final (one .xml file is written per test user).
!ls ./truth_final | wc -l

180


In [25]:
!zip -r truth_final.zip ./truth

  adding: truth/ (stored 0%)
  adding: truth/676ac6f0f8dc64239691d8052409a54c.xml (stored 0%)
  adding: truth/edaecf3e231cbefe0a7af13a6b92a21f.xml (stored 0%)
  adding: truth/ffc409cdad92bf2d15f4a9e18b250b31.xml (stored 0%)
  adding: truth/bfa500f8b9ccbe67ad0665ce4d542684.xml (stored 0%)
  adding: truth/ba5c2b0d20b249842b210ccdb6b4c3e4.xml (stored 0%)
  adding: truth/8158fecf0e49ac5f3cc14b821340a9e7.xml (stored 0%)
  adding: truth/3f016b5c991cdf30d875bdd7169cf345.xml (stored 0%)
  adding: truth/d6006abecde419c59ba33532dd533e2b.xml (stored 0%)
  adding: truth/740c19c02e8aefb4ee35febedd3777a2.xml (stored 0%)
  adding: truth/c554db6744829b2a0a04d8b774a5edcb.xml (stored 0%)
  adding: truth/a59360a9459f133a101abd69393b29eb.xml (stored 0%)
  adding: truth/e997fcccff44cb6744640e8b9d187a15.xml (stored 0%)
  adding: truth/2550b4037ef6eef0b337d9146b13fa89.xml (stored 0%)
  adding: truth/691bbc00ee1d62d71a5a6115cf356600.xml (stored 0%)
  adding: truth/65f1a4dbac4543ca70b201c9b6cb098e.xml (stored 