In [1]:
import numpy as np
import os
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split

In [2]:
# Read the ground-truth labels into a {user_id: label} mapping.
# Each non-empty line of truth.txt has the form "<user_id>:::<label>".
data_filepath = 'data/raw'
labels = dict()
# Explicit UTF-8 so the read does not depend on the platform's default
# encoding (the tweet files in this notebook are read as UTF-8 as well).
with open(os.path.join(data_filepath, 'Y', 'truth.txt'), encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # A blank or trailing empty line would otherwise break the unpack below.
            continue
        user_id, label = line.split(':::')
        labels[user_id] = label.strip()

In [3]:
# Sanity check: number of users for which a label was read from truth.txt.
len(labels)

420

In [4]:
# Parse every user's XML file into a list of document texts, keyed by user id.
tweets = dict()
xml_dir = os.path.join(data_filepath, 'X')
# Only the files directly inside xml_dir are needed, so scan that one directory
# instead of walking the whole tree; a missing directory now raises
# FileNotFoundError rather than an opaque IndexError.
# Directory listing order is arbitrary, so sort the names: the seeded
# train/dev/test split further down is only reproducible if the sample
# order is the same on every machine.
with os.scandir(xml_dir) as entries:
    xml_filenames = sorted(entry.name for entry in entries if not entry.is_dir())
for filename in xml_filenames:  # the part of the filename before the first '.' is the user id
    user_id = filename.split('.')[0]
    with open(os.path.join(xml_dir, filename), encoding='utf-8') as f:
        bs_data = BeautifulSoup(f.read(), 'xml')
    # One list entry per <document> element found in the file.
    tweets[user_id] = [x.getText() for x in bs_data.find_all('document')]

In [5]:
# Collapse each user's list of documents into one single-line string:
# documents are joined with ' . ' and newlines/tabs become spaces, because the
# data is later written as one tab-separated record per line.
for user_id, documents in tweets.items():
    if isinstance(documents, str):
        # Already flattened by a previous run of this cell. Joining a str would
        # interleave ' . ' between its characters, so skip it to keep the cell
        # safe to re-run.
        continue
    flattened = ' . '.join(documents)
    tweets[user_id] = flattened.replace('\n', ' ').replace('\t', ' ')

In [6]:
# Sanity check: number of users with parsed tweet data (one entry per user id).
len(tweets)

420

In [7]:
# Map a positional index (0, 1, 2, ...) to each user id, following the
# iteration order of the `tweets` dict.
user_indices = dict(enumerate(tweets))

In [8]:
# Build the feature list X (one text per user) and the target list y
# (1 when the user's label is 'I', otherwise 0).
# Both lists are derived from the same ordered id list so they stay aligned
# position by position; previously X was sized by `tweets` and y by `labels`,
# which could silently disagree.
ordered_user_ids = [user_indices[i] for i in range(len(user_indices))]
unlabelled = [uid for uid in ordered_user_ids if uid not in labels]
if unlabelled:
    raise KeyError('no label found for user ids: ' + ', '.join(unlabelled))
X = [tweets[uid] for uid in ordered_user_ids]
y = [1 if labels[uid] == 'I' else 0 for uid in ordered_user_ids]

In [9]:
# Convert to NumPy arrays: texts are stored with object dtype, targets as ints.
X, y = np.asarray(X, dtype=object), np.asarray(y, dtype=int)

In [10]:
# Shape of the feature array (one entry per user).
X.shape

(420,)

In [11]:
# Shape of the target array.
y.shape

(420,)

In [12]:
# Peek at the first user's text: that user's documents joined by ' . '.
X[0]

'Midland Schools Not Putting Litter Boxes In Bathrooms For Furries, District Says |\xa0#HASHTAG# #URL# . MBB Preview: Arkansas hosts Texas A&amp;M |\xa0#HASHTAG# #URL# . ‘Spread Cream Cheese, Not Hate’: University of Florida Students Give Away Free Bagels to Fight Antisemitism | #HASHTAG# |\xa0#HASHTAG# #URL# . From summer student to chief administrative officer, Deborah Martin-Downs completes the circle | #HASHTAG# |\xa0#HASHTAG# #URL# . Bucks hold off Kings, 133-127 | Basketball |\xa0#HASHTAG# #URL# . #HASHTAG# | How will the COVID-19 pandemic end? WHO’s answer might surprise\xa0you #URL# . Two Ulster County child molesters sentenced to prison |\xa0#HASHTAG# #URL# . #HASHTAG# | How QAnon sex trafficking conspiracy theories forced a butterfly sanctuary to shut\xa0down #URL# . More than 1,000 students were sexually abused at this university. An ex-NFL player wants their stories to be heard | Crime | #HASHTAG#. |\xa0#HASHTAG# #URL# . Supporters of Laurentian green space continue to figh

In [13]:
# First split: 70% for training, 30% held out.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.3, random_state=42
)
# Second split: divide the held-out 30% evenly into dev and test sets.
X_dev, X_test, y_dev, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42
)

In [14]:
# Number of samples in the training split.
len(X_train)

294

In [15]:
# Number of samples in the test split.
len(X_test)

63

In [16]:
# Number of samples in the dev split.
len(X_dev)

63

In [17]:
def save_data(X, y, filename):
    """Write samples to `filename` as one "<label>\\t<text>" record per line.

    Args:
        X: indexable sequence of text strings, one per sample.
        y: indexable sequence of labels; y[i] belongs to X[i].
        filename: path of the output file; it is overwritten if it exists.
            Missing parent directories are created.

    The file is always written as UTF-8 so that non-ASCII characters in the
    text are stored identically regardless of the platform's default encoding.
    """
    import os  # local import keeps the function self-contained

    parent_dir = os.path.dirname(filename)
    if parent_dir:
        # Without this, open() fails when the output folder does not exist yet.
        os.makedirs(parent_dir, exist_ok=True)
    with open(filename, 'w', encoding='utf-8') as f:
        for i, text in enumerate(X):
            f.write('{}\t{}\n'.format(y[i], text))

In [18]:
# Persist each split to its own file via save_data.
split_outputs = [
    (X_train, y_train, 'data/profile_BERT_FUL/train.csv'),
    (X_dev, y_dev, 'data/profile_BERT_FUL/dev.csv'),
    (X_test, y_test, 'data/profile_BERT_FUL/test.csv'),
]
for split_X, split_y, out_path in split_outputs:
    save_data(split_X, split_y, out_path)