In [None]:
import xml.etree.ElementTree as ET
import pandas as pd
import glob
import os
import re
import numpy as np
from pathlib import Path

In [None]:
# Build the per-tweet training table from the PAN corpus on disk.
#
# Inputs (relative to the current working directory):
#   datasets/pan/en.txt          one "<author_id>:::<gender>" record per line
#   datasets/pan/text_xml/*.xml  one XML file per author; every <document>
#                                element holds the text of one tweet
#
# Outputs (notebook-level names used by later cells):
#   author_labels       author id -> 1 (female) / 0 (male)
#   tweets_all          author id -> list of raw tweet texts
#   tweets_all_cleaned  author id -> list of cleaned tweet texts
#   tweets_all_df       DataFrame with columns id / text / gender, one row per
#                       tweet; stays None when no XML file is found
train_label_file = Path.cwd() / 'datasets/pan/en.txt'
train_xml_files_dir = Path.cwd() / 'datasets/pan/text_xml'
tweets_all = dict()
tweets_all_cleaned = dict()
tweets_all_df = None

gender_map = {'female':1, 'male':0}
author_labels = dict()
# `with` guarantees the label file is closed even if a line fails to parse.
with open(train_label_file, 'r') as label_file:
    for line in label_file:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        res = line.split(':::')
        author_labels[res[0]] = gender_map[res[1]]

# Removes @mentions, and URLs together with the rest of their line.
mention_or_url = re.compile(r'@\S+|https?:\/\/.*[\r\n]*', flags=re.MULTILINE)

# Collect one frame per author and concatenate once at the end; concatenating
# inside the loop re-copies every previously loaded row on each iteration.
author_frames = []
for xml_file in train_xml_files_dir.glob('*.xml'):
    root = ET.parse(xml_file).getroot()
    tweets_single = []
    tweets_single_cleaned = []
    for cdata in root.iter('document'):
        tweets_single.append(cdata.text)
        # An empty <document/> has text None; treat it as an empty tweet.
        text = cdata.text or ''
        # Drop every non-ASCII character (emoji, accented letters, ...).
        text = text.encode('ascii', 'ignore').decode('ascii')
        tweets_single_cleaned.append(mention_or_url.sub('', text))

    # Path.stem removes exactly the final ".xml" suffix. str.rstrip('.xml')
    # strips any trailing run of the characters '.', 'x', 'm', 'l' and would
    # corrupt ids that happen to end in one of those characters.
    author_id = xml_file.stem
    tweets_all[author_id] = tweets_single
    tweets_all_cleaned[author_id] = tweets_single_cleaned
    number_of_tweets = len(tweets_single_cleaned)
    author_frames.append(pd.DataFrame({
        'id': np.repeat(author_id, number_of_tweets),
        'text': tweets_single_cleaned,
        # Raises KeyError if the author has no entry in the label file.
        'gender': np.repeat(author_labels[author_id], number_of_tweets),
    }))

if author_frames:
    tweets_all_df = pd.concat(author_frames, ignore_index=True)
    print(tweets_all_df.shape)

In [None]:
# Clean both frames in place, then dump them as header-less, index-less CSVs.
# Two patterns are blanked out in every string cell:
#   1. escaped sequences spelled out as text (backslash followed by t, n or r)
#   2. the actual tab / newline / carriage-return control characters
for frame in (train_data, test_data):
    frame.replace(
        to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"],
        value=["", ""],
        regex=True,
        inplace=True,
    )

for frame, csv_name in ((train_data, 'pan_train.csv'), (test_data, 'pan_test.csv')):
    frame.to_csv(csv_name, index=False, header=False)

In [None]:
from pathlib import Path
# Read the three stored splits from datasets/pan. Column names are supplied
# via `names=` with no `header=` argument, so pandas treats the first row of
# each file as data rather than as a header.
dataset_dir = Path.cwd().joinpath('datasets/pan')
gender_text_columns = ['Gender', 'Text']
train_df = pd.read_csv(
    dataset_dir.joinpath('training_gender_text.csv'),
    names=gender_text_columns,
)
validation_df = pd.read_csv(
    dataset_dir.joinpath('validation_gender_text.csv'),
    names=gender_text_columns,
)
# The test file carries an extra leading user-id column and a different order.
test_df = pd.read_csv(
    dataset_dir.joinpath('test_name_text_gender.csv'),
    names=['UserId', 'Text', 'Gender'],
)

In [None]:
# Pool the training and validation rows into one frame with a fresh 0..n-1
# index. Both inputs already share the columns Gender / Text, so nothing else
# is needed: pd.concat's `names=` argument labels the levels of a hierarchical
# index built from `keys=` (it does not name columns) and is ignored when no
# keys are given, so it is not passed here.
concat_df = pd.concat([train_df, validation_df], ignore_index=True)
concat_df.head()

In [None]:
# Display the (rows, columns) tuple of the pooled frame as the cell output.
concat_df.shape

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so that re-running the notebook yields the same split (and
# therefore the same CSV files and the same downstream numbers).
SPLIT_SEED = 42

# 80/20 train/validation. Rows are shuffled before splitting.
# Note: this rebinds train_df / validation_df, replacing the frames that were
# pooled into concat_df.
train_df, validation_df = train_test_split(
    concat_df, test_size=0.2, random_state=SPLIT_SEED
)

In [None]:
from pathlib import Path
# Re-point dataset_dir at the output directory for the re-split data; it is
# resolved against the directory the notebook kernel is running in.
dataset_dir = Path.cwd().joinpath('datasets/pan_new')

In [None]:
# Write the new training / validation splits as header-less, index-less CSVs.
# to_csv does not create missing parent directories, so make sure the output
# directory exists first; exist_ok keeps the cell safe to re-run.
dataset_dir.mkdir(parents=True, exist_ok=True)
train_df.to_csv(dataset_dir / 'training_gender_text.csv', index=False, header=False)
validation_df.to_csv(dataset_dir / 'validation_gender_text.csv', index=False, header=False)

In [None]:
# Row count of the training split, echoed to stdout.
train_len = len(train_df)
print(train_len)

In [None]:
# Row count of the validation split.
validation_len = len(validation_df)

In [None]:
# Show the validation row count as the cell output.
validation_len

In [None]:
# Row count of the held-out test split.
test_len = len(test_df)

In [None]:
# Combined number of rows across the three splits; used to turn the
# per-split counts into fractions.
total_size = sum((train_len, validation_len, test_len))

In [None]:
import matplotlib.pyplot as plt

# Pie chart of how the rows are distributed over the three splits.
labels = ('Training', 'Validation', 'Test')
# Each split's share of the total row count.
sizes = [split_len / total_size for split_len in (train_len, validation_len, test_len)]
# All offsets are zero, so no slice is pulled out of the pie.
explode = (0,) * len(labels)

fig1, ax1 = plt.subplots()
ax1.pie(
    sizes,
    explode=explode,
    labels=labels,
    autopct='%1.1f%%',  # print each share with one decimal place
    shadow=True,
    startangle=90,
)
# Equal aspect ratio so the pie is drawn as a circle rather than an ellipse.
ax1.axis('equal')

plt.show()