# PREPARING MODEL TRAINING DATA

In [None]:
!pip install wordsegment
!pip install emoji
# Initialise relevant packages
import pandas as pd
import pickle


# Text cleaning
from html import unescape
import re
import string
import wordsegment as ws
import emoji
ws.load() # load vocab for word segmentation

Collecting wordsegment
  Downloading wordsegment-1.3.1-py2.py3-none-any.whl.metadata (7.7 kB)
Downloading wordsegment-1.3.1-py2.py3-none-any.whl (4.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: wordsegment
Successfully installed wordsegment-1.3.1
Collecting emoji
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.0-py3-none-any.whl (586 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m586.9/586.9 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.0


## Load Raw Datasets

In [None]:
# load raw data: one DataFrame per source dataset, keyed by dataset name
training_data = {}

# Davidson et al. (2017): read with pandas defaults (comma-separated, header row);
# the first column is used as the row index
training_data['davidson2017'] = pd.read_csv('davidson2017.csv', index_col=0)

# Founta et al. (2018): tab-separated, column names supplied explicitly.
# This previously re-read 'davidson2017.csv', which left every Founta label empty.
# NOTE(review): the file name below is assumed to follow the same naming scheme as
# the Davidson file -- adjust it if the Founta data is stored under a different name.
training_data['founta2018'] = pd.read_csv('founta2018.csv', names=['text', 'label', 'count_label_votes'], delimiter='\t')

## Tidy Up Data Format

In [None]:
# dataset-specific formatting

# Davidson 2017: map the source column names onto the shared text/label schema
# (errors='ignore' keeps this a no-op if the columns were already renamed)
davidson_columns = {"class": "label", "tweet": "text"}
training_data['davidson2017'] = training_data['davidson2017'].rename(
    columns=davidson_columns,
    errors='ignore',
)

# Founta 2018: nothing to do, the columns were named text/label at load time

In [None]:
for dataset in list(training_data):

    # turn the current index into a regular column and call it 'id'
    with_id = (
        training_data[dataset]
        .reset_index()
        .rename(columns={'index': 'id'}, errors='ignore')
    )

    # keep only the columns shared by all datasets
    trimmed = with_id[['id','text','label']]

    # let pandas pick the best-fitting (nullable) dtype for each column
    training_data[dataset] = trimmed.convert_dtypes()

## Perform Basic Text Cleaning

In [None]:
# Helper used as the replacement callback of re.sub when splitting hashtags
def regex_match_segmentation(match):
    """Return the matched text split into words by wordsegment, joined with single spaces.

    `match` is the regex match object handed over by re.sub; the whole match
    (group 0) is segmented.
    """
    matched_text = match.group(0)
    words = ws.segment(matched_text)
    return ' '.join(words)

In [None]:
# Define function for cleaning text
def clean_text(text):
    """Apply the basic cleaning steps to a single text and return the cleaned string.

    Steps, in order: decode HTML entities, lowercase, replace @-mentions with
    [USER], URLs with [URL] and emoji characters with [EMOJI], split hashtags
    into words, drop leading exclamation marks, and turn newlines/tabs into spaces.
    """

    # convert HTML codes
    text = unescape(text)

    # lowercase text
    text = text.lower()

    # replace mentions, URLs and emojis with special token
    text = re.sub(r"@[A-Za-z0-9_-]+",'[USER]',text)
    text = re.sub(r"http\S+",'[URL]',text)
    # emoji.UNICODE_EMOJI no longer exists in emoji >= 2.0; EMOJI_DATA is the
    # dictionary keyed by emoji that replaces it (dict membership is a cheap lookup)
    text = ''.join(' [EMOJI] ' if (char in emoji.EMOJI_DATA) else char for char in text).strip()

    # find and split hashtags into words
    text = re.sub(r"#[A-Za-z0-9]+", regex_match_segmentation, text)

    # remove punctuation at beginning of string (quirk in Davidson data)
    text = text.lstrip("!")

    # remove newline and tab characters
    text = text.replace('\n',' ')
    text = text.replace('\t',' ')

    return text

In [None]:
# apply text cleaner to text columns for each dataset
import emoji

def clean_text(text):
    """Run the complete text-cleaning pipeline on a single text and return the result.

    This definition is the one in effect when the datasets are cleaned below, so
    it carries every cleaning step rather than only the emoji replacement:
    decode HTML entities, lowercase, replace @-mentions with [USER], URLs with
    [URL] and emoji characters with [EMOJI], split hashtags into words, drop
    leading exclamation marks, and turn newlines/tabs into spaces.
    """
    # convert HTML codes and lowercase
    text = unescape(text)
    text = text.lower()

    # replace mentions, URLs and emojis with special token
    text = re.sub(r"@[A-Za-z0-9_-]+",'[USER]',text)
    text = re.sub(r"http\S+",'[URL]',text)
    # membership is tested directly against the EMOJI_DATA dict, so no per-call
    # set of all emoji has to be built
    text = ''.join(' [EMOJI] ' if char in emoji.EMOJI_DATA else char for char in text).strip()

    # find and split hashtags into words
    text = re.sub(r"#[A-Za-z0-9]+", regex_match_segmentation, text)

    # remove punctuation at beginning of string (quirk in Davidson data)
    text = text.lstrip("!")

    # remove newline and tab characters
    text = text.replace('\n',' ')
    text = text.replace('\t',' ')

    return text

for dataset in training_data:
    training_data[dataset]['text'] = training_data[dataset]['text'].apply(clean_text)

## Export Multiclass Data

In [None]:
# give multiclass labels string names for clarity
# Davidson et al. (2017) --> 0 is "hate speech", 1 is "offensive language", 2 is "neither"
davidson_label_names = {
    '0': "hateful",
    '1': "offensive",
    '2': "neither"
}
# Labels are cast to str so they match the string keys above. The result is assigned
# back to the column: calling replace(inplace=True) on a column selection raises a
# FutureWarning and will no longer modify the DataFrame in pandas 3.0.
training_data['davidson2017']['label'] = (
    training_data['davidson2017']['label'].astype(str).replace(davidson_label_names)
)

# print class frequencies for each dataset
for dataset in training_data:
    print(dataset)
    print(training_data[dataset].groupby('label').id.count(), '\n')

# save dictionary of cleaned datasets to pickle
# (the with-block closes the file, so the pickle is fully flushed to disk)
with open('/content/training_data_multiclass.pkl','wb') as pickle_file:
    pickle.dump(training_data, pickle_file)

davidson2017
label
hateful       1430
neither       4163
offensive    19190
Name: id, dtype: Int64 

founta2018
Series([], Name: id, dtype: Int64) 



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  training_data['davidson2017']['label'].replace({


## Convert to Binary Classification Task

In [None]:
# GOAL: hateful (1) and non-hateful (0)
binary_label_maps = {
    # Davidson et al. (2017) --> "hateful", "offensive", "neither"
    'davidson2017': {'hateful': 1, 'offensive': 0, 'neither': 0},
    # Founta et al. (2018) --> "hateful", "abusive", "normal", "spam"
    'founta2018': {'hateful': 1, "abusive": 0, "normal": 0, "spam": 0},
}

for dataset, label_map in binary_label_maps.items():
    # Assign the result back to the column: replace(inplace=True) on a column
    # selection raises a FutureWarning and will stop working in pandas 3.0.
    # The cast to object lets the column hold the integer codes whatever dtype
    # convert_dtypes gave it earlier (e.g. a string extension dtype).
    training_data[dataset]['label'] = (
        training_data[dataset]['label'].astype(object).replace(label_map)
    )

  training_data['davidson2017'].label.replace({'hateful': 1, 'offensive': 0, 'neither': 0}, inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  training_data['founta2018'].label.replace({'hateful': 1, "abusive": 0, "normal": 0, "spam": 0}, inplace = True)


## Export Binary Data

In [None]:
# print class frequencies for each dataset
for dataset in training_data:
    print(dataset)
    print(training_data[dataset].groupby('label').id.count(), '\n')

# save dictionary of cleaned datasets to pickle
# (the with-block closes the file, so the pickle is fully flushed to disk)
with open('/content/training_data_binary.pkl','wb') as pickle_file:
    pickle.dump(training_data, pickle_file)

davidson2017
label
0    23353
1     1430
Name: id, dtype: Int64 

founta2018
Series([], Name: id, dtype: Int64) 

