# Preprocess Datasets

Datasets:
- Davidson et al.
- Founta et al.
- Waseem & Hovy
- Stormfront
- ...

Preprocessing:
- Make binary classification **0: no_hate, 1: hate**
- Removal of punctuation and capitalization
- Tokenizing
- Removal of stopwords
- Stemming

In [9]:
import pandas as pd

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import *
import string
import nltk

# Directory the raw dataset CSVs are read from (relative path).
raw_data_path = '../data/raw/'
# Directory the processed CSVs are written to (relative path).
processed_data_path = '../data/processed/'

In [10]:
## 1. Removal of punctuation and capitalization
## 2. Tokenizing
## 3. Removal of stopwords
## 4. Stemming

# Extra tokens to drop on top of NLTK's English stopwords: markers that are
# common on Twitter, such as the retweet prefix "rt".
other_exclusions = ["#ff", "ff", "rt"]
stopwords = nltk.corpus.stopwords.words("english") + other_exclusions

# Single Porter stemmer instance shared by every preprocess() call.
stemmer = PorterStemmer()

def preprocess(tweet):
    """Clean, tokenize, stopword-filter and stem a Series of raw posts.

    Each text goes through the following steps:

    1. runs of whitespace are collapsed to a single space,
    2. ``@mentions`` and ``http(s)://`` links are removed,
    3. every character that is not an ASCII letter is replaced by a space
       (this drops punctuation *and* digits),
    4. the text is lower-cased and split on whitespace,
    5. tokens found in the module-level ``stopwords`` list are dropped,
    6. the remaining tokens are stemmed with the module-level ``stemmer``
       and re-joined with single spaces.

    Parameters
    ----------
    tweet : pandas.Series
        Series of strings holding the raw text of each post.

    Returns
    -------
    pandas.Series
        One cleaned, space-separated string of stemmed tokens per input row,
        carrying the same index as ``tweet``.
    """
    # Patterns are plain raw strings passed with an explicit ``regex=True``.
    # pandas >= 2.0 defaults ``Series.str.replace`` to literal matching and
    # rejects compiled patterns when ``regex`` is false, so relying on the
    # default either raises or silently leaves the text untouched.
    mention_pattern = r'@[\w\-]+'
    url_pattern = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
                   r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

    cleaned = tweet.str.replace(r'\s+', ' ', regex=True)
    cleaned = cleaned.str.replace(mention_pattern, '', regex=True)
    cleaned = cleaned.str.replace(url_pattern, '', regex=True)
    # Only letters survive this step; digits are removed here as well, which
    # is why the former "replace numbers with 'numbr'" substitution that ran
    # afterwards could never match and has been dropped.
    cleaned = cleaned.str.replace(r'[^a-zA-Z]', ' ', regex=True)

    # Build the lookup set once per call: O(1) membership per token instead
    # of scanning the stopword list for every token.
    stopword_set = set(stopwords)

    def _normalise(text):
        # ``split()`` without arguments already trims leading/trailing
        # whitespace and collapses runs, so no extra whitespace pass is needed.
        tokens = text.lower().split()
        return ' '.join(
            stemmer.stem(token) for token in tokens if token not in stopword_set
        )

    # ``apply`` keeps the caller's index, so this also works on a filtered or
    # re-indexed Series and on an empty one (the previous positional loop
    # indexed by label and left its result unbound for empty input).
    return cleaned.apply(_normalise)

----------------

# Davidson

In [3]:
# Load the raw Davidson dataset from the raw-data directory.
dataset = 'davison.csv'
df = pd.read_csv(f'{raw_data_path}{dataset}')

**Class Label:** 
- 0 - hate speech 
- 1 - offensive language 
- 2 - neither

In [4]:
# Run the shared cleaning pipeline on the raw text and keep the result
# alongside the original column.
processed_tweets = preprocess(df['tweet'])
df['processed_tweet'] = processed_tweets

In [5]:
# Turn each per-label vote column into a flag that is True when the label
# received at least half of ``count`` (presumably the number of annotators
# per tweet -- confirm against the dataset description).
df['hate_speech']        = df['hate_speech'] >= (df['count']/2)
df['offensive_language'] = df['offensive_language'] >= (df['count']/2)
df['neither']            = df['neither'] >= (df['count']/2)

# Keep only rows where exactly one of the three flags is True.
# ``.copy()`` detaches the filtered frame so the column assignment below
# writes to an independent DataFrame instead of a slice of the original
# (avoids pandas' SettingWithCopyWarning).
flag_columns = ['hate_speech', 'offensive_language', 'neither']
df = df[df[flag_columns].sum(axis=1) == 1].copy()

# Binarize in a single step: hate speech (0) and offensive language (1) -> 1,
# neither (2) -> 0.
df['class'] = df['class'].map({0: 1, 1: 1, 2: 0})

In [7]:
# Write only the raw text, its cleaned form and the binary label.
df.loc[:, ['tweet', 'processed_tweet', 'class']].to_csv(
    f'{processed_data_path}{dataset}',
    index=False,
)

-----------------

# Founta et al.

In [3]:
# Load the raw Founta dataset from the raw-data directory.
dataset = 'founta.csv'
df = pd.read_csv(f'{raw_data_path}{dataset}')

**Class Label:** 
- normal -> 0
- spam -> 0
- abusive -> 1
- hateful -> 1

In [5]:
# Align column names with the other datasets, then binarize the label:
# normal/spam -> 0, abusive/hateful -> 1.
label_to_binary = {'normal': 0, 'spam': 0, 'abusive': 1, 'hateful': 1}
df = df.rename(columns={'label': 'class', 'post': 'tweet'})
df['class'] = df['class'].map(label_to_binary)

In [7]:
# Run the shared cleaning pipeline on the raw text and keep the result
# alongside the original column.
processed_tweets = preprocess(df['tweet'])
df['processed_tweet'] = processed_tweets

In [8]:
# Write only the raw text, its cleaned form and the binary label.
df.loc[:, ['tweet', 'processed_tweet', 'class']].to_csv(
    f'{processed_data_path}{dataset}',
    index=False,
)

----------

# Waseem & Hovy

In [11]:
# Load the raw Waseem & Hovy dataset from the raw-data directory.
dataset = 'waseem.csv'
df = pd.read_csv(f'{raw_data_path}{dataset}')

**Class Label:** 
- none -> 0
- sexism -> 1
- racism -> 1

In [16]:
# Binarize the label: none -> 0, sexism/racism -> 1.
df = df.rename(columns={'label': 'class'})
# The raw labels carry a trailing newline ('none\n', ...). Strip surrounding
# whitespace before the lookup so both the newline-terminated and the clean
# spelling are mapped, instead of clean labels silently turning into NaN.
df['class'] = df['class'].str.strip().map({'none': 0, 'sexism': 1, 'racism': 1})

In [17]:
# Run the shared cleaning pipeline on the raw text and keep the result
# alongside the original column.
processed_tweets = preprocess(df['tweet'])
df['processed_tweet'] = processed_tweets

In [18]:
# Write only the raw text, its cleaned form and the binary label.
df.loc[:, ['tweet', 'processed_tweet', 'class']].to_csv(
    f'{processed_data_path}{dataset}',
    index=False,
)

----------

# Stormfront

**Class Label:** 
- hate
- noHate

In [12]:
# Load the raw Stormfront dataset from the raw-data directory.
dataset = 'stormfront.csv'
df = pd.read_csv(f'{raw_data_path}{dataset}')

In [14]:
# Align column names with the other datasets, then binarize the label:
# noHate -> 0, hate -> 1.
label_to_binary = {'noHate': 0, 'hate': 1}
df = df.rename(columns={'label': 'class', 'post': 'tweet'})
df['class'] = df['class'].map(label_to_binary)

In [16]:
# Run the shared cleaning pipeline on the raw text and keep the result
# alongside the original column.
processed_tweets = preprocess(df['tweet'])
df['processed_tweet'] = processed_tweets

In [17]:
# Write only the raw text, its cleaned form and the binary label.
df.loc[:, ['tweet', 'processed_tweet', 'class']].to_csv(
    f'{processed_data_path}{dataset}',
    index=False,
)