# Gold standard dataset creation

@Author: Siôn William Davies


Creation of the Gold dataset.

In [1]:
import pandas as pd
import numpy as np
import re
import string 
import nltk
import inflect
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

In [2]:
# Upload the csv file with the data.

In [3]:
df = pd.read_csv('/Users/siondavies/Desktop/NLP/Datasets/Original_Datasets/Gold_1.csv')

In [4]:
df.head()

Unnamed: 0,Index,Source Dataset,Message_Post,Label,Fascist_Speech,Category,Forum,String_Length,Language_ID,Character_Length
0,1,Iron_March,If you mean the dark+orange skin then that's a...,Fascist,Yes,Supremacism / Racism,Iron_March,140,en,114
1,2,Iron_March,This journal (about 25 pages) provides a good ...,Fascist,Yes,Totalitarianism / Authoritarianism,Iron_March,337,en,275
2,3,Iron_March,AS for the book you sent a link to: it is inte...,Fascist,Yes,Fascist,Iron_March,241,en,198
3,4,Iron_March,I don't know if you've seen many of my posts o...,Fascist,Yes,Supremacism / Racism,Iron_March,246,en,201
4,5,Iron_March,In defining superiority I rank it like this:\n...,Fascist,Yes,Supremacism / Racism,Iron_March,636,en,580


In [5]:
df.shape

(1164, 10)

In [6]:
# We will create another column 'Numeric_Label' which will indicate:
# 0: Non-fascist sample, 1: fascist sample

In [7]:
def converter(Fascist_Speech):
    if Fascist_Speech == 'Yes':
        return 1
    else:
        return 0

In [8]:
df['Numeric_Label'] = df['Fascist_Speech'].apply(converter)

In [9]:
# Now we create a new Gold dataset consisting only of the Message Posts and the Labels

In [10]:
gold_df = df[['Message_Post', 'Numeric_Label', 'Label']].copy()

In [11]:
gold_df.head()

Unnamed: 0,Message_Post,Numeric_Label,Label
0,If you mean the dark+orange skin then that's a...,1,Fascist
1,This journal (about 25 pages) provides a good ...,1,Fascist
2,AS for the book you sent a link to: it is inte...,1,Fascist
3,I don't know if you've seen many of my posts o...,1,Fascist
4,In defining superiority I rank it like this:\n...,1,Fascist


""

Below are the methods we will apply for preprocessing techniques to the textual data (Message_posts).
This will clean the data and normalise the text.

""

In [12]:
# Function to remove emoticons from a String. 

def remove_emoticons(data):
    emoticons = regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  
        u"\U0001F300-\U0001F5FF"  
        u"\U0001F680-\U0001F6FF"  
        u"\U0001F1E0-\U0001F1FF" "]+", flags = re.UNICODE)
    return emoticons.sub(r'', data)


# Function to replace numerical numbers with their text counterparts.

def convert_numbers(data):
    inf = inflect.engine()
    for word in data:
        if word.isdigit():
            data = re.sub(word, inf.number_to_words(word), data)
        else:
            continue
    return data

# A function to remove stopwords from tokenized words.

def remove_stopwords(data):
    return[word for word in data if not word in stopwords.words('english')]


# This function can be used if we only want to stem the text.
# Must be applied as -> gold_df= stem(gold_df)

def stem(data):
    stemmer = nltk.stem.PorterStemmer()
    data['Message_Post'] = data['Message_Post'].apply(lambda x: [stemmer.stem(word) for word in x.split()])
    return data


# Function 1 to clean data in pre-processing steps.
# Converts String to lower case.
# Deletes text between < and > 
# Removes punctuation from text ... 
# ...(Remember this line should not be applied to the shuffled dataset.) 
# Removes URLs

def clean_data_1(data):
    data = data.lower()
    data = re.sub('<.*?>', '', data)
    data = re.sub('[%s]' % re.escape(string.punctuation), '', data)
    data = re.sub(r'http\S+', '', data)
    return data

# Function 2 to clean data in pre-processing steps.
# Removes non-sensical data.
# Removes emoticons
# clears up white space.

def clean_data_2(data):
    data = re.sub('-', ' ', data) 
    data = re.sub('\n', '', data)
    data = remove_emoticons(data)
    data = convert_numbers(data)

    data = re.sub(' +', ' ', data)
    return data   

Now to apply the preprocessing methods on gold_df

In [13]:
gold_df.isnull().sum()

Message_Post     0
Numeric_Label    0
Label            0
dtype: int64

In [14]:
# Clean the data...

gold_df['Message_Post'] = pd.DataFrame(gold_df.Message_Post.apply(clean_data_1).apply(clean_data_2))

In [15]:
# Apply Stemming on the text....

# gold_df= stem(gold_df)

In [16]:
gold_df.head()

Unnamed: 0,Message_Post,Numeric_Label,Label
0,if you mean the darkorange skin then thats a s...,1,Fascist
1,this journal about twofive pages provides a go...,1,Fascist
2,as for the book you sent a link to it is inter...,1,Fascist
3,i dont know if youve seen many of my posts on ...,1,Fascist
4,in defining superiority i rank it like thisphy...,1,Fascist


In [48]:
# SAVE the full dataset...

In [29]:
gold_df.to_csv(r'/Users/siondavies/Desktop/Temp_Datasets/Gold_cleaned_1.csv')

""

Now we split the data into training and test sets.

""

In [17]:
# X = what we want to predict / y = the target class

X = gold_df.Message_Post
y = gold_df.Numeric_Label

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, shuffle = True)

In [19]:
gold_train = pd.DataFrame({'Message_Post':X_train, 'Numeric_Label':y_train})

In [20]:
gold_train.head()

Unnamed: 0,Message_Post,Numeric_Label
954,agreed i hate it when enchanters are meta beca...,0
1017,brazil team was deseperately trying to copy ko...,0
810,the location of that country is massively impo...,0
407,i dont doubt the fact that there can be female...,1
719,man the left looks like a toddler in crappy pa...,0


In [21]:
gold_train.to_csv(r'/Users/siondavies/Desktop/Temp_Datasets/Gold_train_1.csv')

In [22]:
gold_test = pd.DataFrame({'Message_Post':X_test, 'Numeric_Label':y_test})

In [23]:
gold_test.head()

Unnamed: 0,Message_Post,Numeric_Label
1125,i would be on the lookout for a better girlfri...,0
1021,i’m going to come hump you,0
111,i am against such thing like the right to vote...,1
1049,the rule states any player or goalkeeper who b...,0
1146,its unfortunate that places like the ussr and ...,0


In [24]:
gold_test.to_csv(r'/Users/siondavies/Desktop/Temp_Datasets/Gold_test_1.csv')

In [25]:
gold_train.shape

(814, 2)

In [26]:
gold_test.shape

(350, 2)