In [None]:
# !python -m spacy download en_core_web_sm

## Imports

In [2]:
import pandas as pd
import numpy as np
import os
import random
import string
import re

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

import spacy

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Merging Data

This is done to apply preprocessing to all sequences before dividing them into a train, val and test splits.

In [3]:
PATH = "path"

In [4]:
# train = pd.read_csv(os.path.join(PATH,"train.csv"))
# dev = pd.read_csv(os.path.join(PATH,"dev.csv"))

In [5]:
# data = pd.concat([train,dev])
# data.to_csv(os.path.join(PATH,'merged.csv'), index=False)
data = pd.read_csv(os.path.join(PATH, "merged.csv"))

In [6]:
data[:100]

Unnamed: 0,sentence_id,sentence,label,solved_conflict
0,b9e1635a-72aa-467f-86d6-f56ef09f62c3,Gone are the days when they led the world in r...,SUBJ,True
1,f99b5143-70d2-494a-a2f5-c68f10d09d0a,The trend is expected to reverse as soon as ne...,OBJ,False
2,4076639c-aa56-4202-ae0f-9d9217f8da68,But there is the specious point again.,OBJ,False
3,b057c366-698e-419d-a284-9b16d835c64e,He added he wouldn’t be surprised to see a new...,OBJ,False
4,a5a9645e-7850-41ba-90a2-5def725cd5b8,"Not less government, you see; the same amount ...",SUBJ,False
...,...,...,...,...
95,921a1f46-1494-4004-95ed-d26b907d3aae,It has been a bumpy road even to this juncture.,SUBJ,False
96,c2ca3827-247d-4302-971d-8054d32c73aa,"Socialists believe that, if everyone cannot ha...",OBJ,True
97,58d703aa-131b-40b4-8373-0c34b86bb1c7,Vučić said he regretted it but cited a deepeni...,OBJ,False
98,cb0dd788-2074-4469-a8f9-24d163db41c6,“There have been more rightwing extremists spr...,OBJ,False


## Preprocessing

The text contains the following that will be changed/removed:


*   Capitalization of words, particularly proper nouns.
*   Punctuation specifically in words like [that's], [I'm], etc.
*   There are several " ’ " instead of " ' " that need to be changed before removing punctuation
* Some number are large enough that they are separated by commas. Example: "2,000,000". When removing commas any commas removed from numbers need to be done such that the meaning of the number remains the same. Example: "2,000,000" becomes "2000000" and not "2 000 000"

### Change text to lowercase

In [7]:
def to_lower(phrase):
  return phrase.lower()

In [8]:
data["sentence"] = data["sentence"].apply(to_lower)

### Remove punctuation

In [9]:
#Remove all punctuation except words that have [']
#Punctuation not removed $, %, -, .
def remove_punc(text):
    # Replace Unicode character ’ with '
    text = re.sub('’', "'", text)

    # Remove commas from numbers without introducing spaces
    text = re.sub(r'(\d),(\d)', r'\1\2', text)

    # Keep $, %, -, . and words with 's; replace other punctuation with spaces
    pattern = r"[^\w\s$%.'\-]"
    return re.sub(pattern, ' ', text)

In [10]:
data["sentence"] = data["sentence"].apply(remove_punc)

### Normalize

In [11]:
def replacements(text):
    # Substitutes 've with have
    text = re.sub(r"\b(\w+)'ve\b", r"\1 have", text)

    # Substitutes n't with not
    text = re.sub(r"\b(\w+)n't\b", r"\1 not", text)

    # Substitutes 'd with would
    text = re.sub(r"\b(\w+)'d\b", r"\1 would", text)

    # Substitutes 'll with will
    text = re.sub(r"\b(\w+)'ll\b", r"\1 will", text)

    # Substitutes 're with are
    text = re.sub(r"\b(\w+)'re\b", r"\1 are", text)

    # Substitutes 'm with am
    text = re.sub(r"\b(\w+)'m\b", r"\1 am", text)

    return text

In [12]:
data["sentence"] = data["sentence"].apply(replacements)

### Separate $, %, - and . from words/numbers

In [13]:
def add_spaces(text):
    # Add spaces before and after $, %, -, .
    pattern = r'(\$|%|-|\.)'
    return re.sub(pattern, r' \1 ', text)

In [14]:
data["sentence"] = data["sentence"].apply(add_spaces)

### Apply stemming

In [15]:
def remove_apostrophe_s(text):
    # Replace 's at the end of words
    return re.sub(r"\b(\w+)'s\b", r"\1", text)

In [16]:
data["sentence"] = data["sentence"].apply(remove_apostrophe_s)

In [17]:
for index, row in data.iterrows():
  print(row)

sentence_id                     b9e1635a-72aa-467f-86d6-f56ef09f62c3
sentence           gone are the days when they led the world in r...
label                                                           SUBJ
solved_conflict                                                 True
Name: 0, dtype: object
sentence_id                     f99b5143-70d2-494a-a2f5-c68f10d09d0a
sentence           the trend is expected to reverse as soon as ne...
label                                                            OBJ
solved_conflict                                                False
Name: 1, dtype: object
sentence_id            4076639c-aa56-4202-ae0f-9d9217f8da68
sentence           but there is the specious point again . 
label                                                   OBJ
solved_conflict                                       False
Name: 2, dtype: object
sentence_id                     b057c366-698e-419d-a284-9b16d835c64e
sentence           he added he would not be surprised to see a ne...
l

### Apply lemmatization

In [18]:
def lemmatize_sentence_spacy(sentence):
    # Load the English language model in spaCy
    nlp = spacy.load('en_core_web_sm')

    # Create a Doc object for the input sentence
    doc = nlp(sentence)

    # Lemmatize each token and join them back into a sentence
    lemmatized_sentence = ' '.join([token.lemma_ for token in doc])

    return lemmatized_sentence

In [19]:
data["sentence"] = data["sentence"].apply(lemmatize_sentence_spacy)

### Remove stopwords

In [20]:
def remove_stopwords_from_sentence(sentence):
    tokens = nltk.word_tokenize(sentence)

    stop_words = set(stopwords.words('english'))

    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

    sentence_without_stopwords = ' '.join(filtered_tokens)

    return sentence_without_stopwords

In [21]:
data["sentence"] = data["sentence"].apply(remove_stopwords_from_sentence)

In [22]:
#Final output after preprocessing
data[:5]

Unnamed: 0,sentence_id,sentence,label,solved_conflict
0,b9e1635a-72aa-467f-86d6-f56ef09f62c3,go day lead world recession - bust,SUBJ,True
1,f99b5143-70d2-494a-a2f5-c68f10d09d0a,trend expect reverse soon next month .,OBJ,False
2,4076639c-aa56-4202-ae0f-9d9217f8da68,specious point .,OBJ,False
3,b057c366-698e-419d-a284-9b16d835c64e,add would surprised see new variant altogether...,OBJ,False
4,a5a9645e-7850-41ba-90a2-5def725cd5b8,less government see amount government less mon...,SUBJ,False


### Create binary labels and remove "solved_conflict" column

Here, all labels that are "SUBJ" in column "label" are changed to 1 and "OBJ" are changed to 0. Additionally, the "solved_conflict" column is additional information that is not required. Thus, it is removed


In [23]:
data['label'] = data['label'].replace({'SUBJ': 1, 'OBJ': 0})
data = data.drop('solved_conflict', axis=1)

In [24]:
data[:5]

Unnamed: 0,sentence_id,sentence,label
0,b9e1635a-72aa-467f-86d6-f56ef09f62c3,go day lead world recession - bust,1
1,f99b5143-70d2-494a-a2f5-c68f10d09d0a,trend expect reverse soon next month .,0
2,4076639c-aa56-4202-ae0f-9d9217f8da68,specious point .,0
3,b057c366-698e-419d-a284-9b16d835c64e,add would surprised see new variant altogether...,0
4,a5a9645e-7850-41ba-90a2-5def725cd5b8,less government see amount government less mon...,1


## Creating a data split of train, val and test


In [25]:
num_rows = len(data)
random.seed(42)
#FIX: Issue with appending obj and subj

#Isolating all rows containing OBJ and SUBJ to ensure an equal split of both in train, test and val
obj = data.loc[data['label'] == 0]
subj = data.loc[data['label'] == 1]

train_size_obj = int(0.8*len(obj))
train_size_subj = int(0.8*len(subj))
val_size_obj = int(0.1*len(obj))
val_size_subj = int(0.1*len(subj))

train_obj = obj.iloc[:train_size_obj, :]
val_obj = obj.iloc[train_size_obj: train_size_obj + val_size_obj, :]
test_obj = obj.iloc[train_size_obj + val_size_obj:, :]
train_subj = subj.iloc[:train_size_subj, :]
val_subj = subj.iloc[train_size_subj: train_size_subj + val_size_subj, :]
test_subj = subj.iloc[train_size_subj + val_size_subj:, :]

train = pd.concat([train_obj, train_subj])
val = pd.concat([val_obj, val_subj])
test = pd.concat([test_obj, test_subj])

train = train.sample(frac=1, random_state=42)
val = val.sample(frac=1, random_state=42)
test = test.sample(frac=1, random_state=42)

# Save each split into separate CSV files
train.to_csv(os.path.join(PATH, 'train.csv'), index=False)
val.to_csv(os.path.join(PATH, 'val.csv'), index=False)
test.to_csv(os.path.join(PATH, 'test.csv'), index=False)

In [26]:
print(f"Number of lines in the training set: {len(train)}")
print(f"Number of lines in the validation set: {len(val)}")
print(f"Number of lines in the testing set: {len(test)}")

Number of lines in the training set: 750
Number of lines in the validation set: 93
Number of lines in the testing set: 96


In [27]:
train.head(5)

Unnamed: 0,sentence_id,sentence,label
120,9ebd51e3-8f5a-4174-938f-1efdf3bfe85d,reason lack actual levelling ' quite clear peo...,1
572,99d0ce3f-dcbe-4ac7-bb1d-8c257c01d675,reason protest lightly use public credit save ...,0
207,c46b44ec-86be-488b-87b4-4bb41e9136db,new england journal medicine study prove effec...,0
393,bba8e3f8-129a-409c-a00b-21e9a6f7de4b,biden administration predict nearly 100 millio...,0
475,05d269a7-9c64-4d77-9aff-dc77bdbaad4a,movement movement within government independen...,0


In [28]:
val.head(5)

Unnamed: 0,sentence_id,sentence,label
784,2fa6e5cc-155a-40c5-b973-9a2b878bd01e,argue reason corporate law codifie shareholder...,0
756,843e214f-9632-4357-a9dd-38512cdd30f5,city assume new obligation write lend upson di...,0
809,d1b89811-6cb8-4f0f-a475-aea571919863,agreement still need ratify member process pla...,0
835,47ae6294-fab1-42d5-b23b-28ac7b4ee1fe,may faith human inventiveness prevail long run .,1
729,e39ee0e2-d109-4ec2-b9c4-eca24a00e007,hear .,0


In [29]:
test.head(5)

Unnamed: 0,sentence_id,sentence,label
908,0ff59567-45ef-41fd-a74e-622283a4d658,gop must much show party white americans aka p...,1
903,57221b29-dbf7-4ec8-b6c0-865da935d192,instead distract fantasy racial discrimination...,1
898,94b23e07-4978-4d12-9efe-9fcdcd59b1d1,single issue overcome mr . buchanan criticism ...,1
936,4d87c1ea-9983-41c9-bae3-0b34a98f8036,funny thing msm least partly right .,1
878,7636d762-d89f-4e40-b802-9ff9686ed265,half social housing resident rely credit buy -...,0
