In [1]:
!python -m spacy download en_core_web_sm

2023-12-09 22:12:22.486515: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-09 22:12:22.488186: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-09 22:12:22.488252: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-09 22:12:22.501863: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Collecting en-core-web-sm==3.6.0
  Downloading htt

## Imports

In [2]:
import pandas as pd
import numpy as np
import os
import random
import string
import re

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

import spacy

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Merging Data

This is done so that the same preprocessing is applied to all sentences before they are divided into train, val and test splits.

In [3]:
# Directory that merged.csv is read from and that the train/val/test CSV splits are written to.
PATH = "path"

In [4]:
# train = pd.read_csv(os.path.join(PATH,"train.csv"))
# dev = pd.read_csv(os.path.join(PATH,"dev.csv"))

In [5]:
# One-time merge of the train and dev frames into merged.csv; left commented out
# because the merged file already exists and is loaded directly below.
# data = pd.concat([train,dev])
# data.to_csv(os.path.join(PATH,'merged.csv'), index=False)
# Load the merged dataset that every preprocessing step below operates on.
data = pd.read_csv(os.path.join(PATH, "merged.csv"))

In [6]:
# Preview the first 100 raw rows before any preprocessing.
data.head(100)

Unnamed: 0,sentence_id,sentence,label,solved_conflict
0,b9e1635a-72aa-467f-86d6-f56ef09f62c3,Gone are the days when they led the world in r...,SUBJ,True
1,f99b5143-70d2-494a-a2f5-c68f10d09d0a,The trend is expected to reverse as soon as ne...,OBJ,False
2,4076639c-aa56-4202-ae0f-9d9217f8da68,But there is the specious point again.,OBJ,False
3,b057c366-698e-419d-a284-9b16d835c64e,He added he wouldn’t be surprised to see a new...,OBJ,False
4,a5a9645e-7850-41ba-90a2-5def725cd5b8,"Not less government, you see; the same amount ...",SUBJ,False
...,...,...,...,...
95,921a1f46-1494-4004-95ed-d26b907d3aae,It has been a bumpy road even to this juncture.,SUBJ,False
96,c2ca3827-247d-4302-971d-8054d32c73aa,"Socialists believe that, if everyone cannot ha...",OBJ,True
97,58d703aa-131b-40b4-8373-0c34b86bb1c7,Vučić said he regretted it but cited a deepeni...,OBJ,False
98,cb0dd788-2074-4469-a8f9-24d163db41c6,“There have been more rightwing extremists spr...,OBJ,False


## Preprocessing

The text contains the following that will be changed/removed:


*   Capitalization of words, particularly proper nouns.
*   Punctuation specifically in words like [that's], [I'm], etc.
*   Several sentences use the typographic apostrophe " ’ " instead of " ' "; these need to be normalized before punctuation is removed.
* Some numbers are large enough that their digits are grouped with commas, e.g. "2,000,000". These commas must be removed without inserting spaces so that the number keeps its meaning: "2,000,000" becomes "2000000" and not "2 000 000".

### Change text to lowercase

In [7]:
def to_lower(phrase):
  """Return `phrase` with every character converted to lowercase."""
  lowered = phrase.lower()
  return lowered

In [8]:
# Lowercase every sentence, overwriting the "sentence" column.
data["sentence"] = data["sentence"].apply(to_lower)

### Remove punctuation

In [9]:
def remove_punc(text):
    """Replace punctuation in `text` with spaces, keeping a small allow-list.

    Kept as-is: word characters, whitespace, and the symbols $ % . ' -
    (the apostrophe is kept so contractions such as "that's" survive).
    Every other character is replaced by a single space.

    Two normalizations run before the punctuation pass:
      * the typographic apostrophe ’ is converted to a plain '
      * a comma sitting between two digits is dropped without leaving a
        space behind, so "2,000,000" becomes "2000000"
    """
    apostrophes_fixed = re.sub('’', "'", text)
    commas_joined = re.sub(r'(\d),(\d)', r'\1\2', apostrophes_fixed)
    return re.sub(r"[^\w\s$%.'\-]", ' ', commas_joined)

In [10]:
# Run remove_punc over every sentence, overwriting the "sentence" column.
data["sentence"] = data["sentence"].apply(remove_punc)

### Normalize

In [11]:
def replacements(text):
    """Expand common English contractions in `text`.

    Handles 've -> have, n't -> not, 'd -> would, 'll -> will, 're -> are
    and 'm -> am. Irregular negations whose stem changes are expanded
    explicitly, because the generic n't rule would otherwise produce
    "ca not", "wo not" and "sha not".

    Args:
        text: Input string; apostrophes must already be the plain ' character.

    Returns:
        The string with the contractions above expanded. Note that 'd is
        always expanded to "would", even where "had" was meant.
    """
    substitutions = (
        # Irregular negations must run before the generic n't rule below.
        (r"\b([Cc]a)n't\b", r"\1n not"),      # can't  -> can not
        (r"\b([Ww])on't\b", r"\1ill not"),    # won't  -> will not
        (r"\b([Ss]ha)n't\b", r"\1ll not"),    # shan't -> shall not
        # Regular contractions, in the original order.
        (r"\b(\w+)'ve\b", r"\1 have"),
        (r"\b(\w+)n't\b", r"\1 not"),
        (r"\b(\w+)'d\b", r"\1 would"),
        (r"\b(\w+)'ll\b", r"\1 will"),
        (r"\b(\w+)'re\b", r"\1 are"),
        (r"\b(\w+)'m\b", r"\1 am"),
    )

    for pattern, expansion in substitutions:
        text = re.sub(pattern, expansion, text)

    return text

In [12]:
# Expand contractions in every sentence, overwriting the "sentence" column.
data["sentence"] = data["sentence"].apply(replacements)

### Separate $, %, - and . from words/numbers

In [13]:
def add_spaces(text):
    """Surround every $, %, - and . in `text` with a space on each side.

    Existing whitespace is left untouched, so a symbol that already had a
    neighbouring space ends up with two.
    """
    return re.sub(r'(\$|%|-|\.)', r' \1 ', text)

In [14]:
# Pad $, %, - and . with spaces in every sentence, overwriting the "sentence" column.
data["sentence"] = data["sentence"].apply(add_spaces)

### Apply stemming (strip trailing 's)

In [15]:
def remove_apostrophe_s(text):
    """Strip a trailing 's from every word in `text` ("john's" -> "john")."""
    stripped = re.sub(r"\b(\w+)'s\b", r"\1", text)
    return stripped

In [16]:
# Drop the trailing 's from words in every sentence, overwriting the "sentence" column.
data["sentence"] = data["sentence"].apply(remove_apostrophe_s)

### Apply lemmatization

In [17]:
# Loaded spaCy pipelines keyed by model name. Loading a model is expensive, so
# it is done once per kernel session instead of once per sentence.
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(model_name='en_core_web_sm'):
    """Return the spaCy pipeline for `model_name`, loading it on first use."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINES[model_name]


def lemmatize_sentence_spacy(sentence):
    """Return `sentence` with every token replaced by its spaCy lemma.

    The sentence is run through the 'en_core_web_sm' pipeline, which is
    loaded on the first call and reused afterwards, and the lemmas of all
    tokens are joined with single spaces.

    Args:
        sentence: Text to lemmatize.

    Returns:
        The space-joined lemmas of the tokens spaCy found in `sentence`.
    """
    doc = _get_spacy_pipeline()(sentence)
    return ' '.join(token.lemma_ for token in doc)

In [18]:
# Replace every sentence with its space-joined spaCy lemmas, overwriting the "sentence" column.
data["sentence"] = data["sentence"].apply(lemmatize_sentence_spacy)

### Remove stopwords

In [19]:
# NLTK English stopword set, built lazily on the first call and reused for
# every later one so the corpus file is not re-read per sentence.
_ENGLISH_STOP_WORDS = None


def remove_stopwords_from_sentence(sentence, stop_words=None):
    """Return `sentence` with stopwords removed.

    The sentence is tokenized with `nltk.word_tokenize`; tokens whose
    lowercase form is a stopword are dropped and the rest are joined with
    single spaces.

    Args:
        sentence: Text to filter.
        stop_words: Optional collection of lowercase words to drop. When
            omitted, the NLTK English stopword list is used.

    Returns:
        The remaining tokens joined by single spaces.
    """
    global _ENGLISH_STOP_WORDS
    if stop_words is None:
        if _ENGLISH_STOP_WORDS is None:
            _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOP_WORDS

    tokens = nltk.word_tokenize(sentence)
    return ' '.join(token for token in tokens if token.lower() not in stop_words)

In [20]:
# Filter stopwords out of every sentence, overwriting the "sentence" column.
data["sentence"] = data["sentence"].apply(remove_stopwords_from_sentence)

In [21]:
# Preview the first rows once the whole preprocessing pipeline has run.
data.head(5)

Unnamed: 0,sentence_id,sentence,label,solved_conflict
0,b9e1635a-72aa-467f-86d6-f56ef09f62c3,go day lead world recession - bust,SUBJ,True
1,f99b5143-70d2-494a-a2f5-c68f10d09d0a,trend expect reverse soon next month .,OBJ,False
2,4076639c-aa56-4202-ae0f-9d9217f8da68,specious point .,OBJ,False
3,b057c366-698e-419d-a284-9b16d835c64e,add would surprised see new variant altogether...,OBJ,False
4,a5a9645e-7850-41ba-90a2-5def725cd5b8,less government see amount government less mon...,SUBJ,False


### Create binary labels and remove "solved_conflict" column

Here, every "SUBJ" value in the "label" column is changed to 1 and every "OBJ" value to 0. The "solved_conflict" column holds extra information that is not required, so it is removed.


In [22]:
# Encode the class label as an integer: SUBJ -> 1, OBJ -> 0.
data['label'] = data['label'].replace(
    {
        'SUBJ': 1,
        'OBJ': 0,
    }
)
# Discard the conflict flag column.
data = data.drop(columns='solved_conflict')

In [23]:
# Preview the first rows after label encoding.
data.head(5)

Unnamed: 0,sentence_id,sentence,label
0,b9e1635a-72aa-467f-86d6-f56ef09f62c3,go day lead world recession - bust,1
1,f99b5143-70d2-494a-a2f5-c68f10d09d0a,trend expect reverse soon next month .,0
2,4076639c-aa56-4202-ae0f-9d9217f8da68,specious point .,0
3,b057c366-698e-419d-a284-9b16d835c64e,add would surprised see new variant altogether...,0
4,a5a9645e-7850-41ba-90a2-5def725cd5b8,less government see amount government less mon...,1


## Creating a data split of train, val and test


In [24]:
# Seeds Python's `random` module; the shuffle below is seeded separately
# through `random_state`.
random.seed(42)

# Shuffle all rows reproducibly before cutting the frame into splits.
shuffled_data = data.sample(frac=1, random_state=42)

# 80% train, 10% validation; whatever is left after truncation goes to test.
num_rows = len(data)
train_size, val_size = int(0.8 * num_rows), int(0.1 * num_rows)

train = shuffled_data.iloc[:train_size]
val = shuffled_data.iloc[train_size:train_size + val_size]
test = shuffled_data.iloc[train_size + val_size:]

# Write each split to its own CSV file under PATH.
# NOTE(review): 'train.csv' under PATH is also the file the commented-out merge
# cell earlier in the notebook reads - confirm the raw file is not overwritten.
train.to_csv(os.path.join(PATH, 'train.csv'), index=False)
val.to_csv(os.path.join(PATH, 'val.csv'), index=False)
test.to_csv(os.path.join(PATH, 'test.csv'), index=False)

In [25]:
# Report how many rows ended up in each split.
print(
    f"Number of lines in the training set: {len(train)}",
    f"Number of lines in the validation set: {len(val)}",
    f"Number of lines in the testing set: {len(test)}",
    sep="\n",
)

Number of lines in the training set: 751
Number of lines in the validation set: 93
Number of lines in the testing set: 95


In [26]:
# Preview the first five rows of the training split.
train.head()

Unnamed: 0,sentence_id,sentence,label
299,d1264074-9bfb-45e4-b8f8-3e2ecd91ec2e,commitment government make seem start slogan p...,1
63,e14d1c51-0140-45f8-b234-0d29d271cec0,per capita cost government increase follow 188...,0
136,ad238ccb-6a49-4e20-ba92-626b605e2f15,public credit nation different .,0
597,33e18126-bf59-4bf8-a9c4-e93d53f477bd,sunday anti - pride demonstrator include biker...,0
261,2d9bea46-05b1-4927-8ff4-0dbb817df26c,price increase also see good service directly ...,0


In [27]:
# Preview the first five rows of the validation split.
val.head()

Unnamed: 0,sentence_id,sentence,label
897,7b7085cb-a71c-44b7-8cc0-1823ec89ca05,also true mass immigration control already 90 ...,1
577,8c75f51f-86b6-440f-aa59-6607a56a2d18,course bf . 7 transmissible variant .,0
85,827fb861-5a2f-4879-b20a-b06987f6c5d0,fda issue new guidance antiviral drug know tec...,0
242,0c8f0a07-13f5-4ae6-940e-84445e886481,preach gospel responsibility state administer ...,1
698,ba6ab67e-1d0a-4cc1-8ce6-db2e5b208bdb,another analysis official uk government datum ...,0


In [28]:
# Preview the first five rows of the test split.
test.head()

Unnamed: 0,sentence_id,sentence,label
269,a88180be-fe81-454f-b4f1-3b9ee0d5239c,may industry accept responsibility unemploymen...,1
201,c7e50baa-1072-430f-b187-dfea384dc937,tentative agreement allow worker take unpaid s...,0
161,b29b7013-a58b-4cf3-aeab-4a2397fea6e9,somewhere - optimum point although never see d...,1
555,a35ff26e-ed09-4b2b-9c44-79518fe8e221,hero zero simple explanation change approach .,0
729,e39ee0e2-d109-4ec2-b9c4-eca24a00e007,hear .,0
