In [1]:
!python -m spacy download en_core_web_sm

2023-12-10 15:46:03.234384: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-10 15:46:03.234464: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-10 15:46:03.234515: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-10 15:46:03.256640: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Collecting en-core-web-sm==3.6.0
  Downloading htt

## Imports

In [2]:
import pandas as pd
import numpy as np
import os
import random
import string
import re

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

import spacy

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Merging Data

This is done to apply preprocessing to all sequences before dividing them into a train, val and test splits.

In [3]:
PATH = "/content/drive/MyDrive/Class Stuff/Fall23/NLP/Final Project/Data"

In [4]:
# train = pd.read_csv(os.path.join(PATH,"train.csv"))
# dev = pd.read_csv(os.path.join(PATH,"dev.csv"))

In [5]:
# data = pd.concat([train,dev])
# data.to_csv(os.path.join(PATH,'merged.csv'), index=False)
data = pd.read_csv(os.path.join(PATH, "merged.csv"))

In [6]:
data[:100]

Unnamed: 0,sentence_id,sentence,label,solved_conflict
0,b9e1635a-72aa-467f-86d6-f56ef09f62c3,Gone are the days when they led the world in r...,SUBJ,True
1,f99b5143-70d2-494a-a2f5-c68f10d09d0a,The trend is expected to reverse as soon as ne...,OBJ,False
2,4076639c-aa56-4202-ae0f-9d9217f8da68,But there is the specious point again.,OBJ,False
3,b057c366-698e-419d-a284-9b16d835c64e,He added he wouldn’t be surprised to see a new...,OBJ,False
4,a5a9645e-7850-41ba-90a2-5def725cd5b8,"Not less government, you see; the same amount ...",SUBJ,False
...,...,...,...,...
95,921a1f46-1494-4004-95ed-d26b907d3aae,It has been a bumpy road even to this juncture.,SUBJ,False
96,c2ca3827-247d-4302-971d-8054d32c73aa,"Socialists believe that, if everyone cannot ha...",OBJ,True
97,58d703aa-131b-40b4-8373-0c34b86bb1c7,Vučić said he regretted it but cited a deepeni...,OBJ,False
98,cb0dd788-2074-4469-a8f9-24d163db41c6,“There have been more rightwing extremists spr...,OBJ,False


## Preprocessing

The text contains the following that will be changed/removed:


*   Capitalization of words, particularly proper nouns.
*   Punctuation specifically in words like [that's], [I'm], etc.
*   There are several " ’ " instead of " ' " that need to be changed before removing punctuation
* Some number are large enough that they are separated by commas. Example: "2,000,000". When removing commas any commas removed from numbers need to be done such that the meaning of the number remains the same. Example: "2,000,000" becomes "2000000" and not "2 000 000"

### Change text to lowercase

In [7]:
def to_lower(phrase):
  return phrase.lower()

In [8]:
data["sentence"] = data["sentence"].apply(to_lower)

### Remove punctuation

In [9]:
#Remove all punctuation except words that have [']
#Punctuation not removed $, %, -, .
def remove_punc(text):
    # Replace Unicode character ’ with '
    text = re.sub('’', "'", text)

    # Remove commas from numbers without introducing spaces
    text = re.sub(r'(\d),(\d)', r'\1\2', text)

    # Keep $, %, -, . and words with 's; replace other punctuation with spaces
    pattern = r"[^\w\s$%.'\-]"
    return re.sub(pattern, ' ', text)

In [10]:
data["sentence"] = data["sentence"].apply(remove_punc)

### Normalize

In [11]:
def replacements(text):
    # Substitutes 've with have
    text = re.sub(r"\b(\w+)'ve\b", r"\1 have", text)

    # Substitutes n't with not
    text = re.sub(r"\b(\w+)n't\b", r"\1 not", text)

    # Substitutes 'd with would
    text = re.sub(r"\b(\w+)'d\b", r"\1 would", text)

    # Substitutes 'll with will
    text = re.sub(r"\b(\w+)'ll\b", r"\1 will", text)

    # Substitutes 're with are
    text = re.sub(r"\b(\w+)'re\b", r"\1 are", text)

    # Substitutes 'm with am
    text = re.sub(r"\b(\w+)'m\b", r"\1 am", text)

    return text

In [12]:
data["sentence"] = data["sentence"].apply(replacements)

### Separate $, %, - and . from words/numbers

In [13]:
def add_spaces(text):
    # Add spaces before and after $, %, -, .
    pattern = r'(\$|%|-|\.)'
    return re.sub(pattern, r' \1 ', text)

In [14]:
data["sentence"] = data["sentence"].apply(add_spaces)

### Apply stemming

In [15]:
def remove_apostrophe_s(text):
    # Replace 's at the end of words
    return re.sub(r"\b(\w+)'s\b", r"\1", text)

In [16]:
data["sentence"] = data["sentence"].apply(remove_apostrophe_s)

In [None]:
for index, row in data.iterrows():
  print(row)

### Apply lemmatization

In [17]:
def lemmatize_sentence_spacy(sentence):
    # Load the English language model in spaCy
    nlp = spacy.load('en_core_web_sm')

    # Create a Doc object for the input sentence
    doc = nlp(sentence)

    # Lemmatize each token and join them back into a sentence
    lemmatized_sentence = ' '.join([token.lemma_ for token in doc])

    return lemmatized_sentence

In [18]:
data["sentence"] = data["sentence"].apply(lemmatize_sentence_spacy)

### Remove stopwords

In [19]:
def remove_stopwords_from_sentence(sentence):
    tokens = nltk.word_tokenize(sentence)

    stop_words = set(stopwords.words('english'))

    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

    sentence_without_stopwords = ' '.join(filtered_tokens)

    return sentence_without_stopwords

In [20]:
data["sentence"] = data["sentence"].apply(remove_stopwords_from_sentence)

In [21]:
#Final output after preprocessing
data[:5]

Unnamed: 0,sentence_id,sentence,label,solved_conflict
0,b9e1635a-72aa-467f-86d6-f56ef09f62c3,go day lead world recession - bust,SUBJ,True
1,f99b5143-70d2-494a-a2f5-c68f10d09d0a,trend expect reverse soon next month .,OBJ,False
2,4076639c-aa56-4202-ae0f-9d9217f8da68,specious point .,OBJ,False
3,b057c366-698e-419d-a284-9b16d835c64e,add would surprised see new variant altogether...,OBJ,False
4,a5a9645e-7850-41ba-90a2-5def725cd5b8,less government see amount government less mon...,SUBJ,False


### Create binary labels and remove "solved_conflict" column

Here, all labels that are "SUBJ" in column "label" are changed to 1 and "OBJ" are changed to 0. Additionally, the "solved_conflict" column is additional information that is not required. Thus, it is removed


In [22]:
data['label'] = data['label'].replace({'SUBJ': 1, 'OBJ': 0})
data = data.drop('solved_conflict', axis=1)

In [23]:
data[:5]

Unnamed: 0,sentence_id,sentence,label
0,b9e1635a-72aa-467f-86d6-f56ef09f62c3,go day lead world recession - bust,1
1,f99b5143-70d2-494a-a2f5-c68f10d09d0a,trend expect reverse soon next month .,0
2,4076639c-aa56-4202-ae0f-9d9217f8da68,specious point .,0
3,b057c366-698e-419d-a284-9b16d835c64e,add would surprised see new variant altogether...,0
4,a5a9645e-7850-41ba-90a2-5def725cd5b8,less government see amount government less mon...,1


## Creating a data split of train, val and test


In [26]:
num_rows = len(data)
random.seed(42)
#FIX: Issue with appending obj and subj

#Isolating all rows containing OBJ and SUBJ to ensure an equal split of both in train, test and val
obj = data.loc[data['label'] == 0]
subj = data.loc[data['label'] == 1]

train_size_obj = int(0.8*len(obj))
train_size_subj = int(0.8*len(subj))
val_size_obj = int(0.1*len(obj))
val_size_subj = int(0.1*len(subj))

train = obj.iloc[:train_size_obj, :]
val = obj.iloc[train_size_obj: train_size_obj + val_size_obj, :]
test = obj.iloc[train_size_obj + val_size_obj:, :]
train = subj.iloc[:train_size_subj, :]
val = subj.iloc[train_size_subj: train_size_subj + val_size_subj, :]
test = subj.iloc[train_size_subj + val_size_subj:, :]

train = train.sample(frac=1, random_state=42)
val = val.sample(frac=1, random_state=42)
test = test.sample(frac=1, random_state=42)

# Save each split into separate CSV files
train.to_csv(os.path.join(PATH, 'train.csv'), index=False)
val.to_csv(os.path.join(PATH, 'val.csv'), index=False)
test.to_csv(os.path.join(PATH, 'test.csv'), index=False)

In [27]:
print(f"Number of lines in the training set: {len(train)}")
print(f"Number of lines in the validation set: {len(val)}")
print(f"Number of lines in the testing set: {len(test)}")

Number of lines in the training set: 286
Number of lines in the validation set: 35
Number of lines in the testing set: 37


In [28]:
train.head(5)

Unnamed: 0,sentence_id,sentence,label
28,eaebd7db-6f3d-4c70-98d6-34b225389533,first habit one think traditional image machin...,1
748,3dd7cedd-5292-4a2c-9d2c-0b1ffc47c13c,way exchange freedom something else security s...,1
394,626870c1-1d6e-44a9-ab4a-3b0e2e722eec,vučić former far - right radical become prime ...,1
567,a287c7ad-4405-4673-bc10-0548d0ddc7ad,impressive number charity seem focus somalis m...,1
614,6a052558-8985-4825-a2a9-2bb8a0fdd512,zombie neoliberalism level already rich .,1


In [29]:
val.head(5)

Unnamed: 0,sentence_id,sentence,label
854,ba789da0-801c-43ce-9d77-f5aac5d6c4aa,cost living crisis exacerbate long - run ever ...,1
833,d2fff27d-5159-4d75-8960-8a69dc9fb4a0,dr . tanton choice may seem astounding view pe...,1
852,78fce583-32bb-46ce-95f0-e2302c9c0ee3,even liz handout yes tax cut ' truss ministry ...,1
846,1a8992a7-8686-4e96-b4b9-f10fd54e75a9,paraphrase hero manner even laffer curve - lov...,1
837,0d0c5c78-1533-4424-bbcf-56ec6f6fa666,wailing betray fatal tendency non - economist ...,1


In [30]:
test.head(5)

Unnamed: 0,sentence_id,sentence,label
902,6edaafad-75bf-41ef-9b6f-1752aadd2c2d,least immediate moment normal life plunge exte...,1
897,7b7085cb-a71c-44b7-8cc0-1823ec89ca05,also true mass immigration control already 90 ...,1
883,da0f5ab6-49cf-4238-871c-4b669fb349e8,could possibly go wrong,1
926,1ccb946b-24a5-4d85-82e7-e0816b81011a,simultaneously practice eye surgeon formidable...,1
936,4d87c1ea-9983-41c9-bae3-0b34a98f8036,funny thing msm least partly right .,1
