In [1]:
!python -m spacy download en_core_web_sm

2023-12-14 00:23:47.699601: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-14 00:23:47.699666: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-14 00:23:47.699722: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-14 00:23:47.728215: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Collecting en-core-web-sm==3.6.0
  Downloading htt

## Imports

In [2]:
import pandas as pd
import numpy as np
import os
import random
import string
import re

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

import spacy

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Merging Data

This is done to apply preprocessing to all sequences before dividing them into a train, val and test splits.

In [3]:
PATH = "/content/drive/MyDrive/Class Stuff/Fall23/NLP/Final Project/Data"

In [4]:
# train = pd.read_csv(os.path.join(PATH,"train.csv"))
# dev = pd.read_csv(os.path.join(PATH,"dev.csv"))

In [5]:
# data = pd.concat([train,dev])
# data.to_csv(os.path.join(PATH,'merged.csv'), index=False)
data = pd.read_csv(os.path.join(PATH, "merged.csv"))

In [6]:
data[:100]

Unnamed: 0,sentence_id,sentence,label,solved_conflict
0,b9e1635a-72aa-467f-86d6-f56ef09f62c3,Gone are the days when they led the world in r...,SUBJ,True
1,f99b5143-70d2-494a-a2f5-c68f10d09d0a,The trend is expected to reverse as soon as ne...,OBJ,False
2,4076639c-aa56-4202-ae0f-9d9217f8da68,But there is the specious point again.,OBJ,False
3,b057c366-698e-419d-a284-9b16d835c64e,He added he wouldn’t be surprised to see a new...,OBJ,False
4,a5a9645e-7850-41ba-90a2-5def725cd5b8,"Not less government, you see; the same amount ...",SUBJ,False
...,...,...,...,...
95,921a1f46-1494-4004-95ed-d26b907d3aae,It has been a bumpy road even to this juncture.,SUBJ,False
96,c2ca3827-247d-4302-971d-8054d32c73aa,"Socialists believe that, if everyone cannot ha...",OBJ,True
97,58d703aa-131b-40b4-8373-0c34b86bb1c7,Vučić said he regretted it but cited a deepeni...,OBJ,False
98,cb0dd788-2074-4469-a8f9-24d163db41c6,“There have been more rightwing extremists spr...,OBJ,False


## Preprocessing

The text contains the following that will be changed/removed:


*   Capitalization of words, particularly proper nouns.
*   Punctuation specifically in words like [that's], [I'm], etc.
*   There are several " ’ " instead of " ' " that need to be changed before removing punctuation.
* I have also changed any numbers including decimals to "number", "%" to "percentage" and "$" to "dollar".

In [7]:
def replace_symbols(sentence):
    # Replace "$" with "dollar"
    sentence = re.sub(r'\$', 'dollar ', sentence)

    # Replaces "%" with "percentage"
    sentence = re.sub(r'%', ' percentage', sentence)

    # Replace numbers (including decimals) with "number"
    sentence = re.sub(r'\b\d+(\.\d+)?\b', 'number', sentence)

    return sentence

In [8]:
data["sentence"] = data["sentence"].apply(replace_symbols)

### Change text to lowercase

In [9]:
def to_lower(phrase):
  return phrase.lower()

In [10]:
data["sentence"] = data["sentence"].apply(to_lower)

### Remove punctuation

In [11]:
#Remove all punctuation except words that have [']
#Punctuation not removed $, %, -, .
def remove_punc(text):
    # Replace Unicode character ’ with '
    text = re.sub('’', "'", text)

    pattern = r"[^\w\s']"
    return re.sub(pattern, '', text)

In [12]:
data["sentence"] = data["sentence"].apply(lambda x: remove_punc(x))

### Normalize

In [13]:
def replacements(text):
    # Substitutes 've with have
    text = re.sub(r"\b(\w+)'ve\b", r"\1 have", text)

    # Substitutes n't with not
    text = re.sub(r"\b(\w+)n't\b", r"\1 not", text)

    # Substitutes 'd with would
    text = re.sub(r"\b(\w+)'d\b", r"\1 would", text)

    # Substitutes 'll with will
    text = re.sub(r"\b(\w+)'ll\b", r"\1 will", text)

    # Substitutes 're with are
    text = re.sub(r"\b(\w+)'re\b", r"\1 are", text)

    # Substitutes 'm with am
    text = re.sub(r"\b(\w+)'m\b", r"\1 am", text)

    return text

In [14]:
data["sentence"] = data["sentence"].apply(replacements)

### Apply stemming

In [15]:
def remove_apostrophe_s(text):
    # Replace 's at the end of words
    return re.sub(r"\b(\w+)'s\b", r"\1", text)

In [16]:
data["sentence"] = data["sentence"].apply(remove_apostrophe_s)

### Apply lemmatization

In [17]:
def lemmatize_sentence_spacy(sentence):
    # Load the English language model in spaCy
    nlp = spacy.load('en_core_web_sm')

    # Create a Doc object for the input sentence
    doc = nlp(sentence)

    # Lemmatize each token and join them back into a sentence
    lemmatized_sentence = ' '.join([token.lemma_ for token in doc])

    return lemmatized_sentence

In [18]:
data["sentence"] = data["sentence"].apply(lemmatize_sentence_spacy)

### Remove stopwords

In [19]:
# When applying stopword removal in earlier iterations
# there were several instances where the len(sentence)<4
# Thus, to give the models more context, stop words were only removed
# for len(sentence)>4 when stop words were removed

def remove_stopwords_from_sentence(sentence):
    tokens = nltk.word_tokenize(sentence)

    stop_words = set(stopwords.words('english'))

    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    sentence_without_stopwords = ' '.join(filtered_tokens)

    if len(filtered_tokens) >= 4:
        return sentence_without_stopwords
    else:
        return sentence

In [20]:
data["sentence"] = data["sentence"].apply(remove_stopwords_from_sentence)

There are still several sentences that have ' in their text. Thus, the dataframe is run through the below method to get rid of all remaining '

In [21]:
count = 0
for index, row in data.iterrows():
    if "'" in row['sentence']:
        print(row['sentence'])
        count+=1
        if count==5:
          break

limit focus corporate board shareholder ' financial interest alone corporate law intend confine sphere influence corporation means protect democracy civic institution corporate overreachjust society confer certain legal advantage nonprofit corporation return confine activity sphere charitable cause
government serious equality levelling ' would look opposite bill
conservative reject ' boris johnson term clearly use positive sense drop reference level
even ' racist scandal ' prove rather contrive comparison way african head state force travel bus white gentleman ' joe biden could take car
fortune center disease control prevention recently break category case top number percentage dr stuart ray vice chair medicine datum integrity analytic johns hopkins ' department medicine tell fortune


In [22]:
def remove_leftover_single_quotes(sentence):
    return re.sub(r"'", '', sentence)

In [23]:
data["sentence"] = data["sentence"].apply(remove_leftover_single_quotes)

In [24]:
#Final output after preprocessing
data[:5]

Unnamed: 0,sentence_id,sentence,label,solved_conflict
0,b9e1635a-72aa-467f-86d6-f56ef09f62c3,go day lead world recessionbuste,SUBJ,True
1,f99b5143-70d2-494a-a2f5-c68f10d09d0a,trend expect reverse soon next month,OBJ,False
2,4076639c-aa56-4202-ae0f-9d9217f8da68,but there be the specious point again,OBJ,False
3,b057c366-698e-419d-a284-9b16d835c64e,add would surprised see new variant altogether...,OBJ,False
4,a5a9645e-7850-41ba-90a2-5def725cd5b8,less government see amount government less money,SUBJ,False


### Create binary labels and remove "solved_conflict" column

Here, all labels that are "SUBJ" in column "label" are changed to 1 and "OBJ" are changed to 0. Additionally, the "solved_conflict" column is additional information that is not required. Thus, it is removed


In [25]:
data['label'] = data['label'].replace({'SUBJ': 1, 'OBJ': 0})
data = data.drop('solved_conflict', axis=1)
data = data.drop('sentence_id', axis=1)

In [26]:
data[:5]

Unnamed: 0,sentence,label
0,go day lead world recessionbuste,1
1,trend expect reverse soon next month,0
2,but there be the specious point again,0
3,add would surprised see new variant altogether...,0
4,less government see amount government less money,1


## Creating a data split of train, val and test


In [27]:
num_rows = len(data)
random.seed(42)

#Isolating all rows containing OBJ and SUBJ to ensure an equal split of both in train, test and val
obj = data.loc[data['label'] == 0]
subj = data.loc[data['label'] == 1]

train_size_obj = int(0.8*len(obj))
train_size_subj = int(0.8*len(subj))
val_size_obj = int(0.1*len(obj))
val_size_subj = int(0.1*len(subj))

train_obj = obj.iloc[:train_size_obj, :]
val_obj = obj.iloc[train_size_obj: train_size_obj + val_size_obj, :]
test_obj = obj.iloc[train_size_obj + val_size_obj:, :]
train_subj = subj.iloc[:train_size_subj, :]
val_subj = subj.iloc[train_size_subj: train_size_subj + val_size_subj, :]
test_subj = subj.iloc[train_size_subj + val_size_subj:, :]

train = pd.concat([train_obj, train_subj])
val = pd.concat([val_obj, val_subj])
test = pd.concat([test_obj, test_subj])

train = train.sample(frac=1, random_state=42)
val = val.sample(frac=1, random_state=42)
test = test.sample(frac=1, random_state=42)

# Save each split into separate CSV files
train.to_csv(os.path.join(PATH, 'train.csv'), index=False)
val.to_csv(os.path.join(PATH, 'val.csv'), index=False)
test.to_csv(os.path.join(PATH, 'test.csv'), index=False)

In [28]:
print(f"Number of lines in the training set: {len(train)}")
print(f"Number of lines in the validation set: {len(val)}")
print(f"Number of lines in the testing set: {len(test)}")

Number of lines in the training set: 750
Number of lines in the validation set: 93
Number of lines in the testing set: 96


In [29]:
train.head(5)

Unnamed: 0,sentence,label
120,reason lack actual levelling quite clear peop...,1
572,reason protest lightly use public credit save ...,0
207,new england journal medicine study prove effec...,0
393,biden administration predict nearly number mil...,0
475,movementa movement within government independe...,0


In [30]:
val.head(5)

Unnamed: 0,sentence,label
784,argue reason corporate law codifie shareholder...,0
756,city assume new obligation write lend upson di...,0
809,agreement still need ratify member process pla...,0
835,may faith human inventiveness prevail long run,1
729,that you will be hear,0


In [31]:
test.head(5)

Unnamed: 0,sentence,label
908,gop must much show party white americans aka p...,1
903,instead distract fantasy racial discrimination...,1
898,single issue overcome mr buchanan criticism im...,1
936,funny thing msm least partly right,1
878,half social housing resident rely credit buyno...,0
