In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
import torch

# Download the NLTK resources used later for tokenization and stop-word removal
nltk.download('punkt')
nltk.download('stopwords')

cuda_available = torch.cuda.is_available()
device = 'cuda:0' if cuda_available else 'cpu'

# Seed every RNG in play. DataFrame.sample() falls back to NumPy's global
# random state when no random_state is given, so seeding torch alone would
# leave the train/validation shuffle different on every fresh run.
torch.manual_seed(0)
np.random.seed(0)

# Paragraph-level corpus: the first four lines of the TSV are skipped and the
# column names are supplied explicitly.
data = pd.read_csv(
    "data/dontpatronizeme_pcl.tsv",
    sep="\t",
    names=['par_id', 'art_id', 'keyword', 'country', 'text', 'label'],
    skiprows=4,
)

# Binarise the label: values 0 and 1 become 0, every other value becomes 1.
data['label'] = data['label'].apply(lambda x: 0 if x in [0, 1] else 1)

# Paragraph ids that define the two splits. The former self-assignments
# `trids.par_id = trids.par_id` / `teids.par_id = teids.par_id` were no-ops
# and have been dropped.
trids = pd.read_csv('data/train_semeval_parids-labels.csv')
teids = pd.read_csv('data/dev_semeval_parids-labels.csv')

data

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\taow\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\taow\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,par_id,art_id,keyword,country,text,label
0,1,@@24942188,hopeless,ph,"We 're living in times of absolute insanity , ...",0
1,2,@@21968160,migrant,gh,"In Libya today , there are countless number of...",0
2,3,@@16584954,immigrant,ie,White House press secretary Sean Spicer said t...,0
3,4,@@7811231,disabled,nz,Council customers only signs would be displaye...,0
4,5,@@1494111,refugee,ca,""" Just like we received migrants fleeing El Sa...",0
...,...,...,...,...,...,...
10464,10465,@@14297363,women,lk,Sri Lankan norms and culture inhibit women fro...,0
10465,10466,@@70091353,vulnerable,ph,He added that the AFP will continue to bank on...,0
10466,10467,@@20282330,in-need,ng,""" She has one huge platform , and information ...",1
10467,10468,@@16753236,hopeless,in,""" Anja Ringgren Loven I ca n't find a word to ...",1


In [2]:
# Rebuild training set

# Index the corpus by paragraph id once, so all ids are resolved in a single
# lookup instead of three full boolean scans of `data` per id.
# drop_duplicates keeps the first row per par_id, matching the `.values[0]`
# first-match behaviour of a row-by-row lookup.
par_lookup = data.drop_duplicates(subset='par_id').set_index('par_id')

# .loc with an array of labels returns rows in the order of `trids` and raises
# KeyError if any requested id is absent from `data`.
train_set = (
    par_lookup.loc[trids.par_id.to_numpy(), ['keyword', 'text', 'label']]
    .rename(columns={'keyword': 'community'})
    .rename_axis('par_id')
    .reset_index()  # columns: par_id, community, text, label
)

# Split train into train and internal validation set 80:20
val_size = int(len(train_set) * 0.2)

# Fixed random_state so the shuffle (and therefore the split and every metric
# computed from it) is identical on every run.
train_set = train_set.sample(frac=1, random_state=0)
val_set = train_set.iloc[0:val_size].reset_index(drop=True).copy()
train_set = train_set.iloc[val_size:].reset_index(drop=True).copy()

In [8]:
# Rebuild test set

# Index the corpus by paragraph id once, so all ids are resolved in a single
# lookup instead of three full boolean scans of `data` per id.
# drop_duplicates keeps the first row per par_id, matching the `.values[0]`
# first-match behaviour of a row-by-row lookup.
test_lookup = data.drop_duplicates(subset='par_id').set_index('par_id')

# .loc with an array of labels returns rows in the order of `teids` and raises
# KeyError if any requested id is absent from `data`.
test_set = (
    test_lookup.loc[teids.par_id.to_numpy(), ['keyword', 'text', 'label']]
    .rename(columns={'keyword': 'community'})
    .rename_axis('par_id')
    .reset_index()  # columns: par_id, community, text, label
)

# Shuffle with a fixed seed so the row order is reproducible, then drop any
# row that has a missing value in one of its columns.
test_set = test_set.sample(frac=1, random_state=0)
test_set = test_set.dropna()

In [4]:
# Function to preprocess documents
def preprocess(documents):
    """Normalise raw documents into space-joined, filtered token strings.

    Each document is lower-cased and split with ``word_tokenize``. A token is
    kept only when ``str.isalpha()`` is true for it and it is not in NLTK's
    English stop-word list. The surviving tokens are joined with single spaces.

    Args:
        documents: Iterable of strings; each item must support ``.lower()``.

    Returns:
        list[str]: One cleaned string per input document, in input order.
    """
    # Built once and reused for every document; a set gives O(1) membership tests.
    english_stops = set(stopwords.words("english"))

    def _clean(text):
        kept = (
            token
            for token in word_tokenize(text.lower())
            if token.isalpha() and token not in english_stops
        )
        return " ".join(kept)

    return [_clean(doc) for doc in documents]

# Clean the text of both splits and pull out the matching label arrays
documents_train = preprocess(train_set["text"])
documents_dev = preprocess(val_set["text"])
y_train, y_dev = train_set["label"].values, val_set["label"].values

# Bag-of-words counts: the vocabulary is learned from the training split only
# and then applied unchanged to the validation split.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(documents_train)
X_dev = vectorizer.transform(documents_dev)

# fit() returns the estimator itself, so construction and fitting can be chained
nb_classifier = MultinomialNB().fit(X_train, y_train)

# Score the model on the internal validation split
dev_predictions = nb_classifier.predict(X_dev)
accuracy, f1 = (
    accuracy_score(y_dev, dev_predictions),
    f1_score(y_dev, dev_predictions),
)

print(f"Development Set Accuracy: {accuracy}")
print(f"Development Set F1 Score: {f1}")

Development Set Accuracy: 0.9062686567164179
Development Set F1 Score: 0.11299435028248589


Evaluation on the official dev set (used here as the held-out test set)

In [9]:
documents_test = preprocess(test_set["text"])

# Vectorize the test documents with the vectorizer fitted on the training data.
# The matrix is kept sparse: MultinomialNB accepts sparse input, and calling
# .toarray() would allocate a dense (n_documents x vocabulary) array for no gain.
X_test = vectorizer.transform(documents_test)
y_test = test_set["label"].values

test_predictions = nb_classifier.predict(X_test)

# Calculate evaluation metrics from the predictions already computed above.
# nb_classifier.score() would run predict() a second time; accuracy_score on
# the same predictions gives the same value.
test_accuracy = accuracy_score(y_test, test_predictions)
test_f1score = f1_score(y_test, test_predictions)

print(f"Test score accuracy: {test_accuracy}")
print(f"Test F1 score: {test_f1score}")

Test score accuracy: 0.9053989488772097
Test F1 score: 0.11607142857142858
