In [1]:
from __future__ import annotations
import pandas as pd
import numpy as np

import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

# Loading the data

In [2]:
# Labelled review tables for the two splits; later cells read their "review"
# and "label" columns.
df_train: pd.DataFrame = pd.read_csv(filepath_or_buffer="data/train.csv")
df_test: pd.DataFrame = pd.read_csv(filepath_or_buffer="data/test.csv")

In [3]:
# Embedding arrays stored as .npy files, one per split. The dataset class pairs
# row i of an array with row i of the matching table, so the two are presumably
# saved in the same order -- TODO confirm against whatever produced the files.
# np.load is given the path directly; it opens and closes the file itself.
embedded_train: np.ndarray = np.load("data/train_embedded.npy")
embedded_test: np.ndarray = np.load("data/test_embedded.npy")

# Creating a Dataset

In [4]:
class TextClassificationDataset(Dataset):
    """Map-style dataset that yields ``(text, label, embedding)`` by position.

    The three inputs are paired purely by position: item ``idx`` is built from
    ``text[idx]``, ``labels[idx]`` and ``embeddings[idx]``.

    Args:
        text: Sequence of raw texts. It is copied into a NumPy array, so a list
            or a pandas Series can be passed.
        labels: Sequence of class labels, one per text; also copied into a
            NumPy array.
        embeddings: Array whose first axis indexes the samples. Stored as-is
            (not copied).

    Raises:
        ValueError: If ``text``, ``labels`` and ``embeddings`` do not all hold
            the same number of samples. Without this check a mismatch would
            silently pair texts and labels with the wrong embeddings.
    """

    def __init__(self, text: list[str], labels: list[int], embeddings: np.ndarray):
        self.text: np.ndarray = np.array(text)
        self.labels: np.ndarray = np.array(labels)
        self.embeddings: np.ndarray = embeddings

        n_text = len(self.text)
        n_labels = len(self.labels)
        n_embeddings = self.embeddings.shape[0]
        if not (n_text == n_labels == n_embeddings):
            raise ValueError(
                "text, labels and embeddings must have the same number of samples, "
                f"got {n_text}, {n_labels} and {n_embeddings}"
            )

    def __len__(self) -> int:
        """Return the number of samples (rows of the embedding array)."""
        return self.embeddings.shape[0]

    def __getitem__(self, idx: int):
        """Return the ``(text, label, embedding)`` triple stored at ``idx``."""
        return self.text[idx], self.labels[idx], self.embeddings[idx]

In [5]:
# Wrap each split in a dataset. The string label "good" is encoded as 1 and
# every other label value as 0.
training_data = TextClassificationDataset(
    text=df_train["review"],
    labels=df_train["label"].apply(lambda label: int(label == "good")),
    embeddings=embedded_train,
)

test_data = TextClassificationDataset(
    text=df_test["review"],
    labels=df_test["label"].apply(lambda label: int(label == "good")),
    embeddings=embedded_test,
)

# Training a model

In [94]:
class SimpleLinearClassifier(torch.nn.Module):
    """A single linear layer with optional dropout applied to its input.

    ``forward`` returns raw logits; ``inference`` passes those logits through a
    sigmoid.

    Args:
        dims_input: Number of input features of the linear layer.
        dims_output: Number of output features of the linear layer.
        dropout: Dropout probability applied to the input (default ``0.0``).
    """

    def __init__(self, dims_input: int, dims_output: int, dropout: float = 0.0):
        super().__init__()
        self.linear = torch.nn.Linear(in_features=dims_input, out_features=dims_output)
        self.sigmoid = torch.nn.Sigmoid()
        self.dropout = torch.nn.Dropout(p=dropout)

    def forward(self, x):
        """Apply dropout to ``x`` and return the linear layer's logits."""
        return self.linear(self.dropout(x))

    def inference(self, x):
        """Return the sigmoid of the logits computed by ``forward``."""
        return self.sigmoid(self.forward(x))

In [14]:
def train(model, dataloader, criterion, optimizer) -> float:
    """Run one optimisation pass over ``dataloader`` and return the mean loss.

    Args:
        model: Module called on each batch of embeddings; put into train mode.
        dataloader: Iterable of ``(text, labels, embeddings)`` batches. The
            text element is not used.
        criterion: Loss called as ``criterion(prediction, target)`` after both
            have been squeezed and the labels cast to float.
        optimizer: Optimizer whose ``step`` accepts a closure.

    Returns:
        The mean of every loss value recorded during the pass. One value is
        recorded per closure call, so an optimizer that evaluates the closure
        several times per step contributes several values.
    """
    model.train()
    recorded: list[float] = []

    for _, targets, features in dataloader:

        # The step is expressed as a closure so that optimizers which need to
        # re-evaluate the loss (e.g. L-BFGS) can be used as well.
        def forward_backward():
            optimizer.zero_grad()
            prediction = model(features).squeeze()
            target = targets.float().squeeze()
            batch_loss = criterion(prediction, target)
            batch_loss.backward()
            recorded.append(batch_loss.item())
            return batch_loss

        optimizer.step(forward_backward)

    return np.mean(recorded)

In [35]:
def evaluate(model, dataloader, criterion, optimizer=None) -> float:
    """Compute the mean per-batch loss of ``model`` over ``dataloader``.

    The model is switched to eval mode and no gradients are tracked; no
    parameter is updated.

    Args:
        model: Module called on each batch of embeddings.
        dataloader: Iterable of ``(text, labels, embeddings)`` batches. The
            text element is not used.
        criterion: Loss called as ``criterion(prediction, target)`` after both
            have been squeezed and the labels cast to float.
        optimizer: Unused. Kept (now optional) so existing calls that mirror
            the ``train`` signature keep working.

    Returns:
        The mean of the per-batch loss values.
    """
    model.eval()
    batch_losses: list[float] = []
    with torch.no_grad():
        for _, labels, embeddings in dataloader:
            output = model(embeddings)
            loss = criterion(output.squeeze(), labels.float().squeeze())
            batch_losses.append(float(loss))

    return np.mean(batch_losses)

In [55]:
def _percentage(numerator: int, denominator: int) -> float:
    """Return ``numerator / denominator`` as a percentage rounded to one decimal, or 0.0 when the denominator is zero."""
    if denominator == 0:
        return 0.0
    return round(numerator / denominator * 100, 1)


def measure_performance(model, dataloader) -> dict:
    """Compute binary classification metrics of ``model`` over ``dataloader``.

    Probabilities returned by ``model.inference`` are thresholded at 0.5
    (``>= 0.5`` counts as a positive prediction) and compared with the labels;
    label 1 is the positive class and label 0 the negative class. Samples whose
    label is neither 0 nor 1 are not counted.

    Args:
        model: Object providing ``eval()`` and ``inference(embeddings)``; the
            latter is expected to return one probability per sample.
        dataloader: Iterable of ``(text, labels, embeddings)`` batches. The
            text element is not used.

    Returns:
        A dict with keys ``"F1"``, ``"P"`` (precision), ``"R"`` (recall) and
        ``"A"`` (accuracy), each a percentage rounded to one decimal. A metric
        whose denominator is zero is reported as ``0.0``.
    """
    model.eval()
    TP = FP = FN = TN = 0
    with torch.no_grad():
        for _, labels, embeddings in dataloader:
            # reshape(-1) rather than squeeze(): squeeze() turns a one-sample
            # batch into a 0-d tensor, which cannot be iterated or measured.
            probabilities = model.inference(embeddings).reshape(-1)
            truth = labels.reshape(-1)

            predicted_pos = probabilities >= 0.5
            predicted_neg = probabilities < 0.5
            actual_pos = truth == 1
            actual_neg = truth == 0

            TP += int((actual_pos & predicted_pos).sum())
            FP += int((actual_neg & predicted_pos).sum())
            FN += int((actual_pos & predicted_neg).sum())
            TN += int((actual_neg & predicted_neg).sum())

    return {
        "F1": _percentage(2 * TP, 2 * TP + FP + FN),
        "P": _percentage(TP, TP + FP),
        "R": _percentage(TP, TP + FN),
        "A": _percentage(TP + TN, TP + TN + FP + FN),
    }

In [196]:
# Hyperparameters for the first run (unfiltered training data).
# NOTE(review): this cell's execution count (196) is higher than that of the
# DataLoader cell below (112), so the recorded output may have been produced
# with a different BATCH_SIZE -- confirm with Restart & Run All.
LEARNING_RATE: float = 0.01  # learning rate passed to the SGD optimizer
NUM_EPOCHS: int = 10  # number of iterations of the train/evaluate loop
BATCH_SIZE: int = 512  # batch size of the training DataLoader
DROPOUT: float = 0.4  # dropout probability passed to SimpleLinearClassifier

In [112]:
# Training batches are shuffled; the test split is served as one single
# unshuffled batch holding every test sample.
train_dataloader = DataLoader(
    dataset=training_data,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
test_dataloader = DataLoader(
    dataset=test_data,
    batch_size=len(test_data),
    shuffle=False,
)

In [197]:
# Fresh classifier, loss and optimizer for this run.
# 768 is presumably the width of the embedding vectors -- TODO confirm against
# embedded_train.shape[1].
# NOTE(review): no random seed is set in the visible cells, so weight
# initialisation, batch shuffling and dropout differ from run to run.
model = SimpleLinearClassifier(dims_input=768, dims_output=1, dropout=DROPOUT)

# The model's forward pass returns raw logits, hence the "with logits" loss.
criterion = torch.nn.BCEWithLogitsLoss()

# Alternative optimizer, left disabled (train() already supplies the closure it needs):
#optimizer = torch.optim.LBFGS(model.parameters(), lr=LEARNING_RATE, max_iter=20, history_size=200)
optimizer = torch.optim.SGD(params=model.parameters(), lr=LEARNING_RATE)

In [198]:
%%time
for i in range(NUM_EPOCHS):
    # One optimisation pass over the training batches, then the loss and the
    # classification metrics on the test split.
    train_loss = train(model, train_dataloader, criterion, optimizer)
    test_loss = evaluate(model, test_dataloader, criterion, optimizer)
    perf = measure_performance(model, test_dataloader)

    # Emits five lines per epoch: three labelled values, the metrics dict and
    # a blank separator line.
    print(
        f"Epoch     : {i}",
        f"Train loss: {train_loss}",
        f"Test loss : {test_loss}",
        perf,
        "",
        sep="\n",
    )

Epoch     : 0
Train loss: 0.6914991277256651
Test loss : 0.6837975382804871
{'F1': 86.3, 'P': 83.4, 'R': 89.4, 'A': 85.8}

Epoch     : 1
Train loss: 0.6903662564651818
Test loss : 0.6790924072265625
{'F1': 91.3, 'P': 90.2, 'R': 92.4, 'A': 91.2}

Epoch     : 2
Train loss: 0.6892842017292407
Test loss : 0.6744468808174133
{'F1': 91.6, 'P': 86.8, 'R': 97.0, 'A': 91.1}

Epoch     : 3
Train loss: 0.6877652238430589
Test loss : 0.6700020432472229
{'F1': 92.2, 'P': 88.0, 'R': 96.8, 'A': 91.8}

Epoch     : 4
Train loss: 0.6864548111646369
Test loss : 0.6656339764595032
{'F1': 93.0, 'P': 89.5, 'R': 96.8, 'A': 92.7}

Epoch     : 5
Train loss: 0.6851561645571695
Test loss : 0.6614471673965454
{'F1': 93.3, 'P': 90.1, 'R': 96.8, 'A': 93.1}

Epoch     : 6
Train loss: 0.6843689845509506
Test loss : 0.6575391888618469
{'F1': 94.3, 'P': 93.2, 'R': 95.4, 'A': 94.2}

Epoch     : 7
Train loss: 0.6831175579408709
Test loss : 0.6535654067993164
{'F1': 94.4, 'P': 93.2, 'R': 95.6, 'A': 94.3}

Epoch     : 8
Tr

Without addressing the quality of the training data, an accuracy of 94.4% can be achieved.

# Refining the training data
After examining the training data, it is clear that items containing markup (any review with a `<` character, which is what the filter below checks for) are frequently mislabelled.

Applying a simple filter removes 2,668 samples, and the classes remain balanced (bad: 1,999; good: 1,999).

In [206]:
# Keep only the reviews that contain no "<" character. The original row labels
# are kept (the index is not reset); the next cell uses df_train.index to pick
# the matching rows of embedded_train.
df_train = df_train.loc[df_train["review"].map(lambda review: "<" not in review)]

In [214]:
# Rebuild the datasets on the filtered training frame. The embeddings are
# selected with the surviving row labels of df_train so that they stay paired
# with the remaining reviews; the test split is unchanged.
training_data = TextClassificationDataset(
    text=df_train["review"],
    labels=df_train["label"].apply(lambda label: int(label == "good")),
    embeddings=embedded_train[df_train.index],
)

test_data = TextClassificationDataset(
    text=df_test["review"],
    labels=df_test["label"].apply(lambda label: int(label == "good")),
    embeddings=embedded_test,
)

In [232]:
# Hyperparameters for the second run (filtered training data).
# NOTE(review): this cell's execution count (232) is higher than that of the
# DataLoader cell below (216), so the recorded output may have been produced
# with a different BATCH_SIZE -- confirm with Restart & Run All.
LEARNING_RATE: float = 0.5  # learning rate passed to the SGD optimizer
NUM_EPOCHS: int = 50  # number of iterations of the train/evaluate loop
BATCH_SIZE: int = 128  # batch size of the training DataLoader
DROPOUT: float = 0.0  # dropout probability passed to SimpleLinearClassifier

In [216]:
# Loaders over the rebuilt datasets: shuffled training batches, and the whole
# test split as one single unshuffled batch.
train_dataloader = DataLoader(
    dataset=training_data,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
test_dataloader = DataLoader(
    dataset=test_data,
    batch_size=len(test_data),
    shuffle=False,
)

In [233]:
# Re-create the classifier, loss and optimizer so the second run starts from
# newly initialised weights.
# 768 is presumably the width of the embedding vectors -- TODO confirm against
# embedded_train.shape[1].
# NOTE(review): no random seed is set in the visible cells, so weight
# initialisation and batch shuffling differ from run to run.
model = SimpleLinearClassifier(dims_input=768, dims_output=1, dropout=DROPOUT)

# The model's forward pass returns raw logits, hence the "with logits" loss.
criterion = torch.nn.BCEWithLogitsLoss()

# Alternative optimizer, left disabled (train() already supplies the closure it needs):
#optimizer = torch.optim.LBFGS(model.parameters(), lr=LEARNING_RATE, max_iter=20, history_size=200)
optimizer = torch.optim.SGD(params=model.parameters(), lr=LEARNING_RATE)

In [234]:
%%time
for i in range(NUM_EPOCHS):
    # One optimisation pass over the filtered training batches, then the loss
    # and the classification metrics on the test split.
    train_loss = train(model, train_dataloader, criterion, optimizer)
    test_loss = evaluate(model, test_dataloader, criterion, optimizer)
    perf = measure_performance(model, test_dataloader)

    # Emits five lines per epoch: three labelled values, the metrics dict and
    # a blank separator line.
    print(
        f"Epoch     : {i}",
        f"Train loss: {train_loss}",
        f"Test loss : {test_loss}",
        perf,
        "",
        sep="\n",
    )

Epoch     : 0
Train loss: 0.6762487068772316
Test loss : 0.6551909446716309
{'F1': 89.0, 'P': 82.4, 'R': 96.6, 'A': 88.0}

Epoch     : 1
Train loss: 0.6407565698027611
Test loss : 0.6215054988861084
{'F1': 89.7, 'P': 83.7, 'R': 96.6, 'A': 88.9}

Epoch     : 2
Train loss: 0.6092429384589195
Test loss : 0.5912564992904663
{'F1': 91.7, 'P': 88.0, 'R': 95.6, 'A': 91.3}

Epoch     : 3
Train loss: 0.5811997056007385
Test loss : 0.5641090273857117
{'F1': 92.2, 'P': 89.3, 'R': 95.2, 'A': 91.9}

Epoch     : 4
Train loss: 0.5559907183051109
Test loss : 0.5396519899368286
{'F1': 92.5, 'P': 90.1, 'R': 95.0, 'A': 92.3}

Epoch     : 5
Train loss: 0.5328267216682434
Test loss : 0.5174843668937683
{'F1': 93.0, 'P': 91.7, 'R': 94.4, 'A': 92.9}

Epoch     : 6
Train loss: 0.5118910446763039
Test loss : 0.49734389781951904
{'F1': 93.6, 'P': 93.6, 'R': 93.6, 'A': 93.6}

Epoch     : 7
Train loss: 0.4932027831673622
Test loss : 0.4789845645427704
{'F1': 93.7, 'P': 93.4, 'R': 94.0, 'A': 93.7}

Epoch     : 8
T

Filtering the training data gives a slight improvement in accuracy, to 95.8%.