# NLP Lab 03 - Logistic regression classifier

Authors:
* Aurelien ROUXEL
* Ethan MACHAVOINE
* Jonathan POELGER

In [66]:
import matplotlib.pyplot as plt
import datasets as ds
import math
import string
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

import torch
from torch import nn

## Features

In [67]:
# Load the "train" and then the "test" split of the IMDB dataset.
ds_train, ds_test = (
    ds.load_dataset("imdb", split=split_name) for split_name in ("train", "test")
)

Found cached dataset imdb (/home/ethan/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
Found cached dataset imdb (/home/ethan/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


In [68]:
# Materialise the text and label columns of each split as NumPy arrays.
text_train = np.array(ds_train["text"])
label_train = np.array(ds_train["label"])
text_test = np.array(ds_test["text"])
label_test = np.array(ds_test["label"])

In [69]:
def preprocessing(base_text: str) -> str:
  """
  Preprocess the text before classification.

  The text is lowercased, every "<br />" tag is replaced by a space, and every
  punctuation character except '!' is replaced by a space ('!' is left in the
  returned text).
  Args:
    base_text: the string to preprocess
  Return:
    The preprocessed text
  """
  # Map every punctuation character but '!' to a space. str.translate applies
  # the whole mapping in one pass instead of rebuilding the string one
  # character at a time.
  ponct = string.punctuation.replace("!", '')
  to_space = str.maketrans(ponct, ' ' * len(ponct))
  return base_text.lower().replace("<br />", ' ').translate(to_space)

# Element-wise version of `preprocessing`, applicable to a whole array of texts.
vectorized_preprocessing = np.vectorize(preprocessing)

# Clean both corpora; the raw texts are replaced by their preprocessed version.
text_train = vectorized_preprocessing(text_train)
text_test = vectorized_preprocessing(text_test)

In [71]:
def load_vader_lexicon(filename = "vader_lexicon.txt"):
    """
    Load the lexicon from VADER sentiment.

    Every non-blank line must hold tab-separated fields whose first two are the
    word and its sentiment measure. Words with a measure >= 1 are collected as
    positive, words with a measure <= -1 as negative; the others are ignored.
    Args:
        filename: the name of file containing the VADER lexicon
    Return
        positive_lexicon: a numpy.array containing positive words
        negative_lexicon: a numpy.array containing negative words
    Raises:
        OSError: if the file cannot be opened
        ValueError: if a non-blank line has fewer than two tab-separated
            fields, or if its measure is not a number
    """
    positive_lexicon = []
    negative_lexicon = []
    # Explicit encoding: decoding must not depend on the platform locale.
    with open(filename, "r", encoding="utf-8") as f:
        # Stream the file instead of loading it whole.
        for line in f:
            line = line.strip()
            # Skip blank and whitespace-only lines.
            if not line:
                continue
            word, measure = line.split('\t')[0:2]
            measure = float(measure)
            if measure >= 1:
                positive_lexicon.append(word)
            elif measure <= -1:
                negative_lexicon.append(word)
    return np.array(positive_lexicon), np.array(negative_lexicon)

# Module-level lexicons, loaded once from the default "vader_lexicon.txt";
# extract_features reads them as globals.
positive_lexicon, negative_lexicon = load_vader_lexicon()

In [72]:
def extract_features(text, pos_lexicon=None, neg_lexicon=None):
    """
    Take a text and extract the features from it.

    The returned vector holds, in this order:
        0. 1 if the word "no" appears in the text, else 0
        1. the number of occurrences of the words "i" and "you"
        2. 1 if the text contains '!', else 0
        3. the natural logarithm of the word count (0 when there is no word)
        4. the number of words found in the positive lexicon
        5. the number of words found in the negative lexicon
    Args:
        text: the text to extract features from
        pos_lexicon: words counted as positive; defaults to the module-level
            `positive_lexicon`
        neg_lexicon: words counted as negative; defaults to the module-level
            `negative_lexicon`
    Returns:
        numpy.array of size 6 containing the features
    """
    # Fall back on the notebook-level lexicons so existing one-argument calls
    # (including the np.vectorize wrapper) keep working unchanged.
    if pos_lexicon is None:
        pos_lexicon = positive_lexicon
    if neg_lexicon is None:
        neg_lexicon = negative_lexicon

    # '!' is not part of any word: drop it before splitting on whitespace.
    # dtype=str keeps an empty word list a string array, so the comparisons
    # below stay well-defined for an empty text.
    words = np.array(text.replace('!', '').split(), dtype=str)
    word_count = words.shape[0]

    no_value = int(np.any(words == "no"))
    excla_value = int('!' in text)
    # math.log(0) raises ValueError: a text without any word gets 0 instead.
    log_word_count = math.log(word_count) if word_count > 0 else 0.0
    positive_words = np.count_nonzero(np.isin(words, pos_lexicon))
    negative_words = np.count_nonzero(np.isin(words, neg_lexicon))
    first_and_second_pronouns = np.count_nonzero((words == 'i') | (words == 'you'))
    return np.array([no_value, first_and_second_pronouns, excla_value,
                     log_word_count, positive_words, negative_words])

# Vectorized wrapper around extract_features: the signature maps each scalar
# input (one text) to a vector of 6 features.
vectorized_extract_features = np.vectorize(extract_features, signature='()->(6)')

## Logistic regression classifier

### Applying the features extractor

In [73]:
# Turn every preprocessed review into its 6-feature vector.
X_train = vectorized_extract_features(text_train)
X_test = vectorized_extract_features(text_test)

### Splitting the training dataset into a training set and a validation set

In [74]:
# Convert features and labels to float32 tensors; labels are reshaped into
# column vectors (one row per sample).
# NOTE(review): X_train is overwritten by this cell, so re-running it without
# first re-running the feature-extraction cell feeds the already-split X_train
# back in.
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(label_train, dtype=torch.float32).reshape(-1, 1)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(label_test, dtype=torch.float32).reshape(-1, 1)

# Hold out 15% of the training data as a validation set, stratified on the
# labels, with a fixed random_state for the split.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.15,
    stratify=y_train,
    random_state=42,
)

### Classifier class

In [75]:
class LogisticRegression(nn.Module):
    """A logistic regression implementation.

    A single linear layer followed by a sigmoid (one output) or by a softmax
    over the class axis (several outputs). The forward pass therefore returns
    probabilities, not logits.
    """

    def __init__(self, input_dim: int, nb_classes: int) -> None:
        """
        Args:
            input_dim: the dimension of the input features.
            nb_classes: the number of classes to predict.
        """
        super().__init__()
        # dim=-1 normalises over the class axis, i.e. the last axis produced by
        # nn.Linear. Leaving `dim` out relies on PyTorch's deprecated implicit
        # dimension choice and emits a warning on every forward pass.
        output_layer = nn.Sigmoid() if nb_classes == 1 else nn.Softmax(dim=-1)
        self.classifier = nn.Sequential(
            nn.Linear(input_dim, nb_classes),
            output_layer,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: the input tensor.
        Returns:
            The output of activation function.
        """
        return self.classifier(x)

### Creating the Classifier

In [87]:
# Seed PyTorch's random number generator before the model is built, so that the
# random initialisation of the weights is the same on every run.
torch.manual_seed(42)
# 6 input features, 1 output (probability of the positive class).
model = LogisticRegression(6, 1)
criterion = nn.BCELoss()  # Binary cross entropy
# Stochastic gradient descent
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, weight_decay=0.5)

### Training the model

In [88]:
%%time

n_epochs = 1000

# Keeping an eye on the losses
train_losses = []
# Despite its name, this list holds the losses computed on the validation set.
test_losses = []

# Training loop
for epoch in range(n_epochs):
    # Setting all gradients to zero.
    optimizer.zero_grad()

    # Sending the whole training set through the model.
    predictions = model(X_train)
    # Computing the loss.
    loss = criterion(predictions, y_train)
    train_losses.append(loss.item())
    if epoch % 100 == 0:
        print(loss)
    # Computing the gradients and gradient descent.
    loss.backward()
    optimizer.step()

    # When computing the validation loss, we do not want to update the weights.
    # torch.no_grad tells PyTorch to not save the necessary data used for
    # gradient descent.
    with torch.no_grad():
        predictions = model(X_valid)
        loss = criterion(predictions, y_valid)
        # Store a plain float, like train_losses, rather than a tensor.
        test_losses.append(loss.item())

tensor(3.4128, grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.5858, grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.5827, grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.5819, grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.5815, grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.5813, grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.5812, grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.5812, grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.5811, grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.5811, grad_fn=<BinaryCrossEntropyBackward0>)
CPU times: user 7.48 s, sys: 0 ns, total: 7.48 s
Wall time: 1.27 s


### Computing the accuracy for our 3 splits.

In [89]:
def _predicted_labels(features: torch.Tensor) -> np.ndarray:
    """Run `model` on `features` and round its outputs, as a NumPy array."""
    return np.round(model(features).numpy())


# No gradient is needed for evaluation.
with torch.no_grad():
    p_train = _predicted_labels(X_train)
    p_valid = _predicted_labels(X_valid)
    p_test = _predicted_labels(X_test)

# Accuracy = fraction of rounded predictions equal to the label.
training_accuracy = np.mean(p_train == y_train.numpy())
valid_accuracy = np.mean(p_valid == y_valid.numpy())
test_accuracy = np.mean(p_test == y_test.numpy())

print(f"Training accuracy: {training_accuracy}")
print(f"Validation accuracy: {valid_accuracy}")
print(f"Test accuracy: {test_accuracy}")

Training accuracy: 0.7145882352941176
Validation accuracy: 0.7058666666666666
Test accuracy: 0.71348


Results:
* Training accuracy: 0.7145882352941176
* Validation accuracy: 0.7058666666666666
* Test accuracy: 0.71348


### Looking at the model's weights

In [90]:
# Weight matrix of the linear layer (first module of the Sequential): one
# column per input feature, in the order returned by extract_features.
model.classifier[0].state_dict()["weight"]

tensor([[-0.0281, -0.0488, -0.0070, -0.0132,  0.1276, -0.1493]])

Weights:
* tensor([[-0.0281, -0.0488, -0.0070, -0.0132,  0.1276, -0.1493]])

### Which features seem to matter most for both classes?

    The features that seem to matter most for both classes are the counts of positive words and negative words: their weights have the largest absolute values.

### Taking two wrongly classified samples

In [134]:
# Boolean mask of the test samples whose rounded prediction differs from the
# label, computed once instead of rebuilding a tensor from p_test for every
# sample.
misclassified_mask = (y_test != torch.as_tensor(p_test)).squeeze(1)
# Feature vectors of the 4th and 5th misclassified samples.
list(X_test[misclassified_mask][3:5])

[tensor([ 0.0000,  5.0000,  1.0000,  5.1761, 11.0000,  7.0000]),
 tensor([0.0000, 3.0000, 0.0000, 4.8675, 7.0000, 2.0000])]

Wrongly classified:
* tensor([ 0.0000,  5.0000,  1.0000,  5.1761, 11.0000,  7.0000])
* tensor([0.0000, 3.0000, 0.0000, 4.8675, 7.0000, 2.0000])

In [135]:
# Boolean mask of the test samples whose rounded prediction differs from the
# label, computed once instead of rebuilding a tensor from p_test for every
# sample.
misclassified_mask = (y_test != torch.as_tensor(p_test)).squeeze(1)
# Predicted labels of the 4th and 5th misclassified samples.
list(p_test[misclassified_mask.numpy()][3:5])

[array([1.], dtype=float32), array([1.], dtype=float32)]

We can see that those two examples have been wrongly classified as positive. Now let's take a look at some samples whose true label is positive, and some whose true label is negative.

In [144]:
# Feature vectors of five test samples whose label is 1 (positive review).
list(X_test[(y_test == 1).squeeze(1)][5:10])

[tensor([ 0.0000, 11.0000,  0.0000,  5.8665, 13.0000,  5.0000]),
 tensor([ 1.0000, 15.0000,  1.0000,  6.8773, 42.0000, 34.0000]),
 tensor([ 1.0000,  0.0000,  0.0000,  5.3519, 10.0000,  6.0000]),
 tensor([ 1.0000, 10.0000,  0.0000,  6.2166, 18.0000, 27.0000]),
 tensor([1.0000, 1.0000, 0.0000, 5.1240, 7.0000, 5.0000])]

Examples whose true label is positive:
* tensor([ 0.0000, 11.0000,  0.0000,  5.8665, 13.0000,  5.0000]),
* tensor([ 1.0000, 15.0000,  1.0000,  6.8773, 42.0000, 34.0000]),
* tensor([ 1.0000,  0.0000,  0.0000,  5.3519, 10.0000,  6.0000]),
* tensor([ 1.0000, 10.0000,  0.0000,  6.2166, 18.0000, 27.0000]),
* tensor([1.0000, 1.0000, 0.0000, 5.1240, 7.0000, 5.0000])

In [145]:
# Feature vectors of five test samples whose label is 0 (negative review).
list(X_test[(y_test == 0).squeeze(1)][5:10])

[tensor([ 1.0000,  1.0000,  0.0000,  5.2470,  8.0000, 11.0000]),
 tensor([ 1.0000,  4.0000,  0.0000,  5.6937, 23.0000, 12.0000]),
 tensor([ 0.0000,  4.0000,  0.0000,  5.1533,  6.0000, 15.0000]),
 tensor([0.0000, 9.0000, 1.0000, 4.8675, 6.0000, 8.0000]),
 tensor([ 0.0000,  6.0000,  0.0000,  5.1417,  7.0000, 14.0000])]

Examples whose true label is negative:
* tensor([ 1.0000,  1.0000,  0.0000,  5.2470,  8.0000, 11.0000]),
* tensor([ 1.0000,  4.0000,  0.0000,  5.6937, 23.0000, 12.0000]),
* tensor([ 0.0000,  4.0000,  0.0000,  5.1533,  6.0000, 15.0000]),
* tensor([0.0000, 9.0000, 1.0000, 4.8675, 6.0000, 8.0000]),
* tensor([ 0.0000,  6.0000,  0.0000,  5.1417,  7.0000, 14.0000])

We can see that both misclassified examples contain more positive words than negative words, which the model has probably learned as a characteristic of positive comments: according to the model's weights, the counts of positive and negative words are the features that matter most.