In [1]:
!pip install scikit-learn pandas nltk datasets gensim

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting gensim
  Downloading gensim-4.3.3-cp39-cp39-win_amd64.whl.metadata (8.2 kB)
Collecting click (from nltk)
  Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp39-cp39-win_amd64.whl.metadata (3.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tqdm (from nltk)
  Downloading tqdm-4.66.5-py3-none-any.whl.metadata (57 kB)
     ---------------------------------------- 0.0/57.6 kB ? eta -:--:--
     ---------------------------------------- 57.6/57.6 kB 3.0 MB/s eta 0:00:00
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp39-cp39-win_amd64.whl.metadata (1


[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [26]:
import warnings
# Suppress every warning whose category is FutureWarning.
warnings.filterwarnings("ignore", category=FutureWarning)

# Preparation

In [2]:
# Seed passed to `.shuffle(...)` when sub-sampling the IMDB splits.
SEED = 42
# Fraction of each split (train and test) that is kept after shuffling.
SUBSET_RATIO = 0.01

## Load IMDB Dataset

In [27]:
from datasets import load_dataset


def _take_subset(split, ratio, seed):
    """Shuffle `split` with `seed` and keep the first `ratio` fraction of its rows."""
    subset_size = int(len(split) * ratio)
    return split.shuffle(seed=seed).select(range(subset_size))


dataset = load_dataset("imdb")

# Both splits are sub-sampled the same way so experiments stay fast.
train_dataset = _take_subset(dataset["train"], SUBSET_RATIO, SEED)
test_dataset = _take_subset(dataset["test"], SUBSET_RATIO, SEED)

df = train_dataset.to_pandas()
df_test = test_dataset.to_pandas()

# Preview the training frame; this has to be the last expression of the cell to be rendered.
df.head()

In [5]:
# Count how many training reviews carry each label value (class-balance check).
df["label"].value_counts()

label
0    129
1    121
Name: count, dtype: int64

## Load English Stop Words

In [28]:
import nltk

# Download the NLTK 'stopwords' corpus and the 'punkt' tokenizer resource.
nltk.download('stopwords')
# NOTE(review): recent NLTK releases appear to load 'punkt_tab' for word tokenization — confirm whether it must be downloaded too.
nltk.download('punkt')
# English stop-word list; passed to CountVectorizer(stop_words=...) further down.
stopwords = nltk.corpus.stopwords.words('english')

# Display the stop-word list.
stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ant\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ant\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

## Task: Create a function that, given training embeddings, training labels, and test embeddings, trains a classifier and returns its predictions on the test set.

In [29]:
import numpy.typing as npt
import numpy as np
from typing import Iterable
from sklearn.ensemble import RandomForestClassifier

def fit_predict(
    train_embeddings: npt.NDArray[float],
    train_labels: Iterable[int],
    test_embeddings: npt.NDArray[float],
    random_state: int = 0,
) -> npt.NDArray[np.int_]:
    """Train a random forest on the training embeddings and label the test embeddings.

    Args:
        train_embeddings: Feature matrix with one row per training document.
        train_labels: Class label of each training document, aligned with the
            rows of `train_embeddings`.
        test_embeddings: Feature matrix with one row per test document; it must
            have the same number of columns as `train_embeddings`.
        random_state: Seed forwarded to `RandomForestClassifier`. The default
            of 0 reproduces the previously hard-coded behaviour.

    Returns:
        The predicted class label for every row of `test_embeddings`.
    """
    clf = RandomForestClassifier(random_state=random_state)
    clf.fit(train_embeddings, train_labels)
    return clf.predict(test_embeddings)

# Part 1: Historical Methods


### Bag-of-Words (BoW)

#### Task: Compute the corpus embeddings using a Bag-of-Words representation

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

#### Task: Use `fit_predict` to train a classifier using the BoW embeddings. Display the accuracy and F1-score.

In [30]:
# Bag-of-Words: the vocabulary is learned on the training texts only, ignoring the NLTK stop words.
vectorizer = CountVectorizer(stop_words=stopwords)
train_embeddings = vectorizer.fit_transform(df["text"])
# Reuse the fitted vectorizer so test documents are encoded with the training vocabulary.
test_embeddings = vectorizer.transform(df_test["text"])

In [31]:
# Train on the BoW training embeddings and predict a label for every test document.
predictions = fit_predict(train_embeddings, df["label"], test_embeddings)

In [33]:
from sklearn.metrics import accuracy_score, f1_score
# Score the predictions against the labels of the test subset.
print(f"accuracy: {accuracy_score(df_test['label'], predictions)}")
print(f"f1: {f1_score(df_test['label'], predictions)}")

accuracy: 0.72
f1: 0.7083333333333334


### TF-IDF

#### Features

Now, let's implement the TF-IDF computation ourselves.

##### Step 1: Tokenization and Count Matrix

First, we need to tokenize the documents and create a term-frequency (TF) matrix.

In [None]:
def tokenize_documents(documents: list[str]) -> list[list[str]]:
    """Split every document into a list of lowercase word tokens.

    A token is a maximal run of word characters (letters, digits, underscore);
    punctuation and whitespace act as separators and are dropped.

    Args:
        documents: The raw texts, one string per document. Any iterable of
            strings works.

    Returns:
        One list of tokens per input document, in the input order. A document
        without any word character yields an empty list.
    """
    import re  # local import so the function does not rely on another cell

    token_pattern = re.compile(r"\w+")
    return [token_pattern.findall(document.lower()) for document in documents]

Check that your function is correct

In [None]:
tokenized_documents = tokenize_documents(df['text'])

# Expect a plain list holding one list of string tokens per training document.
assert type(tokenized_documents) == list
assert type(tokenized_documents[0]) == list
assert type(tokenized_documents[0][0]) == str
assert len(tokenized_documents) == len(df)

##### Step 2: Build Vocabulary

Create a vocabulary of all unique words in the dataset.

In [None]:
def build_vocabulary(tokenized_documents) -> Iterable[str]:
    """Collect the unique tokens that occur in the corpus.

    Args:
        tokenized_documents: One iterable of string tokens per document.

    Returns:
        A sorted list of the distinct tokens. Sorting makes the order (and
        therefore any column layout derived from it) deterministic.
    """
    unique_tokens = set()
    for tokens in tokenized_documents:
        unique_tokens.update(tokens)
    return sorted(unique_tokens)

Check that your function is correct

In [None]:
from typing import Iterable

vocabulary = build_vocabulary(tokenized_documents)

assert isinstance(vocabulary, Iterable)
# Every vocabulary entry must be unique.
assert len(vocabulary) == len(set(vocabulary))
# The exact size depends on the tokenizer and on SUBSET_RATIO, so check that the
# vocabulary is non-empty instead of pinning it to a hard-coded value (previously 33924).
assert len(vocabulary) > 0

##### Step 3: Compute Term Frequencies (TF)

Compute the term frequency matrix.

In [None]:
from collections import Counter
import numpy as np
import numpy.typing as npt

def compute_tf(tokenized_documents: list[list[str]], vocabulary: Iterable[str]) -> npt.NDArray[np.float64]:
    """Build the raw term-frequency (count) matrix of a tokenized corpus.

    Args:
        tokenized_documents: One list of tokens per document.
        vocabulary: The terms to count. Column `j` of the result corresponds to
            the `j`-th term in iteration order.

    Returns:
        A float matrix of shape `(number of documents, vocabulary size)` whose
        entry `[i, j]` is the number of occurrences of term `j` in document `i`.
        Tokens that are not part of `vocabulary` are ignored.
    """
    # Materialise once so len() and indexing by position are valid for any iterable.
    vocabulary = list(vocabulary)
    vocab_size = len(vocabulary)

    # Create a mapping from word to column index
    word_to_index = {word: idx for idx, word in enumerate(vocabulary)}

    # Initialize term frequency matrix
    tf_matrix = np.zeros((len(tokenized_documents), vocab_size))

    # Count each distinct token once per document instead of incrementing per occurrence
    for doc_idx, tokens in enumerate(tokenized_documents):
        for word, count in Counter(tokens).items():
            word_idx = word_to_index.get(word)
            if word_idx is not None:
                tf_matrix[doc_idx, word_idx] = count

    return tf_matrix

Check that your function is correct

In [None]:
tf = compute_tf(tokenized_documents, vocabulary)

# One row per training document, one column per vocabulary term.
assert tf.shape == (len(df), len(vocabulary))

##### Step 4: Compute Inverse Document Frequencies (IDF)

Compute the document frequency for each term.

In [None]:
def compute_df(tf_matrix: npt.NDArray[np.float32]) -> npt.NDArray[np.float32]:
    """Compute the document frequency of every term.

    Args:
        tf_matrix: Term-frequency matrix with one row per document and one
            column per term.

    Returns:
        A float32 vector with one entry per column of `tf_matrix`: the number
        of documents (rows) in which that term has a non-zero frequency.
    """
    return np.count_nonzero(tf_matrix, axis=0).astype(np.float32)

Check that your function is correct

In [None]:
df_counts = compute_df(tf)

# One count per vocabulary term, and every term must occur in at least one document.
assert df_counts.shape == (len(vocabulary),)
assert np.all(df_counts > 0)

Compute the inverse document frequency for each term.

In [None]:
def compute_idf(tf_matrix: npt.NDArray[np.float32]) -> npt.NDArray[np.float32]:
    """Compute the smoothed inverse document frequency of every term.

    Uses `idf = ln((1 + N) / (1 + df)) + 1`, where `N` is the number of
    documents and `df` the document frequency of the term. Adding 1 to both
    counts avoids a division by zero for terms that appear in no document, and
    the trailing `+ 1` keeps terms present in every document from getting a
    weight of zero.

    Args:
        tf_matrix: Term-frequency matrix with one row per document and one
            column per term.

    Returns:
        A vector with one IDF weight per column of `tf_matrix`.
    """
    number_of_documents = tf_matrix.shape[0]
    df_counts = compute_df(tf_matrix)

    return np.log((1 + number_of_documents) / (1 + df_counts)) + 1

Check that your function is correct

In [None]:
idf = compute_idf(tf)

# One IDF weight per vocabulary term.
assert idf.shape == (len(vocabulary), )

##### Step 5: Compute TF-IDF Matrix and normalize to unit length

Put everything together and compute the TF-IDF of a corpus.

In [None]:
from sklearn.preprocessing import normalize

def tf_idf(documents: list[str]) -> npt.NDArray[np.float64]:
    """Compute the L2-normalized TF-IDF matrix of a corpus.

    The vocabulary and the IDF weights are both derived from `documents`
    itself, so the columns of the result are specific to this corpus.

    Args:
        documents: The raw texts, one string per document.

    Returns:
        A matrix with one row per document and one column per vocabulary term.
        Every row that contains at least one term has unit Euclidean length.
    """
    # Tokenize documents
    tokenized_documents = tokenize_documents(documents)

    # Build vocabulary
    vocabulary = build_vocabulary(tokenized_documents)

    # Compute TF
    tf = compute_tf(tokenized_documents, vocabulary)

    # Compute IDF
    idf = compute_idf(tf)

    # Compute TF-IDF: broadcasting scales column j of `tf` by `idf[j]`
    tf_idf_matrix = tf * idf

    # Normalize each row to unit L2 norm so that long documents (large raw
    # counts) do not get larger vectors than short ones.
    tf_idf_matrix_normalized = normalize(tf_idf_matrix, norm="l2", axis=1)

    return tf_idf_matrix_normalized

Check that your function is correct

In [None]:
tf_idf_matrix = tf_idf(df['text'])

# Compared against the notebook-level `vocabulary` built in the earlier check cell.
assert tf_idf_matrix.shape == (len(df), len(vocabulary))

#### Task: Use `fit_predict` to train a classifier using the TF-IDF embeddings. Display the accuracy and F1-score.

# Part 2: Word Embeddings


In [None]:
import gensim.downloader

# Load the "glove-wiki-gigaword-100" vectors through gensim's downloader API.
glove = gensim.downloader.load("glove-wiki-gigaword-100")

### Task: Create a function that computes a document embedding using GloVe embeddings and a pooling function of your choice

In [None]:
def document_embedding(document: str) -> npt.NDArray[np.float32]:
    """Embed a document as the mean of the GloVe vectors of its words.

    The text is lowercased and split into runs of word characters. Tokens that
    are missing from the notebook-level `glove` model are skipped, and the
    remaining word vectors are averaged (mean pooling).

    Args:
        document: The raw text to embed.

    Returns:
        A vector of shape `(glove.vector_size,)`. If no token of the document
        is known to the model, a zero vector of that shape is returned.
    """
    import re  # local import so the function does not rely on another cell

    tokens = re.findall(r"\w+", document.lower())
    known_vectors = [glove[token] for token in tokens if token in glove]

    if not known_vectors:
        # Nothing to pool: return a neutral embedding instead of NaN.
        return np.zeros(glove.vector_size, dtype=np.float32)

    return np.mean(known_vectors, axis=0)

Check that your function is correct

In [None]:
first_document = df['text'].iloc[0]
first_document_embedding =  document_embedding(first_document)

# A document embedding must have the same dimensionality as a single GloVe word vector.
assert first_document_embedding.shape == (glove.vector_size,)

## Task: Train a classifier using GloVe document embeddings