In [None]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 5.2 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 67.1 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 30.4 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.24.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.6.1-py3-none-any.whl (441 kB)
[K     |████████████████████████████████| 441 kB 4.3 MB/

In [None]:
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.svm import LinearSVC

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

import spacy

import torch
from torch.utils.data import DataLoader

from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset

from tqdm import tqdm

import json

In [None]:
# Load the IMDB dataset through Hugging Face `datasets`.
# The cells below read the "text" and "label" columns of its "train" and "test" splits.
dataset = load_dataset("imdb")

Downloading builder script:   0%|          | 0.00/4.31k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.16k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# One pipeline per feature representation, so each keeps its own fitted scaler
# and classifier. Features are standardized before entering the model.
# LinearSVC (LIBLINEAR as backend) is used because it scales better for large
# numbers of features and instances.
# SEED fixes LinearSVC's random_state so the reported accuracies are
# reproducible from one run to the next.
SEED = 0


def make_svm_pipeline():
  """Return a fresh, unfitted StandardScaler -> LinearSVC pipeline."""
  return make_pipeline(StandardScaler(), LinearSVC(random_state=SEED))


clf_bow = make_svm_pipeline()    # bag-of-words features
clf_embed = make_svm_pipeline()  # pooled word-embedding features
clf_ptlm = make_svm_pipeline()   # pre-trained language model features

# Labels are shared by all three approaches.
y = np.asarray(dataset["train"]["label"])
y_test = np.asarray(dataset["test"]["label"])

In [None]:
# Bag-of-words baseline: every review becomes a vector of raw token counts.
# Possible extensions: n-grams (local ordering), stop-word removal, or TF-IDF
# reweighting (which would partly solve the stop-word problem as well).
# Experiments showed that stop words and TF-IDF actually harmed performance,
# so plain counts are used here.
# The vocabulary is capped at 10000 terms to avoid exhausting RAM once the
# count matrices are converted to dense arrays.

vectorizer = CountVectorizer(max_features=10000)

# Fit the vocabulary on the training reviews, then densify for the pipeline.
train_counts = vectorizer.fit_transform(dataset["train"]["text"])
X = train_counts.toarray()

clf_bow.fit(X, y)

# Test reviews are encoded with the vocabulary learned from the training split.
test_counts = vectorizer.transform(dataset["test"]["text"])
X_test = test_counts.toarray()

bow_accuracy = clf_bow.score(X_test, y_test)
print(bow_accuracy)



0.81112


In [None]:
# The word embeddings + average pooling approach

!wget https://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

glove_file = datapath('/content/glove.6B.50d.txt')
word2vec_glove_file = get_tmpfile("/content/glove.6B.50d.word2vec.txt")
glove2word2vec(glove_file, word2vec_glove_file)

model = KeyedVectors.load_word2vec_format(word2vec_glove_file)

!python -m spacy download en_core_web_sm

nlp = spacy.load("en_core_web_sm")
X = np.zeros((0, 50))
for sequence in tqdm(dataset["train"]["text"]):
  doc = nlp(sequence.lower())
  avg_vector = np.zeros((1, 50))
  count = 0
  for token in doc:
    try:
      vector = model.wv[token.text]
    except KeyError:
      vector = np.zeros((1, 50)) # represent unrecognized word with zero vector
    avg_vector += vector
    count += 1
  avg_vector /= count
  X = np.concatenate((X, avg_vector), axis=0)

X_test = np.zeros((0, 50))
for sequence in tqdm(dataset["test"]["text"]):
  doc = nlp(sequence.lower())
  avg_vector = np.zeros((1, 50))
  count = 0
  for token in doc:
    try:
      vector = model.wv[token.text]
    except KeyError:
      vector = np.zeros((1, 50))
    avg_vector += vector
    count += 1
  avg_vector /= count
  X_test = np.concatenate((X_test, avg_vector), axis=0)

clf_embed.fit(X, y)
print(clf_embed.score(X_test, y_test))

--2022-11-03 09:03:21--  https://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-11-03 09:03:22--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2022-11-03 09:07:10 (3.65 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       
2022-11-03 09:

100%|██████████| 25000/25000 [23:17<00:00, 17.89it/s]
100%|██████████| 25000/25000 [22:20<00:00, 18.64it/s]


0.75136




In [None]:
# max pooling

!wget https://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

glove_file = datapath('/content/glove.6B.50d.txt')
word2vec_glove_file = get_tmpfile("/content/glove.6B.50d.word2vec.txt")
glove2word2vec(glove_file, word2vec_glove_file)

model = KeyedVectors.load_word2vec_format(word2vec_glove_file)

!python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

X = np.zeros((0, 50))
for sequence in tqdm(dataset["train"]["text"]):
  doc = nlp(sequence.lower())
  max_vector = np.zeros((0, 50))
  for token in doc:
    try:
      vector = np.expand_dims(model.wv[token.text], axis=0)
    except KeyError:
      vector = np.zeros((1, 50)) # represent unrecognized word with zero vector
    max_vector = np.concatenate((max_vector, vector), axis=0)
  max_vector = np.expand_dims(np.amax(max_vector, axis=0), axis=0)
  X = np.concatenate((X, max_vector), axis=0)

X_test = np.zeros((0, 50))
for sequence in tqdm(dataset["test"]["text"]):
  doc = nlp(sequence.lower())
  max_vector = np.zeros((0, 50))
  for token in doc:
    try:
      vector = np.expand_dims(model.wv[token.text], axis=0)
    except KeyError:
      vector = np.zeros((1, 50)) # represent unrecognized word with zero vector
    max_vector = np.concatenate((max_vector, vector), axis=0)
  max_vector = np.expand_dims(np.amax(max_vector, axis=0), axis=0)
  X_test = np.concatenate((X_test, max_vector), axis=0)

clf_embed.fit(X, y)
print(clf_embed.score(X_test, y_test))

--2022-11-03 07:35:42--  https://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-11-03 07:35:42--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2022-11-03 07:39:15 (3.88 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       
2022-11-03 07:

100%|██████████| 25000/25000 [23:56<00:00, 17.40it/s]
100%|██████████| 25000/25000 [23:11<00:00, 17.96it/s]


0.65544




In [None]:
# The pre-trained language model approach:
# DistilBERT is run without gradient updates and the hidden state of the first
# ([CLS]) token of each review is used as its feature vector.
# Fine-tuning the model would enhance the performance.

# A torch.device prints as "cuda"/"cpu", which reads better than a bare GPU index.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"using {device}.")

# shuffle=False keeps the features aligned with the label arrays y / y_test.
train_dataloader = DataLoader(dataset["train"]["text"], batch_size=8, shuffle=False)
test_dataloader = DataLoader(dataset["test"]["text"], batch_size=8, shuffle=False)

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModel.from_pretrained("distilbert-base-uncased").to(device)
model.eval()


def encode_cls(dataloader, hidden_size=768):
  """Return the first-token hidden states for every text served by `dataloader`.

  Args:
    dataloader: yields batches of raw strings.
    hidden_size: width of the encoder output; only used to shape the result
      when the dataloader yields no batches.

  Returns:
    float32 numpy array of shape (number of texts, hidden_size), in the order
    the dataloader produced the texts.
  """
  chunks = []
  for batch in tqdm(dataloader):
    inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True).to(device)
    with torch.no_grad():
      cls_states = model(**inputs).last_hidden_state[:, 0, :] # take [CLS] encoding
    chunks.append(cls_states.to("cpu"))
  if not chunks:
    return np.zeros((0, hidden_size), dtype=np.float32)
  # Concatenate once at the end instead of growing the tensor batch by batch.
  return torch.cat(chunks, dim=0).numpy()


X = encode_cls(train_dataloader)
X_test = encode_cls(test_dataloader)

clf_ptlm.fit(X, y)
print(clf_ptlm.score(X_test, y_test))

using 0.


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 25000/25000 [09:50<00:00, 42.34it/s]
100%|██████████| 25000/25000 [09:11<00:00, 45.34it/s]


0.85368


