In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

import nltk
from nltk.corpus import movie_reviews
from nltk.tokenize import word_tokenize
nltk.download("movie_reviews")

from collections import defaultdict, Counter
import math
import random

random.seed(0) # Don't change
torch.manual_seed(0)  # Don't change
np.random.seed(0) # Don't change


train_X, train_Y = [], []
test_X, test_Y = [], []

for polarity in movie_reviews.categories():
    label = 0 if polarity == 'neg' else 1
    for fid in movie_reviews.fileids(polarity):
        if random.randrange(5) == 0:
            test_X.append([w for w in movie_reviews.words(fid)])
            test_Y.append(label)
        else:
            train_X.append([w for w in movie_reviews.words(fid)])
            train_Y.append(label)

print(train_X[0], train_Y[0])

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an', 'accident', '.', 'one', 'of', 'the', 'guys', 'dies', ',', 'but', 'his', 'girlfriend', 'continues', 'to', 'see', 'him', 'in', 'her', 'life', ',', 'and', 'has', 'nightmares', '.', 'what', "'", 's', 'the', 'deal', '?', 'watch', 'the', 'movie', 'and', '"', 'sorta', '"', 'find', 'out', '.', '.', '.', 'critique', ':', 'a', 'mind', '-', 'fuck', 'movie', 'for', 'the', 'teen', 'generation', 'that', 'touches', 'on', 'a', 'very', 'cool', 'idea', ',', 'but', 'presents', 'it', 'in', 'a', 'very', 'bad', 'package', '.', 'which', 'is', 'what', 'makes', 'this', 'review', 'an', 'even', 'harder', 'one', 'to', 'write', ',', 'since', 'i', 'generally', 'applaud', 'films', 'which', 'attempt', 'to', 'break', 'the', 'mold', ',', 'mess', 'with', 'your', 'head', 'and', 'such', '(', 'lost', 'highway', '&', 'memento', ')', ',', 'but', 'there', 'are', 'good', 'and', 'ba

# Assignment II
Doing Assignment II by modifying the following code cell.
Your solution should be based on feedforward neural network (FNN or MLP) with word embeddings.
You are free to adjust the FNN with different dimension settings, vocabulary, overfitting prevention, and so on,
but you can not use other architectures (e.g., CNN/RNN/Transformer or the Naive Bayes classifier from Assignment I) in this assignment.


In [2]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download('wordnet')
import spacy
from nltk import ngrams

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


EMBEDDING_DIM = 450
EPOCHS = 12


### Stop Words
nlp = spacy.load('en_core_web_sm')
stop_words = spacy.lang.en.stop_words.STOP_WORDS
### Lemmatize
lemmatizer = WordNetLemmatizer()


class TextClassifier(nn.Module):
  def init_embeddings(self, vocab):
    self.word_to_ix = {}
    weights = []
    ix = 0
    for w in vocab:
      self.word_to_ix[w] = ix
      ix += 1
    self.vocab_size = len(self.word_to_ix)
    self.embeddings = nn.EmbeddingBag(self.vocab_size, EMBEDDING_DIM)

  def __init__(self, vocab, classes):
    super(TextClassifier, self).__init__()
    self.classes = classes
    self.init_embeddings(vocab)
    self.fc1 = nn.Linear(self.embeddings.embedding_dim, 50)
    self.fc1.weight.data.uniform_(-0.5, 0.5)
    self.fc1.bias.data.zero_()
    self.fc2 = nn.Linear(50, 20)
    self.fc2.weight.data.uniform_(-0.5, 0.5)
    self.fc2.bias.data.zero_()
    self.out = nn.Linear(20, len(self.classes))
    self.out.weight.data.uniform_(-0.5, 0.5)
    self.out.bias.data.zero_()
    self.relu = nn.ReLU()

  def forward(self, inputs, offsets):
    embedded = self.embeddings(inputs, offsets)
    return self.out(self.relu(self.fc2(self.relu(self.fc1(embedded)))))


def preprocess(tokens, ngram_range):
  # tokens = [token for token in tokens if token not in stop_words]
  tokens = [token.lower() for token in tokens]
  tokens = [lemmatizer.lemmatize(token) for token in tokens]
  ngram_tokens = []
  for n in range(ngram_range[0], ngram_range[1]+1):
      ngrams_n = ngrams(tokens, n)
      ngram_tokens.extend(['_'.join(grams) for grams in ngrams_n])
  return ngram_tokens

def make_doc_vector(doc, word_to_ix):
  doc = preprocess(doc, ngram_range=(1,3))
  idxs = [word_to_ix[w] for w in doc if w in word_to_ix]
  return torch.tensor(idxs, dtype=torch.long)

def generate_batch(batch):
  label = torch.tensor([entry[0] for entry in batch])
  text = [entry[1] for entry in batch]
  offsets = [0] + [len(entry) for entry in text]
  offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
  text = torch.cat(text)
  return text, offsets, label

def build_vocab(X, ngram_range=(1,3)):
  word_count = Counter()
  for x in X:
    doc = preprocess(x, ngram_range)
    for w in x:
      word_count[w] += 1
  # The order of keys in a dictionary/set is not deterministic,
  # so sorting in the following statement is important to avoid randomness.
  return [w for (w, c) in sorted(word_count.items()) if c >= 10]

def build_model(X, Y):
  model = TextClassifier(build_vocab(X), [0, 1]).to(device)
  loss_function = nn.CrossEntropyLoss().to(device)
  optimizer = optim.Adam(model.parameters())

  train_set = []
  yc = Counter()
  for x, y in zip(X, Y):
    entry = []
    yc[y] += 1
    entry.append(torch.LongTensor([y]))
    entry.append(make_doc_vector(x, model.word_to_ix))
    train_set.append(entry)
  print(yc)
  data = DataLoader(train_set, batch_size=32, shuffle=True, collate_fn=generate_batch)

  for epoch in range(EPOCHS):
    train_loss, train_acc = 0, 0
    print("Epoch: %d" % epoch)
    for _, (x, offsets, y) in enumerate(data):
      model.zero_grad()
      x, offsets, y = x.to(device), offsets.to(device), y.to(device)
      pred = model(x, offsets)
      loss = loss_function(pred, y)
      loss.backward()
      optimizer.step()
      train_loss += loss.item()
      train_acc += (pred.argmax(1) == y).sum().item()
    print("Loss: %g, Acc: %g" % (train_loss / len(train_set), train_acc / len(train_set)))
  return model

[nltk_data] Downloading package wordnet to /root/nltk_data...


### CPU-test

In [3]:
model = build_model(train_X, train_Y)

Counter({1: 803, 0: 775})
Epoch: 0
Loss: 0.0222247, Acc: 0.562738
Epoch: 1
Loss: 0.0183392, Acc: 0.708492
Epoch: 2
Loss: 0.0151704, Acc: 0.775665
Epoch: 3
Loss: 0.0114025, Acc: 0.856147
Epoch: 4
Loss: 0.00765353, Acc: 0.922687
Epoch: 5
Loss: 0.00490019, Acc: 0.959442
Epoch: 6
Loss: 0.00318427, Acc: 0.979087
Epoch: 7
Loss: 0.00187816, Acc: 0.993029
Epoch: 8
Loss: 0.00109686, Acc: 0.998099
Epoch: 9
Loss: 0.000685609, Acc: 0.998733
Epoch: 10
Loss: 0.00044894, Acc: 1
Epoch: 11
Loss: 0.000328015, Acc: 1


In [4]:
def predict(model, document):
  probs = model(make_doc_vector(document, model.word_to_ix).to(device), torch.tensor([0]).cumsum(dim=0).to(device))
  return int(torch.argmax(probs))

print(predict(model, "this is a uninteresting movie".split(" ")))
print(predict(model, "a good movie of this year".split(" ")))


0
0


In [5]:
correct, total = 0, 0

for x, y in zip(test_X, test_Y):
    # print(test_X)
    prediction = predict(model, x)
    if prediction == y:
        correct += 1
    total += 1

print("%d / %d = %g" % (correct, total, correct / total))

343 / 422 = 0.812796


In [281]:
correct, total = 0, 0

for x, y in zip(test_X, test_Y):
    # print(test_X)
    prediction = predict(model, x)
    if prediction == y:
        correct += 1
    total += 1

print("%d / %d = %g" % (correct, total, correct / total))

360 / 422 = 0.853081


### CPU

In [196]:
model = build_model(train_X, train_Y)

Counter({1: 803, 0: 775})
Epoch: 0
Loss: 0.0110251, Acc: 0.532953
Epoch: 1
Loss: 0.0100828, Acc: 0.685044
Epoch: 2
Loss: 0.00910997, Acc: 0.726236
Epoch: 3
Loss: 0.00788481, Acc: 0.776933
Epoch: 4
Loss: 0.00650775, Acc: 0.842839
Epoch: 5
Loss: 0.00507175, Acc: 0.892902
Epoch: 6
Loss: 0.00383135, Acc: 0.940431
Epoch: 7
Loss: 0.00272833, Acc: 0.963878
Epoch: 8
Loss: 0.0019398, Acc: 0.983523
Epoch: 9
Loss: 0.00133499, Acc: 0.989861
Epoch: 10
Loss: 0.00090659, Acc: 0.995564
Epoch: 11
Loss: 0.000611742, Acc: 0.997465


In [170]:
def predict(model, document):
  probs = model(make_doc_vector(document, model.word_to_ix).to(device), torch.tensor([0]).cumsum(dim=0).to(device))
  return int(torch.argmax(probs))

print(predict(model, "this is a uninteresting movie".split(" ")))
print(predict(model, "a good movie of this year".split(" ")))


0
0


## Do Evaluation

In [171]:
correct, total = 0, 0

for x, y in zip(test_X, test_Y):
    # print(test_X)
    prediction = predict(model, x)
    if prediction == y:
        correct += 1
    total += 1

print("%d / %d = %g" % (correct, total, correct / total))

361 / 422 = 0.85545


In [None]:
correct, total = 0, 0

for x, y in zip(test_X, test_Y):
    # print(test_X)
    prediction = predict(model, x)
    if prediction == y:
        correct += 1
    total += 1

print("%d / %d = %g" % (correct, total, correct / total))

364 / 422 = 0.862559


In [None]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download('wordnet')
import spacy
from gensim.models import KeyedVectors
import gensim.downloader as api
wv_model = api.load("word2vec-google-news-300")
# wv_model = KeyedVectors.load_word2vec_format('path/to/googlenews-vectors-negative300.bin', binary=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


EMBEDDING_DIM = 250
EPOCHS = 10

### Stop Words
nlp = spacy.load('en_core_web_sm')
stop_words = spacy.lang.en.stop_words.STOP_WORDS
# sign_list = [";", ",", ".", "~", ":", "-", "(", ")", "%", "#", "$", "!", "/", "?", "=", "+", "&", "--", "'", '"', '`', '*']
# for sign in sign_list:
#   stop_words.add(sign)
### Lemmatize
lemmatizer = WordNetLemmatizer()


class TextClassifier(nn.Module):
  def init_embeddings(self, vocab):
    self.word_to_ix = {}
    weights = []
    ix = 0
    for w in vocab:
      self.word_to_ix[w] = ix
      ix += 1
    self.vocab_size = len(self.word_to_ix)
    self.embeddings = nn.EmbeddingBag(self.vocab_size, EMBEDDING_DIM)

  def __init__(self, vocab, word_to_vec, classes):
    super(TextClassifier, self).__init__()

    weights_matrix = np.zeros((len(word_to_vec), EMBEDDING_DIM))
    for i, word in enumerate(word_to_vec):
        weights_matrix[i] = word_to_vec[word]
    self.embeddings = nn.EmbeddingBag.from_pretrained(torch.FloatTensor(weights_matrix), freeze=False)

    self.classes = classes
    self.init_embeddings(vocab)
    self.fc1 = nn.Linear(self.embeddings.embedding_dim, 50)
    self.fc1.weight.data.uniform_(-0.5, 0.5)
    self.fc1.bias.data.zero_()
    self.fc2 = nn.Linear(50, 20)
    self.fc2.weight.data.uniform_(-0.5, 0.5)
    self.fc2.bias.data.zero_()
    self.out = nn.Linear(20, len(self.classes))
    self.out.weight.data.uniform_(-0.5, 0.5)
    self.out.bias.data.zero_()
    self.relu = nn.ReLU()

  def forward(self, inputs, offsets):
    embedded = self.embeddings(inputs, offsets)
    return self.out(self.relu(self.fc2(self.relu(self.fc1(embedded)))))


### preprocessing
def preprocess(tokens):
  tokens = [token for token in tokens if token not in stop_words]
  tokens = [token.lower() for token in tokens]
  tokens = [lemmatizer.lemmatize(token) for token in tokens]
  return tokens

def make_doc_vector(doc, word_to_ix):
  doc = preprocess(doc)
  idxs = [word_to_ix[w] for w in doc if w in word_to_ix]
  return torch.tensor(idxs, dtype=torch.long)

def generate_batch(batch):
  label = torch.tensor([entry[0] for entry in batch])
  text = [entry[1] for entry in batch]
  offsets = [0] + [len(entry) for entry in text]
  offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
  text = torch.cat(text)
  return text, offsets, label

def build_vocab(X):
  word_count = Counter()
  for x in X:
    for w in x:
      word_count[w] += 1
  # The order of keys in a dictionary/set is not deterministic,
  # so sorting in the following statement is important to avoid randomness.
  vocab = [w for (w, c) in sorted(word_count.items()) if c >= 30]

  word_to_vec = {}
  for w in vocab:
      if w in wv_model:
          word_to_vec[w] = wv_model[w]
      else:
          # 對於不在Word2Vec模型中的詞，可以賦予隨機向量或零向量
          word_to_vec[w] = np.random.uniform(-0.25, 0.25, wv_model.vector_size)

  return vocab, word_to_vec

def build_model(X, Y, wv_model):
  vocab, word_to_vec = build_vocab(X, wv_model)
  model = TextClassifier(vocab, word_to_vec, [0, 1]).to(device)
  loss_function = nn.CrossEntropyLoss().to(device)
  optimizer = optim.Adam(model.parameters())

  train_set = []
  yc = Counter()
  for x, y in zip(X, Y):
    entry = []
    yc[y] += 1
    entry.append(torch.LongTensor([y]))
    entry.append(make_doc_vector(x, model.word_to_ix))
    train_set.append(entry)
  print(yc)
  data = DataLoader(train_set, batch_size=16, shuffle=True, collate_fn=generate_batch)

  for epoch in range(EPOCHS):
    train_loss, train_acc = 0, 0
    print("Epoch: %d" % epoch)
    for _, (x, offsets, y) in enumerate(data):
      model.zero_grad()
      x, offsets, y = x.to(device), offsets.to(device), y.to(device)
      pred = model(x, offsets)
      loss = loss_function(pred, y)
      loss.backward()
      optimizer.step()
      train_loss += loss.item()
      train_acc += (pred.argmax(1) == y).sum().item()
    print("Loss: %g, Acc: %g" % (train_loss / len(train_set), train_acc / len(train_set)))
  return model

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!




### GPU T4

In [None]:
model = build_model(train_X, train_Y)

Counter({1: 803, 0: 775})
Epoch: 0
Loss: 0.0427771, Acc: 0.576679
Epoch: 1
Loss: 0.0338994, Acc: 0.76109
Epoch: 2
Loss: 0.0235468, Acc: 0.867554
Epoch: 3
Loss: 0.0145542, Acc: 0.930292
Epoch: 4
Loss: 0.0080601, Acc: 0.967681
Epoch: 5
Loss: 0.00418957, Acc: 0.989227
Epoch: 6
Loss: 0.00208948, Acc: 0.996831
Epoch: 7
Loss: 0.00108052, Acc: 0.998733
Epoch: 8
Loss: 0.000602972, Acc: 0.999366
Epoch: 9
Loss: 0.000370786, Acc: 1


In [None]:
def predict(model, document):
  probs = model(make_doc_vector(document, model.word_to_ix).to(device), torch.tensor([0]).cumsum(dim=0).to(device))
  return int(torch.argmax(probs))

print(predict(model, "this is a uninteresting movie".split(" ")))
print(predict(model, "a good movie of this year".split(" ")))

0
1
