# Advanced Machine Learning Week 4 Notes
### (We skipped week 3)

## Natural Language Processing

Taking a series of phrases, and tokenizing each individual word, then outputting a list of the tokenized words. From there, each list is assigned a 3-dimensional tensor. Essentially, we are taking each observation in a data set, where each observation is a phrase, and turning them into a tensor.

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

In [2]:
from torch.utils.data import Dataset, DataLoader

In [3]:
from sklearn.preprocessing import LabelEncoder

In [4]:
from torchtext.data.utils import get_tokenizer



In [5]:
from torchtext.vocab import build_vocab_from_iterator



In [6]:
from torch.nn.utils.rnn import pad_sequence

In [9]:
# Load the 20 Newsgroups dataset
newsgroups = fetch_20newsgroups(subset='all')
X_news, y_news = newsgroups.data, newsgroups.target

# Preprocess text using CountVectorizer
vectorizer = CountVectorizer(stop_words='english', max_features=1000)
# Stop words are the words that are so commonly used that they can be ignored and are meaningless
# These would be words like "a", "is", "the", and others
X_news_vec = vectorizer.fit_transform(X_news).toarray()

# Train-test split
X_train_news, X_test_news, y_train_news, y_test_news = train_test_split(X_news_vec, y_news, test_size=0.2, random_state=42)

In [10]:
# Convert to PyTorch tensors
X_train_news_tensor = torch.tensor(X_train_news, dtype=torch.float32)
y_train_news_tensor = torch.tensor(y_train_news, dtype=torch.long)
X_test_news_tensor = torch.tensor(X_test_news, dtype=torch.float32)
y_test_news_tensor = torch.tensor(y_test_news, dtype=torch.long)

# DataLoader for 20 Newsgroups dataset
train_dataset_news = TensorDataset(X_train_news_tensor, y_train_news_tensor)
test_dataset_news = TensorDataset(X_test_news_tensor, y_test_news_tensor)

batch_size = 64
train_loader_news = DataLoader(train_dataset_news, batch_size=batch_size, shuffle=True)
test_loader_news = DataLoader(test_dataset_news, batch_size=batch_size)

In [11]:
# Define the model
class NLPClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(NLPClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU() # activation function
        self.softmax = nn.LogSoftmax(dim=1) # activation function
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x

# Initialize the model and define hyperparameters
input_dim_news = X_news_vec.shape[1]
hidden_dim = 128
output_dim_news = len(newsgroups.target_names)  # Number of classes in 20 Newsgroups dataset

model_news = NLPClassifier(input_dim_news, hidden_dim, output_dim_news)

# Define optimizer and loss function
optimizer_news = optim.Adam(model_news.parameters())
criterion = nn.NLLLoss()

In [12]:
# Training loop for 20 Newsgroups dataset
def train_model(model, train_loader, optimizer, criterion, epochs=5):
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss / len(train_loader)}")

# Train the 20 Newsgroups model
train_model(model_news, train_loader_news, optimizer_news, criterion)

Epoch 1/5, Loss: 1.6590041987976785
Epoch 2/5, Loss: 0.80308229665635
Epoch 3/5, Loss: 0.611717043286663
Epoch 4/5, Loss: 0.49422539650636205
Epoch 5/5, Loss: 0.42705816478799963


In [13]:
def evaluate_model(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
    accuracy = correct / total
    print(f"Accuracy: {accuracy}")

# Evaluate the model
evaluate_model(model_news, test_loader_news)

Accuracy: 0.7636604774535809


### With embeddings

Recall that embedding is the processing of converting the tokenized arrays into the three-dimensional tensors

In [14]:
X_train, X_test, y_train, y_test = train_test_split(newsgroups.data, newsgroups.target, test_size=0.2, random_state=42)


# Define the tokenizer
tokenizer = get_tokenizer('basic_english')

# Tokenize and encode the data using torchtext
train_data_tokens = [tokenizer(text) for text in X_train]
test_data_tokens = [tokenizer(text) for text in X_test]

# Build the vocabulary from the training data
vocab = build_vocab_from_iterator(train_data_tokens, specials=["<unk>"])  # Handle OOV tokens with "<unk>"

In [15]:
train_data_tokens[0] # lists all of the tokens of the first email in order

['from',
 'mahan@tgv',
 '.',
 'com',
 '(',
 'patrick',
 'l',
 '.',
 'mahan',
 ')',
 'subject',
 're',
 'is',
 'it',
 'just',
 'me',
 ',',
 'or',
 'is',
 'this',
 'newsgroup',
 'dead',
 '?',
 'organization',
 'the',
 'internet',
 'lines',
 '24',
 'nntp-posting-host',
 'enterpoop',
 '.',
 'mit',
 '.',
 'edu',
 'to',
 'xpert@expo',
 '.',
 'lcs',
 '.',
 'mit',
 '.',
 'edu',
 ',',
 'rlm@helen',
 '.',
 'surfcty',
 '.',
 'com',
 '#',
 '#',
 'i',
 "'",
 've',
 'gotten',
 'very',
 'few',
 'posts',
 'on',
 'this',
 'group',
 'in',
 'the',
 'last',
 'couple',
 'days',
 '.',
 '(',
 'i',
 '#',
 'recently',
 'added',
 'it',
 'to',
 'my',
 'feed',
 'list',
 '.',
 ')',
 'is',
 'it',
 'just',
 'me',
 ',',
 'or',
 'is',
 'this',
 'group',
 '#',
 'near',
 'death',
 '?',
 '#',
 'seen',
 'from',
 'the',
 'mailing',
 'list',
 'side',
 ',',
 'i',
 "'",
 'm',
 'getting',
 'about',
 'the',
 'right',
 'amount',
 'of',
 'traffic',
 '.',
 'patrick',
 'l',
 '.',
 'mahan',
 '---',
 'tgv',
 'window',
 'washer',
 '--

In [16]:
X_train[0]

"From: mahan@TGV.COM (Patrick L. Mahan)\nSubject: Re: Is it just me, or is this newsgroup dead?\nOrganization: The Internet\nLines: 24\nNNTP-Posting-Host: enterpoop.mit.edu\nTo: xpert@expo.lcs.mit.edu, rlm@helen.surfcty.com\n\n#\n# I've gotten very few posts on this group in the last couple days.  (I\n# recently added it to my feed list.)  Is it just me, or is this group\n# near death?\n#\n\nSeen from the mailing list side, I'm getting about the right amount of\ntraffic.\n\nPatrick L. Mahan\n\n--- TGV Window Washer ------------------------------- Mahan@TGV.COM ---------\n\nWaking a person unnecessarily should not be considered  - Lazarus Long\na capital crime.  For a first offense, that is            From the Notebooks of\n\t\t\t\t\t\t\t  Lazarus Long\n\nPatrick L. Mahan\n\n--- TGV Window Washer ------------------------------- Mahan@TGV.COM ---------\n\nWaking a person unnecessarily should not be considered  - Lazarus Long\na capital crime.  For a first offense, that is            From

In [17]:
# Pad sequences to a fixed length
max_seq_length = 100
train_data_padded = [torch.tensor([vocab[token] if token in vocab else vocab["<unk>"] for token in tokens[:max_seq_length]]) for tokens in train_data_tokens]
test_data_padded = [torch.tensor([vocab[token] if token in vocab else vocab["<unk>"] for token in tokens[:max_seq_length]]) for tokens in test_data_tokens]

X_train_tensor = pad_sequence(train_data_padded, batch_first=True, padding_value=0)
X_test_tensor = pad_sequence(test_data_padded, batch_first=True, padding_value=0)

y_train_tensor = torch.LongTensor(y_train)
y_test_tensor = torch.LongTensor(y_test)

In [18]:
X_train_tensor[0]

tensor([   22, 13640,     1,    47,     8,  2323,   367,     1, 10692,     7,
           39,    46,    13,    16,    84,    70,     2,    34,    13,    25,
          985,   684,    15,    41,     3,   378,    40,   368,    93,  5570,
            1,   418,     1,    30,     5,  5549,     1,  1862,     1,   418,
            1,    30,     2, 33149,     1, 29497,     1,    47,   194,   194,
           11,     4,   139,  2104,   129,   250,  1534,    23,    25,   275,
           12,     3,   198,   652,   516,     1,     8,    11,   194,   881,
         1695,    16,     5,    51,  3660,   305,     1,     7,    13,    16,
           84,    70,     2,    34,    13,    25,   275,   194,   941,   625,
           15,   194,   360,    22,     3,  1495,   305,   612,     2,    11])

In [19]:
vocab.lookup_token(22) # this should be the first word of the email

'from'

In [20]:
X_test_tensor[0] # the first entry in our test dataset

tensor([   22, 85556,     1,   962,     1,    30,     8,  1319, 85555,     7,
           39,    46,  9287,  3200,  1272,    41,   609,   508,     6,   354,
            2,  2782,    40,   304,    93, 13249,     1,   962,     1,    30,
        28659,     1, 10472,     1,    30,     8,  3689, 29261,     7,    62,
         3440,   198,  9287,  3200,   195,    14,    65,   370, 15318,    70,
           10,  2223,     4,    32, 23123,   100,  6184,     1,    11,   175,
            3,  1272,   105,   107,  2690,   713,    31,    25, 17137,  6523,
            2,    36,     3,  9287,  3200,    13,  2690,  2460,     2,    44,
           50,   201,    42,     9,  5921, 40809,   914,   101,  1106, 11377,
         1852,     6,   156, 13039,   188,    18, 20192,   164, 49874,    15])

Note above that a token of 0 in our test data indicates that that word did not appear in our training data!

In [21]:
class NewsGroupsDataset(Dataset):
    def __init__(self, data_tokens, targets, vocab, max_seq_length=100):
        self.data_tokens = data_tokens
        self.targets = targets
        self.vocab = vocab
        self.max_seq_length = max_seq_length
    
    def __len__(self):
        return len(self.data_tokens)
    
    def __getitem__(self, idx):
        tokens = self.data_tokens[idx][:self.max_seq_length]
        token_ids = [self.vocab[token] if token in self.vocab else self.vocab["<unk>"] for token in tokens]
        return torch.tensor(token_ids), torch.tensor(self.targets[idx])

In [22]:
# Create instances of the custom Dataset for training and testing
train_dataset = NewsGroupsDataset(train_data_tokens, y_train, vocab, max_seq_length)
test_dataset = NewsGroupsDataset(test_data_tokens, y_test, vocab, max_seq_length)

In [23]:
train_dataset.__getitem__(0) # again, the first email represented as tokens
# this falls into class 5, whatever that happens to be

(tensor([   22, 13640,     1,    47,     8,  2323,   367,     1, 10692,     7,
            39,    46,    13,    16,    84,    70,     2,    34,    13,    25,
           985,   684,    15,    41,     3,   378,    40,   368,    93,  5570,
             1,   418,     1,    30,     5,  5549,     1,  1862,     1,   418,
             1,    30,     2, 33149,     1, 29497,     1,    47,   194,   194,
            11,     4,   139,  2104,   129,   250,  1534,    23,    25,   275,
            12,     3,   198,   652,   516,     1,     8,    11,   194,   881,
          1695,    16,     5,    51,  3660,   305,     1,     7,    13,    16,
            84,    70,     2,    34,    13,    25,   275,   194,   941,   625,
            15,   194,   360,    22,     3,  1495,   305,   612,     2,    11]),
 tensor(5, dtype=torch.int32))

In [24]:
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=lambda batch: (pad_sequence([item[0] for item in batch], batch_first=True, padding_value=0), torch.tensor([item[1] for item in batch])))
test_loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=lambda batch: (pad_sequence([item[0] for item in batch], batch_first=True, padding_value=0), torch.tensor([item[1] for item in batch])))

In [25]:
# Define a simple feedforward neural network with an embedding layer
class FFNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(FFNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc1 = nn.Linear(embedding_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        embedded = self.embedding(x)
        embedded_avg = torch.mean(embedded, dim=1)
        x = self.fc1(embedded_avg)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x

In [26]:
y_train_news.max()

19

In [27]:
# Define hyperparameters
vocab_size = len(vocab)
embedding_dim = 500
hidden_dim = 128
#output_dim = len(label_encoder.classes_)
output_dim = y_train_news.max() + 1
learning_rate = 0.01
epochs = 10

# Instantiate the model, loss function, and optimizer
model = FFNN(vocab_size, embedding_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [33]:
train_loader

<torch.utils.data.dataloader.DataLoader at 0x1ddb133d210>

In [28]:
# Training loop
for epoch in range(epochs):
    model.train()
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
    
    print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}')

RuntimeError: expected scalar type Long but found Int

In [None]:
# Evaluation
model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    _, predicted = torch.max(outputs, 1)
    accuracy = accuracy_score(y_test_tensor.numpy(), predicted.numpy())
    print(f'Accuracy on test set: {accuracy}')

In [32]:
#device = torch.device('mps')
#model.to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(epochs):
    model.train()
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
    
    print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}')

RuntimeError: expected scalar type Long but found Int

## IMDB Dataset

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd

In [9]:
# Load the IMDB dataset (donwnload from canvas)
imdb_data = pd.read_csv('./IMDB Dataset.csv')

In [10]:
imdb_data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [11]:
X_imdb = imdb_data['review'].values
y_imdb = imdb_data['sentiment'].apply(lambda x: 1 if x == 'positive' else 0).values

In [15]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X_imdb, y_imdb, test_size=0.7, random_state=42)

# Define the tokenizer
tokenizer = get_tokenizer('basic_english')

# Tokenize and encode the data using torchtext
train_data_tokens = [tokenizer(text) for text in X_train]
test_data_tokens = [tokenizer(text) for text in X_test]

# Build the vocabulary from the training data
vocab = build_vocab_from_iterator(train_data_tokens, specials=["<unk>"])  # Handle OOV tokens with "<unk>"

In [27]:
train_data_tokens[0]

['the',
 'fiendish',
 'plot',
 'of',
 'dr',
 '.',
 'fu',
 'manchu',
 '(',
 '1980',
 ')',
 '.',
 'this',
 'is',
 'hands',
 'down',
 'the',
 'worst',
 'film',
 'i',
 "'",
 've',
 'ever',
 'seen',
 '.',
 'what',
 'a',
 'sad',
 'way',
 'for',
 'a',
 'great',
 'comedian',
 'to',
 'go',
 'out',
 '.']

In [20]:
max_seq_length = 100
train_data_padded = [torch.tensor([vocab[token] if token in vocab else vocab["<unk>"] for token in tokens[:max_seq_length]]) for tokens in train_data_tokens]
test_data_padded = [torch.tensor([vocab[token] if token in vocab else vocab["<unk>"] for token in tokens[:max_seq_length]]) for tokens in test_data_tokens]

X_train_tensor = pad_sequence(train_data_padded, batch_first=True, padding_value=0)
X_test_tensor = pad_sequence(test_data_padded, batch_first=True, padding_value=0)

y_train_tensor = torch.LongTensor(y_train)
y_test_tensor = torch.LongTensor(y_test)

In [24]:
X_train_tensor[0]

tensor([    1, 24242,   124,     6,   765,     2,  2191, 15577,    25,  2554,
           23,     2,    13,     9,   940,   195,     1,   241,    22,    12,
            8,   143,   128,   113,     2,    54,     5,   658,   104,    19,
            5,    83,  2618,     7,   152,    51,     2,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0])

In [26]:
vocab.lookup_token(24242)

'fiendish'

In [28]:
class IMDBDataset(Dataset):
    def __init__(self, data_tokens, targets, vocab, max_seq_length=100):
        self.data_tokens = data_tokens
        self.targets = targets
        self.vocab = vocab
        self.max_seq_length = max_seq_length
    
    def __len__(self):
        return len(self.data_tokens)
    
    def __getitem__(self, idx):
        tokens = self.data_tokens[idx][:self.max_seq_length]
        token_ids = [self.vocab[token] if token in self.vocab else self.vocab["<unk"] for token in tokens]
        return torch.tensor(token_ids), torch.tensor(self.targets[idx])

In [29]:
# Create instances of the custom Dataset for training and testing
train_dataset = IMDBDataset(train_data_tokens, y_train, vocab, max_seq_length)
test_dataset = IMDBDataset(test_data_tokens, y_test, vocab, max_seq_length)

In [30]:
train_dataset.__getitem__(0)

(tensor([    1, 24242,   124,     6,   765,     2,  2191, 15577,    25,  2554,
            23,     2,    13,     9,   940,   195,     1,   241,    22,    12,
             8,   143,   128,   113,     2,    54,     5,   658,   104,    19,
             5,    83,  2618,     7,   152,    51,     2]),
 tensor(0))

In [31]:
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=lambda batch: (pad_sequence([item[0] for item in batch], batch_first=True, padding_value=0), torch.tensor([item[1] for item in batch])))
test_loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=lambda batch: (pad_sequence([item[0] for item in batch], batch_first=True, padding_value=0), torch.tensor([item[1] for item in batch])))

In [32]:
class FFNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(FFNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc1 = nn.Linear(embedding_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        embedded = self.embedding(x)
        embedded_avg = torch.mean(embedded, dim=1)
        x = self.fc1(embedded_avg)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.sigmoid(x)
        return x

In [34]:
# Define hyperparameters
vocab_size = len(vocab)
embedding_dim = 500
hidden_dim = 128
#output_dim = len(label_encoder.classes_)
output_dim = y_train.max() + 1
learning_rate = 0.01
epochs = 10

# Instantiate the model, loss function, and optimizer
model = FFNN(vocab_size, embedding_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [35]:
# Training loop
for epoch in range(epochs):
    model.train()
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
    
    print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}')

Epoch 1/10, Loss: 0.43851688504219055
Epoch 2/10, Loss: 0.4423471689224243
Epoch 3/10, Loss: 0.3188078701496124
Epoch 4/10, Loss: 0.43826207518577576
Epoch 5/10, Loss: 0.3963536024093628
Epoch 6/10, Loss: 0.5216236710548401
Epoch 7/10, Loss: 0.35494187474250793


KeyboardInterrupt: 

In [36]:
# Evaluation
model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    _, predicted = torch.max(outputs, 1)
    accuracy = accuracy_score(y_test_tensor.numpy(), predicted.numpy())
    print(f'Accuracy on test set: {accuracy}')

RuntimeError: [enforce fail at alloc_cpu.cpp:114] data. DefaultCPUAllocator: not enough memory: you tried to allocate 7000000000 bytes.

## Pretrained Natural Language Processing