
<div>
<img src="./images/text_embedding.png" width="800"/>
</div>

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence

# Load every post of the 20 Newsgroups corpus (subset='all')
newsgroups = fetch_20newsgroups(subset='all')
X_news = newsgroups.data
y_news = newsgroups.target

# Bag-of-words features: English stop words removed, vocabulary capped at
# 1000 terms, and the result densified with .toarray()
vectorizer = CountVectorizer(max_features=1000, stop_words='english')
X_news_vec = vectorizer.fit_transform(X_news).toarray()

# Hold out 20% of the documents for testing; the fixed seed makes the split repeatable
(X_train_news, X_test_news, y_train_news, y_test_news) = train_test_split(
    X_news_vec,
    y_news,
    test_size=0.2,
    random_state=42,
)





KeyboardInterrupt: 

### With count vectorization

In [None]:

# Wrap the NumPy splits as tensors: float32 features, int64 (long) class labels
X_train_news_tensor = torch.tensor(X_train_news, dtype=torch.float32)
X_test_news_tensor = torch.tensor(X_test_news, dtype=torch.float32)
y_train_news_tensor = torch.tensor(y_train_news, dtype=torch.long)
y_test_news_tensor = torch.tensor(y_test_news, dtype=torch.long)

# Pair features with labels so that each split can be batched
train_dataset_news = TensorDataset(X_train_news_tensor, y_train_news_tensor)
test_dataset_news = TensorDataset(X_test_news_tensor, y_test_news_tensor)

# Only the training loader shuffles; the test loader keeps dataset order
batch_size = 64
train_loader_news = DataLoader(
    dataset=train_dataset_news, batch_size=batch_size, shuffle=True
)
test_loader_news = DataLoader(dataset=test_dataset_news, batch_size=batch_size)



In [None]:

# Define the model
class NLPClassifier(nn.Module):
    """One-hidden-layer classifier that returns per-class log-probabilities.

    Args:
        input_dim: Size of each input feature vector.
        hidden_dim: Width of the hidden layer.
        output_dim: Number of classes.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=output_dim)
        self.relu = nn.ReLU()
        # LogSoftmax over dim 1, i.e. across the classes of each row in the batch
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Return log-softmax class scores for the batch ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.softmax(self.fc2(hidden))

# Network sizes: one input per vectorizer feature, one output per newsgroup
input_dim_news = X_news_vec.shape[1]
hidden_dim = 128
output_dim_news = len(newsgroups.target_names)  # Number of classes in 20 Newsgroups dataset

model_news = NLPClassifier(
    input_dim=input_dim_news,
    hidden_dim=hidden_dim,
    output_dim=output_dim_news,
)

# Adam with its default settings; NLLLoss pairs with the model's LogSoftmax output
optimizer_news = optim.Adam(params=model_news.parameters())
criterion = nn.NLLLoss()

In [None]:

# Training loop for 20 Newsgroups dataset
def train_model(model, train_loader, optimizer, criterion, epochs=5):
    """Train ``model`` in place and print the mean batch loss after every epoch.

    Args:
        model: Module to optimise; switched to training mode before the first epoch.
        train_loader: Iterable of ``(inputs, labels)`` batches that supports ``len()``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        criterion: Callable mapping ``(outputs, labels)`` to a scalar loss tensor.
        epochs: Number of passes over ``train_loader``.
    """
    model.train()
    for epoch_number in range(1, epochs + 1):
        loss_total = 0.0
        for batch_inputs, batch_labels in train_loader:
            optimizer.zero_grad()
            batch_loss = criterion(model(batch_inputs), batch_labels)
            batch_loss.backward()
            optimizer.step()
            loss_total += batch_loss.item()
        # Average of the per-batch losses seen during this epoch
        mean_loss = loss_total / len(train_loader)
        print(f"Epoch {epoch_number}/{epochs}, Loss: {mean_loss}")

# Fit the bag-of-words classifier using train_model's default number of epochs
train_model(
    model=model_news,
    train_loader=train_loader_news,
    optimizer=optimizer_news,
    criterion=criterion,
)

In [None]:

def evaluate_model(model, test_loader):
    """Print and return the classification accuracy of ``model`` on ``test_loader``.

    Args:
        model: Module whose output holds one score per class along dim 1.
        test_loader: Iterable of ``(inputs, labels)`` batches.

    Returns:
        float: Fraction of examples whose arg-max prediction equals the label.

    Raises:
        ValueError: If ``test_loader`` yields no examples, because accuracy is
            undefined in that case.
    """
    model.eval()
    correct = 0
    total = 0
    # Gradients are not needed for scoring
    with torch.no_grad():
        for inputs, labels in test_loader:
            predicted = model(inputs).argmax(dim=1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
    if total == 0:
        raise ValueError("test_loader yielded no examples; accuracy is undefined")
    accuracy = correct / total
    print(f"Accuracy: {accuracy}")
    return accuracy

# Report held-out accuracy for the bag-of-words classifier
evaluate_model(model=model_news, test_loader=test_loader_news)

### With embeddings

In [None]:
# Split the raw text with the same seed and test fraction as the
# count-vectorization section
X_train, X_test, y_train, y_test = train_test_split(
    newsgroups.data, newsgroups.target, test_size=0.2, random_state=42
)

# Define the tokenizer
tokenizer = get_tokenizer('basic_english')

# Tokenize both splits using torchtext
train_data_tokens = [tokenizer(text) for text in X_train]
test_data_tokens = [tokenizer(text) for text in X_test]

# Build the vocabulary from the training data only. "<pad>" is listed first so
# that it receives id 0 -- the value used as padding_value when sequences are
# padded -- which gives "<unk>" its own id instead of sharing one with padding.
vocab = build_vocab_from_iterator(train_data_tokens, specials=["<pad>", "<unk>"])
# Out-of-vocabulary lookups fall back to "<unk>" instead of raising
vocab.set_default_index(vocab["<unk>"])



In [None]:
# Peek at the token list produced for the first training document
train_data_tokens[0]

In [None]:
# Raw text of the first training document (the source of train_data_tokens[0])
X_train[0]

In [None]:

# Encode each document as a tensor of token ids, keeping at most
# max_seq_length tokens per document
max_seq_length = 100
unk_index = vocab["<unk>"]


def encode_tokens(tokens):
    """Return the ids of the first ``max_seq_length`` tokens as an int64 tensor.

    Tokens that are not in ``vocab`` map to the "<unk>" id. The dtype is set
    explicitly so that a document with no tokens still yields an integer tensor
    rather than an empty float tensor.
    """
    token_ids = [
        vocab[token] if token in vocab else unk_index
        for token in tokens[:max_seq_length]
    ]
    return torch.tensor(token_ids, dtype=torch.long)


# NOTE: despite their names, these lists hold the un-padded id tensors;
# the padding itself happens in pad_sequence below
train_data_padded = [encode_tokens(tokens) for tokens in train_data_tokens]
test_data_padded = [encode_tokens(tokens) for tokens in test_data_tokens]

# Pad every sequence with id 0 up to the longest sequence of its split,
# giving one row per document (batch_first=True)
X_train_tensor = pad_sequence(train_data_padded, batch_first=True, padding_value=0)
X_test_tensor = pad_sequence(test_data_padded, batch_first=True, padding_value=0)

# Class labels as int64 tensors
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)


In [None]:
# Token ids of the first training document after padding
X_train_tensor[0]

In [None]:
# Reverse lookup: which token was assigned id 22 in the vocabulary?
vocab.lookup_token(22)

In [None]:
class NewsGroupsDataset(Dataset):
    """Map-style dataset yielding ``(token_ids, label)`` pairs from pre-tokenized text.

    Args:
        data_tokens: Sequence of token lists, one per document.
        targets: Sequence of integer class labels aligned with ``data_tokens``.
        vocab: Mapping that supports ``token in vocab`` and ``vocab[token]`` and
            contains the "<unk>" token.
        max_seq_length: Tokens beyond this position are discarded.
    """

    def __init__(self, data_tokens, targets, vocab, max_seq_length=100):
        self.data_tokens = data_tokens
        self.targets = targets
        self.vocab = vocab
        self.max_seq_length = max_seq_length

    def __len__(self):
        """Return the number of documents."""
        return len(self.data_tokens)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` for document ``idx`` as int64 tensors.

        The sequence is truncated to ``max_seq_length`` tokens and is NOT padded;
        padding is left to the collate function. Tokens missing from the
        vocabulary are replaced by the "<unk>" id.
        """
        unk_id = self.vocab["<unk>"]
        tokens = self.data_tokens[idx][:self.max_seq_length]
        token_ids = [self.vocab[token] if token in self.vocab else unk_id for token in tokens]
        # Explicit dtype: an empty token list would otherwise become a float tensor,
        # which an embedding layer cannot index with
        return (
            torch.tensor(token_ids, dtype=torch.long),
            torch.tensor(self.targets[idx], dtype=torch.long),
        )
        
     

In [None]:
# Wrap each tokenized split in the custom Dataset
train_dataset = NewsGroupsDataset(
    data_tokens=train_data_tokens,
    targets=y_train,
    vocab=vocab,
    max_seq_length=max_seq_length,
)
test_dataset = NewsGroupsDataset(
    data_tokens=test_data_tokens,
    targets=y_test,
    vocab=vocab,
    max_seq_length=max_seq_length,
)

In [None]:
# First (token_ids, label) pair; indexing dispatches to __getitem__
train_dataset[0]

In [None]:

batch_size = 32


def collate_batch(batch):
    """Collate ``(token_ids, label)`` pairs into a padded id matrix and a label vector.

    Sequences are padded with id 0 up to the longest sequence in the batch.
    A named function is used (instead of repeating a lambda for each loader) so
    that both loaders share one definition and it can be pickled if the loaders
    are later given worker processes.

    Args:
        batch: List of ``(token_ids, label)`` tensor pairs as produced by the dataset.

    Returns:
        tuple: ``(padded_ids, labels)`` with one row / entry per item of ``batch``.
    """
    sequences, labels = zip(*batch)
    padded_ids = pad_sequence(list(sequences), batch_first=True, padding_value=0)
    return padded_ids, torch.stack(list(labels))


train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_batch)



In [None]:

# Define a simple feedforward neural network with an embedding layer
class FFNN(nn.Module):
    """Bag-of-embeddings classifier: embed tokens, average them, apply a small MLP.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Width of the hidden layer.
        output_dim: Number of classes.
        padding_idx: Optional token id used for padding. When given, positions
            holding this id are excluded from the average, so the result does
            not depend on how much padding a batch carries. When ``None``
            (default), every position is averaged.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, padding_idx=None):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.fc1 = nn.Linear(embedding_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids; dim 1 is the sequence axis that is
                averaged over.
        """
        embedded = self.embedding(x)
        pad_id = self.embedding.padding_idx
        if pad_id is None:
            pooled = torch.mean(embedded, dim=1)
        else:
            # Masked mean: padded positions contribute neither to the sum nor the count
            keep = (x != pad_id).unsqueeze(-1).to(embedded.dtype)
            # clamp avoids 0/0 for a row that consists of padding only
            token_count = keep.sum(dim=1).clamp(min=1.0)
            pooled = (embedded * keep).sum(dim=1) / token_count
        hidden = self.relu(self.fc1(pooled))
        return self.softmax(self.fc2(hidden))


In [None]:
# Largest class id present in the count-vectorization training labels
y_train_news.max()

In [None]:

# Define hyperparameters
vocab_size = len(vocab)
embedding_dim = 500
hidden_dim = 128
# One output per newsgroup, read from the dataset's own label names (as the
# count-vectorization model does) rather than from that section's y_train_news
output_dim = len(newsgroups.target_names)
learning_rate = 0.01
epochs = 10

# Instantiate the model, loss function, and optimizer
model = FFNN(vocab_size, embedding_dim, hidden_dim, output_dim)
# FFNN already ends in LogSoftmax, so NLLLoss is the matching criterion;
# CrossEntropyLoss would apply log-softmax a second time
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


In [None]:


# Training loop
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Report the mean loss over the epoch instead of the last mini-batch only
    # (a single shuffled batch is noisy); max(..., 1) keeps an empty loader from
    # dividing by zero
    print(f'Epoch {epoch+1}/{epochs}, Loss: {running_loss / max(len(train_loader), 1)}')


In [None]:
# Evaluation: score the whole padded test tensor in a single forward pass
model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    # Index of the highest score along dim 1 is the predicted class
    _, predicted = outputs.max(dim=1)
    accuracy = accuracy_score(y_true=y_test_tensor.numpy(), y_pred=predicted.numpy())
    print(f'Accuracy on test set: {accuracy}')

In [None]:
# Pick the best available accelerator instead of assuming Apple's "mps" backend:
# a CUDA GPU first, then Apple Metal (MPS), otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
elif getattr(torch.backends, 'mps', None) is not None and torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

model.to(device)
# A fresh optimizer for the moved model (this also starts Adam's statistics from scratch)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Move the full training set once, outside the loop
X_train_device = X_train_tensor.to(device)
y_train_device = y_train_tensor.to(device)

# Training loop: one full-batch gradient step per epoch
full_batch_epochs = 5
model.train()  # the preceding evaluation cell left the model in eval mode
for epoch in range(full_batch_epochs):
    optimizer.zero_grad()
    outputs = model(X_train_device)
    loss = criterion(outputs, y_train_device)
    loss.backward()
    optimizer.step()
    # The denominator is the number of epochs this loop actually runs
    print(f'Epoch {epoch+1}/{full_batch_epochs}, Loss: {loss.item()}')

In [None]:
# Pick the best available accelerator instead of assuming Apple's "mps" backend:
# a CUDA GPU first, then Apple Metal (MPS), otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
elif getattr(torch.backends, 'mps', None) is not None and torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

model.to(device)
# A fresh optimizer for the moved model (this also starts Adam's statistics from scratch)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        # Batches come off the loader on the CPU; move each one to the model's device
        outputs = model(inputs.to(device))
        loss = criterion(outputs, targets.to(device))
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Mean loss over the epoch rather than the last mini-batch only;
    # max(..., 1) keeps an empty loader from dividing by zero
    print(f'Epoch {epoch+1}/{epochs}, Loss: {running_loss / max(len(train_loader), 1)}')

In [None]:


# Bring the model back to the CPU: the evaluation below feeds X_test_tensor
# without moving it to another device
device = torch.device('cpu')
model.to(device)

# Evaluation
model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    # Index of the highest score along dim 1 is the predicted class
    _, predicted = outputs.max(dim=1)
    accuracy = accuracy_score(y_true=y_test_tensor.numpy(), y_pred=predicted.numpy())
    print(f'Accuracy on test set: {accuracy}')

### IMDB

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd

In [None]:

# Load the IMDB dataset from a CSV file located relative to the notebook;
# later cells read its 'review' and 'sentiment' columns
imdb_data = pd.read_csv('./data/IMDB Dataset.csv')

In [None]:
# Preview the first rows of the loaded frame
imdb_data.head()

In [None]:

# Review texts, one entry per row of the frame
X_imdb = imdb_data['review'].values
# Binary labels: 1 where the sentiment is 'positive', 0 for anything else
y_imdb = imdb_data['sentiment'].eq('positive').astype('int64').values


In [None]:

# train test split
# NOTE(review): test_size=0.7 leaves only 30% of the reviews for training --
# confirm this is intended and not a swapped 0.3.
# NOTE: X_train / X_test / y_train / y_test, tokenizer, the token lists and vocab
# are rebound here, replacing the 20 Newsgroups objects that used the same names.
X_train, X_test, y_train, y_test = train_test_split(
    X_imdb,
    y_imdb,
    test_size=0.7,
    random_state=42,
)

# Define the tokenizer
tokenizer = get_tokenizer('basic_english')

# Tokenize both splits with the torchtext tokenizer
train_data_tokens = list(map(tokenizer, X_train))
test_data_tokens = list(map(tokenizer, X_test))

# Build the vocabulary from the training data; "<unk>" is the only special token
vocab = build_vocab_from_iterator(train_data_tokens, specials=["<unk>"])  # Handle OOV tokens with "<unk>"



#### Exercise

Take some time and create a dataset and model class for the binary classification task above. See what challenges you encounter. See what performance you can get.