This is our Embedding model, designed to compete with our TF-IDF model. We will take the more accurate model for our final product, but we wanted to explore both options and test them against each other.

In [None]:
from string import punctuation
import pandas as pd
import numpy as np
import torch

print(punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [None]:
df = pd.read_csv("billsTrain.csv")
if df.isnull().values.any():
  df = df.dropna()
df.head()

Unnamed: 0,Title,Major,Minor
1880,A bill to establish a new program of health ca...,3.0,301.0
1881,An Act to provide for pension reform.,5.0,503.0
1882,A bill to provide for the regulation of surfac...,8.0,805.0
1883,A bill to provide that meetings of Government ...,2.0,208.0
1884,A bill to amend the Internal Revenue Code of 1...,6.0,601.0


In [None]:
inputs = df['Title']
labelsMajor = df['Major']
labelsMinor = df['Minor']

In [None]:
# Process each title individually to remove punctuation
inputs = inputs.str.replace(r'\.(?!\d)', '. ')

processed_titles = []
for title in inputs:
    clean_title = ''.join([char for char in title if char not in punctuation])
    processed_titles.append(clean_title)

# Combine all processed words into a single list for vocabulary building
# This ensures the vocabulary is built from all unique words across all titles
all_text_for_vocab = ' '.join(processed_titles)
words = all_text_for_vocab.split()
words[:50]

['A',
 'bill',
 'to',
 'establish',
 'a',
 'new',
 'program',
 'of',
 'health',
 'care',
 'delivery',
 'and',
 'comprehensive',
 'health',
 'care',
 'delivery',
 'and',
 'comprehensive',
 'health',
 'care',
 'benefits',
 'including',
 'catastrophic',
 'coverage',
 'to',
 'be',
 'available',
 'to',
 'aged',
 'persons',
 'and',
 'to',
 'employed',
 'unemployed',
 'and',
 'lowincome',
 'individuals',
 'at',
 'a',
 'cost',
 'related',
 'to',
 'their',
 'income',
 'An',
 'Act',
 'to',
 'provide',
 'for',
 'pension']

In [None]:
from collections import Counter
counts = Counter(words)
vocab = sorted(counts, key=counts.get, reverse=True)
# stats about vocabulary
print('Unique words: ', len((vocab))) # should ~ 74000+
print()
vocab[-10:]
## Build a dictionary that maps words to integers
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)} # here 1 means the index starts from 1

Unique words:  21047



In [None]:
words_ints = []
for title_text in processed_titles: # Iterate over the correctly processed titles
  if title_text.strip(): # Check if the cleaned title is not empty
    words_ints.append([vocab_to_int[word] for word in title_text.split() if word in vocab_to_int])
  else:
    words_ints.append([]) # Append an empty list for titles that became empty after cleaning

type(words_ints)

list

In [None]:
# Define non_zero_idx by filtering out empty sequences from words_ints
non_zero_idx = [i for i, seq in enumerate(words_ints) if len(seq) > 0]

words_ints = [words_ints[ii] for ii in non_zero_idx]
encoded_labels = np.array([labelsMajor.iloc[ii] for ii in non_zero_idx])
print('Number of Bills after removing outliers: ', len(words_ints))

Number of Bills after removing outliers:  59757


We've now cleaned our inputs. We removed bills with N/As, removed punctuation, and created a vocabulary list of words.

In [None]:
def pad_features(words_ints, seq_length):
  features = np.zeros((len(words_ints), seq_length), dtype=int)
  for i, row in enumerate(words_ints):
    if len(row) > 0: # Only assign if the row is not empty
      to_assign = np.array(row)[:seq_length]
      features[i, -len(to_assign):] = to_assign
  return features

In [None]:
seq_length = 50
features = pad_features(words_ints, seq_length=seq_length)

The vast majority of bills are shorter than 50 words, but we padded it to 50 because some observed entries are 20 or 30-odd words long. There may be longer bills too, and 50 seemed relatively safe.

In [None]:
features

array([[   0,    0,    0, ...,    1,   60,   45],
       [   0,    0,    0, ...,    7,  140,  510],
       [   0,    0,    0, ...,    7,   21,   22],
       ...,
       [   0,    0,    0, ...,  935,    9,  878],
       [   0,    0,    0, ...,    7,   21,   22],
       [   0,    0,    0, ...,    6, 3427,  874]])

In [None]:
labelsMajor.head()

Unnamed: 0,Major
1880,3.0
1881,5.0
1882,8.0
1883,2.0
1884,6.0


In [None]:
split_frac = 0.8
# We now split data into training, validation, and test data (features and labels, x and y)
split_idx = int(len(features)*split_frac)
train_x, remaining_x = features[:split_idx], features[split_idx:]
train_y, remaining_y = encoded_labels[:split_idx], encoded_labels[split_idx:]
test_idx = int(len(remaining_x)*0.5)
val_x, test_x = remaining_x[:test_idx], remaining_x[test_idx:]
val_y, test_y = remaining_y[:test_idx], remaining_y[test_idx:]
## print out the shapes of your resultant feature data
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape),
"\nValidation set: \t{}".format(val_x.shape),
"\nTest set: \t\t{}".format(test_x.shape))

			Feature Shapes:
Train set: 		(47805, 50) 
Validation set: 	(5976, 50) 
Test set: 		(5976, 50)


Well, that's the data prep. Most of it is very similar to what we did for Transformers, since that's what I based it off of...

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

In [None]:
# create Tensor datasets
train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))
# dataloaders
batch_size = 64
# make sure we SHUFFLE that training data
# drop_last=True will drop the last batch if the size is less than the given batch_size
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size, drop_last=True)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size, drop_last=True)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size, drop_last=True)
# obtain one batch of training data
dataiter = iter(train_loader)
sample_x, sample_y = next(dataiter)
print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print()
print('Sample label size: ', sample_y.size()) # batch_size
print('Sample label: \n', sample_y)

Sample input size:  torch.Size([64, 50])
Sample input: 
 tensor([[    0,     0,     0,  ...,   424,   234,   874],
        [    0,     0,     0,  ..., 15919, 15920,  1543],
        [    0,     0,     0,  ...,  2371,    27,   466],
        ...,
        [    0,     0,     0,  ...,    18,   258,    61],
        [    0,     0,     0,  ...,   362,     1,   187],
        [    0,     0,     0,  ...,    39,    17,   773]])

Sample label size:  torch.Size([64])
Sample label: 
 tensor([ 1., 99., 21.,  9.,  6., 99.,  8.,  5., 99.,  3., 10., 20.,  4., 10.,
         6.,  7.,  9.,  5.,  8., 20., 15., 15., 20., 12., 15., 20., 99., 18.,
        99., 12., 12.,  7., 13., 10., 12.,  1., 12.,  7.,  8.,  8., 15., 13.,
         8., 12., 13.,  1.,  7., 99., 14.,  7., 20., 16., 12.,  8., 18., 20.,
        13., 16.,  4.,  4., 16., 20., 20., 20.], dtype=torch.float64)


In [None]:
# First checking if GPU is available
train_on_gpu=torch.cuda.is_available()
if(train_on_gpu):
  print('Training on GPU.')
else:
  print('No GPU available, training on CPU.')

Training on GPU.


In [None]:
class PositionalEmbedding(nn.Module):

  def __init__(self, sequence_length, input_dim, output_dim):
    super(PositionalEmbedding, self).__init__()

    # Use standard nn.Embedding for token embeddings, with padding_idx=0
    self.token_embeddings = nn.Embedding(input_dim, output_dim, padding_idx=0)

    # Use standard nn.Embedding for positional embeddings
    # The weights will be randomly initialized by default
    self.position_embeddings = nn.Embedding(sequence_length, output_dim)

    self.sequence_length = sequence_length

  def forward(self, inputs):
    device = inputs.device
    # Create position indices for the entire sequence length
    positions = torch.arange(0, self.sequence_length).unsqueeze(0).to(device)

    embedded_tokens = self.token_embeddings(inputs)
    embedded_positions = self.position_embeddings(positions)

    # Add token embeddings and positional embeddings
    embedded_combined = embedded_tokens + embedded_positions
    return embedded_combined

In [None]:
vocab_size = len(vocab_to_int)+1 # +1 for the 0 padding
embed_dim = 512
# Define the embedding layer with positional information
input_embed=PositionalEmbedding(seq_length, vocab_size, embed_dim)
input_embed
dataiter = iter(train_loader)
sample_x, sample_y = next(dataiter)
sample_x.shape, sample_y.shape
sample_emd=input_embed(sample_x)

In [None]:
output_size = 23
embedding_dim = 1024
hidden_dim = 512
n_layers = 3

class PredictRNN(nn.Module):
    """
    The RNN model that will be used to perform prediction of categories.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        """
        Initialize the model by setting up the layers.
        """
        super(PredictRNN, self).__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        #TODO: add embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                            dropout=drop_prob, batch_first=True)

        # dropout layer
        self.dropout = nn.Dropout(0.3)

        # linear layer - removed sigmoid
        self.fc = nn.Linear(hidden_dim, output_size)


    def forward(self, x, hidden):
        """
        Perform a forward pass of our model on some input and hidden state.
        """
        batch_size = x.size(0)


        x = x.long()

        #TODO: compute embeddings and lstm_out
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)

        lstm_out = lstm_out[:, -1, :] # getting the last time step output

        # dropout and fully-connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out) # output raw logits

        # return last output and hidden state
        return out, hidden


    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data

        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())

        return hidden

net = PredictRNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)

print(net)

PredictRNN(
  (embedding): Embedding(21048, 1024)
  (lstm): LSTM(1024, 512, num_layers=3, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=512, out_features=23, bias=True)
)


In [None]:
dataiter = iter(train_loader)
sample_x, sample_y = next(dataiter)
sample_x.shape, sample_y.shape

(torch.Size([64, 50]), torch.Size([64]))

In [None]:
sample_emd=input_embed(sample_x)
sample_emd.shape

torch.Size([64, 50, 512])

In [None]:
class TransformerEncoder(nn.Module):

  def __init__(self, embed_dim, dense_dim, num_heads):
    super(TransformerEncoder, self).__init__()
    self.embed_dim = embed_dim
    self.dense_dim = dense_dim
    self.num_heads = num_heads

#Input and output both have size (batch_size, seq_len, embed_dim)
    self.attention = nn.MultiheadAttention(embed_dim,num_heads,batch_first=True)
#TODO: define a two-layer Feed-forward network with hidden layer size dense_dim and output layer size embed_
    self.dense_proj = nn.Sequential(
    nn.Linear(embed_dim, dense_dim),
    nn.ReLU(),
    nn.Linear(dense_dim, embed_dim)
    )
#TODO: define two layer normalization layers
    self.layernorm_1 = nn.LayerNorm(embed_dim)
    self.layernorm_2 = nn.LayerNorm(embed_dim)

  def forward(self, inputs):
    attention_output, _  = self.attention(inputs, inputs, inputs)
    proj_input = inputs + attention_output
    proj_output = self.dense_proj(proj_input)
    return self.layernorm_2(proj_input + proj_output)

In [None]:
class TransformerEncoderModel(nn.Module):

  def __init__(self, vocab_size, embed_dim, num_heads, dense_dim, sequence_length):
    super(TransformerEncoderModel, self).__init__()
    self.embedding = PositionalEmbedding(sequence_length,vocab_size,embed_dim)
    self.transformer_encoder =TransformerEncoder(embed_dim,dense_dim,num_heads)
    self.dropout = nn.Dropout(0.2)
    self.fc = nn.Linear(embed_dim, 22)
    self.sigmoid = nn.Sigmoid()

  def forward(self, inputs):
    x = self.embedding(inputs)
#TODO: compute the transformer output
    x = self.transformer_encoder(x) # x has shape (Batch, Seq_Len, Embed_dim)
    x,_ = torch.max(x, dim=1) # x has shape (Batch, Embd_dim)
    x = self.dropout(x) # pass dropout layer
    x = self.fc(x) # pass a linear layer
    return self.sigmoid(x) # pass sigmoid activation

In [None]:
import numpy as np

num_heads = 2
dense_dim = 1024

num_classes = int(np.max(encoded_labels))

class TransformerEncoderModel(nn.Module):

  def __init__(self, vocab_size, embed_dim, num_heads, dense_dim, sequence_length, num_classes):
    super(TransformerEncoderModel, self).__init__()
    self.embedding = PositionalEmbedding(sequence_length,vocab_size,embed_dim)
    self.transformer_encoder =TransformerEncoder(embed_dim,dense_dim,num_heads)
    self.dropout = nn.Dropout(0.2)
    self.fc = nn.Linear(embed_dim, num_classes) # Correct output size
    # self.sigmoid = nn.Sigmoid() # Removed sigmoid for CrossEntropyLoss

  def forward(self, inputs):
    x = self.embedding(inputs)
    x = self.transformer_encoder(x) # x has shape (Batch, Seq_Len, Embed_dim)
    x,_ = torch.max(x, dim=1) # x has shape (Batch, Embd_dim)
    x = self.dropout(x) # pass dropout layer
    x = self.fc(x) # pass a linear layer
    return x # Return logits directly

model = TransformerEncoderModel(vocab_size, embed_dim, num_heads, dense_dim, seq_length, num_classes)
print(model)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
num_epochs = 35

TransformerEncoderModel(
  (embedding): PositionalEmbedding(
    (token_embeddings): Embedding(21048, 512, padding_idx=0)
    (position_embeddings): Embedding(50, 512)
  )
  (transformer_encoder): TransformerEncoder(
    (attention): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
    )
    (dense_proj): Sequential(
      (0): Linear(in_features=512, out_features=1024, bias=True)
      (1): ReLU()
      (2): Linear(in_features=1024, out_features=512, bias=True)
    )
    (layernorm_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (layernorm_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=512, out_features=99, bias=True)
)


We set the number of epochs to 35, but in a real testing environment this would probably be lower since we seem to start overfitting around epoch 15.

In [None]:
def train(model, train_loader, val_loader, num_epochs=10, train_on_gpu=False):
  # Move model to GPU if available before optimizer initialization
  if train_on_gpu:
    model.cuda()

  optimizer = torch.optim.Adam(model.parameters())

  for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    for inputs, labels in train_loader:
      if(train_on_gpu):
        inputs,labels = inputs.cuda(), labels.cuda()
      # Adjust labels to be 0-indexed for CrossEntropyLoss if they are 1-indexed
      # and ensure they are Long type.
      labels = (labels - 1).long()

      outputs = model(inputs) # Model now outputs (batch_size, num_classes) logits, no squeeze(1)
      loss = criterion(outputs, labels) # Pass Long type labels
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
      total_loss += loss.item()
    print(f"Epoch {epoch + 1}/{num_epochs}, Training Loss: {total_loss / len(train_loader)}")

      ### Validation Loop ###
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
      for inputs, labels in valid_loader:
        if(train_on_gpu):
          inputs, labels= inputs.cuda(), labels.cuda()
        labels = (labels - 1).long() # Adjust labels for validation data too

        outputs = model(inputs) # Get logits
        _, predicted = torch.max(outputs, 1) # Get predicted class index for multi-class

        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        val_acc = 100 * correct / total
      print(f"Epoch {epoch + 1}/{num_epochs}, Validation Accuracy: {val_acc:.2f}%")

In [None]:
train(model, train_loader, valid_loader, num_epochs, train_on_gpu)

Epoch 1/35, Training Loss: 1.5320034976299584
Epoch 1/35, Validation Accuracy: 67.36%
Epoch 2/35, Training Loss: 0.738849476459199
Epoch 2/35, Validation Accuracy: 73.81%
Epoch 3/35, Training Loss: 0.5400345933001898
Epoch 3/35, Validation Accuracy: 76.81%
Epoch 4/35, Training Loss: 0.4569833791807254
Epoch 4/35, Validation Accuracy: 78.02%
Epoch 5/35, Training Loss: 0.37111592278803324
Epoch 5/35, Validation Accuracy: 78.78%
Epoch 6/35, Training Loss: 0.3364115332630301
Epoch 6/35, Validation Accuracy: 78.81%
Epoch 7/35, Training Loss: 0.30540328973220116
Epoch 7/35, Validation Accuracy: 78.18%
Epoch 8/35, Training Loss: 0.2926688487004818
Epoch 8/35, Validation Accuracy: 78.48%
Epoch 9/35, Training Loss: 0.25770867520059404
Epoch 9/35, Validation Accuracy: 79.10%
Epoch 10/35, Training Loss: 0.22520568766868387
Epoch 10/35, Validation Accuracy: 78.85%
Epoch 11/35, Training Loss: 0.21289917920284154
Epoch 11/35, Validation Accuracy: 79.89%
Epoch 12/35, Training Loss: 0.2053865227029326

We tried modifying some hyperparameters in our testing, such as num_heads, embedding_dim, and epoches, but we never really got above mid-80s % accuracy. We present this version because it is the latest one we have.

In [None]:
def predict_category(text, model, vocab_to_int, seq_length, train_on_gpu):
    # 1. Preprocess the input text
    clean_text = ''.join([char for char in text if char not in punctuation])
    words = clean_text.split()

    # Convert words to integers, handling out-of-vocabulary words by skipping them
    text_ints = [vocab_to_int[word] for word in words if word in vocab_to_int]

    if not text_ints:
        print("No known words in the input text after preprocessing.")
        return None

    # 2. Pad the sequence
    # The pad_features function expects a list of lists, so wrap text_ints
    padded_features = pad_features([text_ints], seq_length=seq_length)

    # Convert to PyTorch tensor
    input_tensor = torch.from_numpy(padded_features).long()

    # Move to GPU if available
    if train_on_gpu:
        input_tensor = input_tensor.cuda()

    # 3. Set model to evaluation mode
    model.eval()

    # 4. Perform inference
    with torch.no_grad():
        output_logits = model(input_tensor)

    # 5. Interpret the output
    # Apply softmax to get probabilities and get the predicted class index
    probabilities = torch.softmax(output_logits, dim=1)
    _, predicted_index = torch.max(probabilities, 1)

    # Adjust the predicted index back to the original 1-indexed label
    predicted_label = predicted_index.item() + 1

    return predicted_label

# Sample text to test the model
sample_bill_title = "An Act to Protect Hawai'is Coastal Waters from Overfishing"

# Get prediction
predicted_major_category = predict_category(
    sample_bill_title, model, vocab_to_int, seq_length, train_on_gpu
)

if predicted_major_category is not None:
    print(f"Sample Bill Title: {sample_bill_title}")
    print(f"Predicted Major Category: {predicted_major_category}")

# Test with another example (from df for comparison)
sample_from_df = df['Title'].iloc[0]
original_label = labelsMajor.iloc[0]
predicted_from_df = predict_category(
    sample_from_df, model, vocab_to_int, seq_length, train_on_gpu
)

if predicted_from_df is not None:
    print(f"\nSample from Dataset: {sample_from_df}")
    print(f"Original Major Category: {original_label}")
    print(f"Predicted Major Category: {predicted_from_df}")

Sample Bill Title: An Act to Protect Hawai'is Coastal Waters from Overfishing
Predicted Major Category: 4

Sample from Dataset: A bill to establish a new program of health care delivery and comprehensive health care delivery and comprehensive health care benefits (including catastrophic coverage), to be available to aged persons, and to employed, unemployed, and low-income individuals, at a cost related to their income.
Original Major Category: 3.0
Predicted Major Category: 3


For the sake of comparison with the other models, notably TF-IDF, we wanted to sample some errors and evaluate what they were like.

In [None]:
mismatched_records = []
count_mismatches = 0
max_mismatches = 30

# Calculate the starting index in the original filtered data for the test set
# This maps the 0-indexed test_x/test_y indices back to the original `features` array indices
start_index_for_test_set_in_filtered_data = split_idx + test_idx

print("Searching for mismatched predictions in the test set...")
for i in range(len(test_x)):
    if count_mismatches >= max_mismatches:
        break

    # Get the actual original index from df for the current test sample
    # This uses the `non_zero_idx` to map back to the original df row index
    original_df_index_for_current_test_sample = non_zero_idx[start_index_for_test_set_in_filtered_data + i]

    # Get the original title from the dataframe using the mapped index
    original_title = df['Title'].iloc[original_df_index_for_current_test_sample]

    # Get the true major category (these are already 1-indexed from encoded_labels)
    true_major_category = test_y[i]

    # Predict the category using the defined function
    predicted_major_category = predict_category(
        original_title, model, vocab_to_int, seq_length, train_on_gpu
    )

    # Compare prediction with true label
    # predict_category returns 1-indexed labels, and true_major_category is also 1-indexed
    if predicted_major_category is not None and predicted_major_category != true_major_category:
        mismatched_records.append({
            "Original Title": original_title,
            "Original Major Category": int(true_major_category),
            "Predicted Major Category": int(predicted_major_category)
        })
        count_mismatches += 1

if mismatched_records:
    print(f"\nFound {len(mismatched_records)} records where prediction did not match the original major category:")
    for record in mismatched_records:
        print("-" * 70)
        print(f"Original Title: {record['Original Title']}")
        print(f"Original Major Category: {record['Original Major Category']}")
        print(f"Predicted Major Category: {record['Predicted Major Category']}")
else:
    print("\nNo mismatches found in the processed test samples, or less than 10 records were processed.")


Searching for mismatched predictions in the test set...

Found 30 records where prediction did not match the original major category:
----------------------------------------------------------------------
Original Title: A bill to amend the Public Health Service Act to provide for revision of the National Institute on Aging.
Original Major Category: 13
Predicted Major Category: 3
----------------------------------------------------------------------
Original Title: A bill to defer from income certain amounts deferred pursuant to State or local public employee deferred compensation plans.
Original Major Category: 5
Predicted Major Category: 20
----------------------------------------------------------------------
Original Title: A bill to authorize the establishment of an international emergency wheat reserve, and for other related purposes.
Original Major Category: 4
Predicted Major Category: 18
----------------------------------------------------------------------
Original Title: A bi

In [None]:
model_path = 'transformer_model.pt'
torch.save(model.state_dict(), model_path)
print(f"Model saved to {model_path}")

Model saved to transformer_model.pt


We also built, trained, and evaluated a transformer model for minor category prediction, with an adjusted output layer with more dimensions.

In [None]:
encoded_labels_minor = np.array([labelsMinor.iloc[ii] for ii in non_zero_idx])

train_y_minor = encoded_labels_minor[:split_idx]
remaining_y_minor = encoded_labels_minor[split_idx:]

val_y_minor = remaining_y_minor[:test_idx]
test_y_minor = remaining_y_minor[test_idx:]

print("Minor Label Shapes:")
print("Train set: \t\t{}".format(train_y_minor.shape),
      "\nValidation set: \t{}".format(val_y_minor.shape),
      "\nTest set: \t\t{}".format(test_y_minor.shape))

Minor Label Shapes:
Train set: 		(47805,) 
Validation set: 	(5976,) 
Test set: 		(5976,)


In [None]:
import torch.nn as nn
import torch.optim as optim

# 1. Determine the total number of unique minor categories from encoded_labels_minor
# The instruction specifies to add 1 because the labels are 1-indexed, and to use np.max.
# This assumes the maximum label value directly corresponds to the number of classes when 0-indexed.
# If the minor labels were sequential 1, 2, ..., N, then int(np.max(labels)) would be N.
# Since they are codes (e.g., 301, 503), the model will learn to predict logits for a sparse range up to max_minor_code+1.
num_minor_classes = int(np.max(encoded_labels_minor)) + 1
print(f"Number of minor classes: {num_minor_classes}")

# 2. Create a new instance of the TransformerEncoderModel for minor categories
# Use the previously defined vocab_size, embed_dim, num_heads, dense_dim, and seq_length
minor_model = TransformerEncoderModel(
    vocab_size,
    embed_dim,
    num_heads,
    dense_dim,
    seq_length,
    num_minor_classes
)

# 3. Print the minor_model to inspect its architecture
print(minor_model)

# 4. Define the loss function for minor_model
minor_criterion = nn.CrossEntropyLoss()

# 5. Define the optimizer for minor_model
minor_optimizer = optim.Adam(minor_model.parameters(), lr=0.001)

Number of minor classes: 10000
TransformerEncoderModel(
  (embedding): PositionalEmbedding(
    (token_embeddings): Embedding(21048, 512, padding_idx=0)
    (position_embeddings): Embedding(50, 512)
  )
  (transformer_encoder): TransformerEncoder(
    (attention): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
    )
    (dense_proj): Sequential(
      (0): Linear(in_features=512, out_features=1024, bias=True)
      (1): ReLU()
      (2): Linear(in_features=1024, out_features=512, bias=True)
    )
    (layernorm_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (layernorm_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=512, out_features=10000, bias=True)
)


In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# Create Tensor datasets for minor categories
train_data_minor = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y_minor))
valid_data_minor = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y_minor))

# Dataloaders for minor categories
batch_size = 64

train_loader_minor = DataLoader(train_data_minor, shuffle=True, batch_size=batch_size, drop_last=True)
valid_loader_minor = DataLoader(valid_data_minor, shuffle=True, batch_size=batch_size, drop_last=True)

print("Minor category data loaders created.")

# Train the minor category model
def train_minor(model, train_loader, val_loader, num_epochs, train_on_gpu, criterion, optimizer):
  # Move model to GPU if available
  if train_on_gpu:
    model.cuda()

  for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    for inputs, labels in train_loader:
      if(train_on_gpu):
        inputs, labels = inputs.cuda(), labels.cuda()
      # Adjust labels to be 0-indexed for CrossEntropyLoss (if they are 1-indexed)
      labels = (labels - 1).long() # Assuming labels are 1-indexed

      outputs = model(inputs)
      loss = criterion(outputs, labels)
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
      total_loss += loss.item()
    print(f"Epoch {epoch + 1}/{num_epochs}, Training Loss: {total_loss / len(train_loader)}")

      ### Validation Loop ###
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
      for inputs, labels in val_loader:
        if(train_on_gpu):
          inputs, labels = inputs.cuda(), labels.cuda()
        labels = (labels - 1).long() # Adjust labels for validation data too

        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)

        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        val_acc = 100 * correct / total
      print(f"Epoch {epoch + 1}/{num_epochs}, Validation Accuracy: {val_acc:.2f}%")


train_minor(minor_model, train_loader_minor, valid_loader_minor, num_epochs, train_on_gpu, minor_criterion, minor_optimizer)

Minor category data loaders created.
Epoch 1/35, Training Loss: 3.377789859158104
Epoch 1/35, Validation Accuracy: 41.68%
Epoch 2/35, Training Loss: 1.7772896563878966
Epoch 2/35, Validation Accuracy: 54.42%
Epoch 3/35, Training Loss: 1.2042789970582037
Epoch 3/35, Validation Accuracy: 61.16%
Epoch 4/35, Training Loss: 0.9098488804322465
Epoch 4/35, Validation Accuracy: 62.01%
Epoch 5/35, Training Loss: 0.7730387424096345
Epoch 5/35, Validation Accuracy: 64.05%
Epoch 6/35, Training Loss: 0.6642481481541578
Epoch 6/35, Validation Accuracy: 64.26%
Epoch 7/35, Training Loss: 0.5734091485992833
Epoch 7/35, Validation Accuracy: 65.36%
Epoch 8/35, Training Loss: 0.5225439204287593
Epoch 8/35, Validation Accuracy: 66.16%
Epoch 9/35, Training Loss: 0.48494999662001714
Epoch 9/35, Validation Accuracy: 66.57%
Epoch 10/35, Training Loss: 0.4375516613770586
Epoch 10/35, Validation Accuracy: 66.60%
Epoch 11/35, Training Loss: 0.42812554054861096
Epoch 11/35, Validation Accuracy: 66.60%
Epoch 12/35,

We see that the validation accuracy for this model peaks around epoch 20, implying that the model begins to overfit soon after that. We also see that the overall accuracy is lower than that of the TF-IDF model. We did some optimization on this model, and while we are confident it is not perfect, we think that the TF-IDF model is the better choice.