# NMSU CSCI-5435 Assignment 3 Task 2

## Relevant Information

In [1]:
#Name:               Tianjie Chen
#Email:              tvc5586@nmsu.edu
#File Creation Date: Feb/27/2025
#Purpose of File:    NMSU CSCI-5435 Assignment 3 Task 2
#Last Edit Date:     Feb/27/2025
#Last Edit Note:     File creation
#GenAI used:         False

## Load Libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import gensim.downloader
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score
from torch.autograd import Variable

## Setup

In [3]:
# Report how many CUDA devices are visible, then run on the first GPU when one
# is available and fall back to the CPU otherwise.
print(torch.cuda.device_count())
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

1


In [4]:
# Training hyper-parameters shared by both experiments in this notebook.
batch_size = 1024  # mini-batch size handed to the DataLoader
epochs     = 20    # number of training epochs

# Seed the random number generators so that weight initialisation and
# DataLoader shuffling are reproducible under "Restart Kernel -> Run All".
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

In [5]:
# Location of the JSON-lines news dataset (a relative path, so it is resolved
# against the notebook's working directory).
DATA_PATH = "News_Category_Dataset_v2.json"

# Read one JSON record per line and keep only the 'short_description' column;
# every other column of the dataset is discarded here.
train_data = pd.read_json(DATA_PATH, lines = True)['short_description']

## Preprocessing

In [6]:
# Normalise the raw descriptions in one chained pass:
#   lower-case -> hyphens become spaces -> drop every character that is not an
#   apostrophe, ampersand, word character or whitespace -> trim both ends.
train_data = (
    train_data.str.lower()
    .str.replace("-", " ", regex=True)
    .str.replace(r"[^'\&\w\s]", "", regex=True)
    .str.strip()
)
# Wrap every description in explicit sentence-boundary markers. From here on
# train_data is a plain Python list of strings, not a pandas Series, so this
# cell cannot be re-run without reloading the data first.
train_data = [" ".join(("<start>", text, "<end>")) for text in train_data]

In [7]:
# Sanity check: show the first preprocessed description, including the
# <start>/<end> markers added during preprocessing.
print(train_data[0])

<start> she left her husband he killed their children just another day in america <end>


In [25]:
# Number of descriptions in the preprocessed corpus.
print(len(train_data))

200853


## Use Pre-trained Tokenizer

### Tokenization

In [8]:
from transformers import BertTokenizerFast

###
# define Vocab
###
class Vocab:
    """Frequency-ordered vocabulary that maps tokens to integer ids.

    Ids are assigned in this order: the special tokens first (in the order
    given), then the corpus tokens from most to least frequent. Tokens with
    the same count keep the order in which they were first seen. A corpus
    token that is also a special token is not listed a second time, so the
    special tokens always keep their leading ids.

    Unknown tokens are encoded as id 0, i.e. the first special token (callers
    are expected to pass the unknown-token marker first).

    Args:
        list_of_sentence: iterable of raw sentences.
        tokenization: callable taking ``list_of_sentence`` and yielding one
            list of tokens per sentence.
        special_token: sequence of tokens placed at the start of the vocabulary.
        max_tokens: optional cap on the vocabulary size, special tokens
            included; ``None`` (or ``0``) keeps every token.
    """

    def __init__(self, list_of_sentence, tokenization, special_token, max_tokens=None):
        # Remember the tokenization callable so that _get_tokens works without
        # relying on any module-level tokenizer object.
        self._tokenization = tokenization
        # count vocab frequency
        vocab_freq = {}
        for sentence_tokens in tokenization(list_of_sentence):
            for token in sentence_tokens:
                vocab_freq[token] = vocab_freq.get(token, 0) + 1
        # sort by frequency (stable: ties keep first-seen order)
        by_frequency = sorted(vocab_freq, key=vocab_freq.get, reverse=True)
        # create vocab list; skip corpus tokens that collide with a special
        # token, otherwise the duplicate would steal the special token's id
        specials = list(special_token)
        reserved = set(specials)
        self.vocabs = specials + [tok for tok in by_frequency if tok not in reserved]
        if max_tokens:
            self.vocabs = self.vocabs[:max_tokens]
        self.stoi = {v: i for i, v in enumerate(self.vocabs)}

    def _get_tokens(self, list_of_sentence):
        """Yield one token list per sentence using the stored tokenization callable."""
        yield from self._tokenization(list_of_sentence)

    def get_itos(self):
        """Return the id-to-token list (the position in the list is the id)."""
        return self.vocabs

    def get_stoi(self):
        """Return the token-to-id dictionary."""
        return self.stoi

    def append_token(self, token):
        """Append ``token`` at the end of the vocabulary and map it to the new last id."""
        self.vocabs.append(token)
        # Only the appended entry changes, so there is no need to rebuild the map.
        self.stoi[token] = len(self.vocabs) - 1

    def __call__(self, list_of_tokens):
        """Encode tokens as ids; tokens missing from the vocabulary become 0."""
        return [self.stoi.get(token, 0) for token in list_of_tokens]

    def __len__(self):
        return len(self.vocabs)

    def __repr__(self):
        return "Vocab(size={})".format(len(self.vocabs))

###
# generate Vocab
###
# Upper bound on the vocabulary size (passed to Vocab as max_tokens below).
max_word = 50000

# create tokenizer
# NOTE(review): this is the *cased* BERT checkpoint while the corpus was
# lower-cased during preprocessing — confirm that this pairing is intended.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

# Must manually add the start and end tokens, otherwise
# the tokenizer will separate them into three tokens
tokenizer.add_tokens(["<start>", "<end>"])

# define tokenization function
def yield_tokens(data):
    """Lazily tokenize every text in ``data``.

    Yields one list of tokens per input text. The module-level ``tokenizer``
    is looked up when each text is processed, so rebinding that name changes
    which tokenizer is used.
    """
    yield from (tokenizer.tokenize(text) for text in data)

# build vocabulary list
# "<unk>" is the only special token, so it receives id 0; the remaining ids
# follow corpus frequency and the vocabulary is capped at max_word entries.
vocab = Vocab(
    train_data,
    tokenization=yield_tokens,
    special_token=["<unk>"],
    max_tokens=max_word,
)

# get list for index-to-word, and word-to-index.
itos = vocab.get_itos()
stoi = vocab.get_stoi()

In [9]:
# Preview only the first entries: the vocabulary may hold up to max_word
# tokens, far too many to print in full.
print(itos[:20])



In [10]:
# Preview only the first token-to-id pairs instead of printing the whole mapping.
print(dict(list(stoi.items())[:20]))



### Build Training Sequences (Sliding Windows)

In [21]:
# Window length: 100 context tokens plus the token to predict.
seq_len = 100 + 1
input_seq = []
for sentence in train_data:
    token_ids = vocab(tokenizer.tokenize(sentence))
    # Slide a window of seq_len ids across the sentence; sentences shorter
    # than seq_len produce an empty range and therefore contribute nothing.
    last_start = len(token_ids) - seq_len
    input_seq.extend(
        token_ids[start:start + seq_len] for start in range(last_start + 1)
    )
print("The number of training input sequence :{}".format(len(input_seq)))
input_seq = np.array(input_seq)

The number of training input sequence :16921


In [22]:
# Split every window into its context (all ids but the last) and the
# next-token target (the last id).
X, y = input_seq[:,:-1], input_seq[:,-1]

In [23]:
# Inspect the token ids of the first context window.
print(X[0])

[    1    25   134     3   647   482    44   501  2061     4   785  1182
    92     4     3    84  1491     6 11620  1124  1158     9    58   434
    24    73   271    19   113  3105  2569  4144     3   306   663  2971
    57  3105  2569     9   468    12    48   685    18     4  1717    12
    14   846    32  1754    64    94    80    13 11277     5   134     7
   281   123   229  6205  3553    21    19   113    40    79   449    60
     4     3  3677  1009  2477   712     4   228    45    68   948  2147
    27    39 14563   179     3    84     9    59  2838     6  3032 11220
  1457    29   521   521]


In [14]:
# Print the vocabulary object itself (shows its string representation).
print(vocab)

<__main__.Vocab object at 0x7e623c5710a0>


### Configure Model

In [None]:
embedding_dim = 128  # size of each token embedding vector

class SimpleLM(nn.Module):
    """Feed-forward next-token language model over a fixed-size context.

    The context ids are embedded, the embeddings are concatenated into one
    vector and passed through ReLU-activated linear layers; the final layer
    produces one logit per vocabulary entry.

    Args:
        vocab_size: number of entries in the vocabulary (also the number of
            output logits).
        embedding_dim: size of each token embedding.
        hidden_dim: width of the hidden layers.
        context_len: number of context tokens the model consumes. When left
            as ``None`` it falls back to the notebook-level ``seq_len - 1``
            at construction time, which is the original behaviour.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim=512, context_len=None):
        super().__init__()

        if context_len is None:
            # Backward-compatible default: the global window length minus the target token.
            context_len = seq_len - 1
        self.context_len = context_len

        self.embedding = nn.Embedding(
            vocab_size,
            embedding_dim,
        )
        # The first layer sees all context embeddings concatenated together.
        self.hidden = nn.Linear(embedding_dim * context_len, hidden_dim)
        self.hidden_l = nn.Linear(hidden_dim, hidden_dim)
        self.classify = nn.Linear(hidden_dim, vocab_size)
        self.relu = nn.ReLU()

    def forward(self, inputs):
        """Return the logits over the vocabulary for a batch of context ids.

        ``inputs`` is an integer tensor of token ids; after embedding,
        everything from dimension 1 onwards is flattened, so each row must
        flatten to ``embedding_dim * context_len`` features.
        """
        outs = self.embedding(inputs)
        outs = torch.flatten(outs, start_dim=1)
        outs = self.hidden(outs)
        outs = self.relu(outs)
        # NOTE(review): the same hidden_l layer is applied twice, i.e. both
        # hidden steps share one set of weights — confirm this is intended.
        outs = self.hidden_l(outs)
        outs = self.relu(outs)
        outs = self.hidden_l(outs)
        outs = self.relu(outs)
        logits = self.classify(outs)
        return logits

### Train Model

In [None]:
# Pair every target id with its context window; each batch is yielded as
# (labels, contexts) and the pairs are reshuffled every epoch.
dataloader = DataLoader(list(zip(y, X)), batch_size=batch_size, shuffle=True)

# Fresh model and optimizer for this experiment.
model = SimpleLM(len(vocab), embedding_dim).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)

progress_template = "Epoch {} - loss: {:2.4f} - accuracy: {:2.4f}"
for epoch in range(epochs):
    for labels, seqs in dataloader:
        targets = labels.to(device)  # moved to the device once, reused below
        # optimize
        optimizer.zero_grad()
        logits = model(seqs.to(device))
        loss = F.cross_entropy(logits, targets)
        loss.backward()
        optimizer.step()
        # accuracy of the current mini-batch (not an epoch-wide average)
        hits = (logits.argmax(dim=1) == targets).float().sum()
        accuracy = hits / len(labels)
        # "\r" rewrites the same console line, so only the last batch of each
        # epoch stays visible once the epoch ends
        print(progress_template.format(epoch + 1, loss.item(), accuracy), end="\r")
    print("")

### Text Generation

In [None]:
# Ids of the sentence markers in the vocabulary that is current when this cell
# runs. They are kept for reference only: pred_output re-encodes the markers
# on every call so that it stays correct after the tokenizer/vocabulary are rebuilt.
start_index = stoi["<start>"]
end_index = stoi["<end>"]
max_output = 128  # maximum number of tokens generated per prompt

def pred_output(sentence, progressive_output=True):
    """Greedily extend ``sentence`` with the current module-level ``model``.

    The prompt is encoded the same way as the training sentences (a leading
    "<start>" marker followed by the text) with the current ``tokenizer`` and
    ``vocab``. At every step the most likely next token is appended, until the
    generated tokens end with the encoding of "<end>" or ``max_output`` tokens
    have been produced.

    Args:
        sentence: prompt text. It is used as given; no lower-casing or
            punctuation stripping is applied here.
        progressive_output: when True, print the decoded sequence after every
            generated token.

    Returns:
        The list of token ids: encoded prompt followed by the generated ids.
    """
    # Read the context width from the model itself rather than hard-coding it:
    # the first linear layer consumes context_len concatenated embeddings.
    context_len = model.hidden.in_features // model.embedding.embedding_dim

    test_seq = vocab(tokenizer.tokenize("<start> " + sentence))
    # Depending on the tokenizer, the end marker is one token or several pieces.
    end_ids = vocab(tokenizer.tokenize("<end>"))
    prompt_len = len(test_seq)
    unk_index = 0  # id that Vocab assigns to unknown tokens; used as left padding

    with torch.no_grad():  # inference only, no gradients needed
        for _ in range(max_output):
            window = test_seq[-context_len:]
            # Left-pad prompts that are shorter than the model's context so the
            # input always has the width the first linear layer expects.
            window = [unk_index] * (context_len - len(window)) + window
            input_tensor = torch.tensor([window], dtype=torch.int64).to(device)
            pred_index = model(input_tensor).argmax(dim=-1).item()
            test_seq.append(pred_index)
            if progressive_output:
                for i in test_seq:
                    print(itos[i], end=" ")
                print("\n")
            # Only the generated part may trigger the stop condition.
            generated = test_seq[prompt_len:]
            if end_ids and generated[-len(end_ids):] == end_ids:
                break
    return test_seq

In [None]:
# Generate a continuation for each sample prompt, printing the sequence after
# every generated token.
for prompt in (
    "in the united states president",
    "the man has accused by",
    "now he was expected to",
):
    _ = pred_output(prompt, progressive_output=True)

## Re-train Tokenizer

### Tokenization

In [None]:
# Feed the corpus to the tokenizer trainer lazily, 1000 sentences at a time.
training_corpus = (
    train_data[i : i + 1000]
    for i in range(0, len(train_data), 1000)
)

# create tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

# Train tokenizer (new WordPiece vocabulary of at most 52000 entries)
tokenizer = tokenizer.train_new_from_iterator(training_corpus, 52000)

# The freshly loaded and retrained tokenizer does not know the sentence
# markers, so they must be added again; otherwise each marker is split into
# several pieces and "<start>"/"<end>" never appear in the vocabulary as
# single tokens, unlike in the pre-trained-tokenizer experiment.
tokenizer.add_tokens(["<start>", "<end>"])

# The yield_tokens helper defined in the first tokenization cell looks up the
# module-level `tokenizer` at call time, so it already uses the retrained
# tokenizer and does not need to be defined a second time.

# build vocabulary list
vocab = Vocab(
    train_data,
    tokenization=yield_tokens,
    special_token=["<unk>"],
    max_tokens=max_word,
)

# get list for index-to-word, and word-to-index.
itos = vocab.get_itos()
stoi = vocab.get_stoi()

### Build Training Sequences (Sliding Windows)

In [None]:
# Window length: 5 context tokens plus the token to predict.
seq_len = 5 + 1
input_seq = []
for sentence in train_data:
    token_ids = vocab(tokenizer.tokenize(sentence))
    # Slide a window of seq_len ids across the sentence; sentences shorter
    # than seq_len produce an empty range and therefore contribute nothing.
    last_start = len(token_ids) - seq_len
    input_seq.extend(
        token_ids[start:start + seq_len] for start in range(last_start + 1)
    )
print("The number of training input sequence :{}".format(len(input_seq)))
input_seq = np.array(input_seq)

In [None]:
# Split every window into its context (all ids but the last) and the
# next-token target (the last id).
X, y = input_seq[:,:-1], input_seq[:,-1]

### Train Model

In [None]:
# Pair every target id with its context window; each batch is yielded as
# (labels, contexts) and the pairs are reshuffled every epoch.
dataloader = DataLoader(list(zip(y, X)), batch_size=batch_size, shuffle=True)

# Fresh model and optimizer for the retrained-tokenizer experiment.
model = SimpleLM(len(vocab), embedding_dim).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)

progress_template = "Epoch {} - loss: {:2.4f} - accuracy: {:2.4f}"
for epoch in range(epochs):
    for labels, seqs in dataloader:
        targets = labels.to(device)  # moved to the device once, reused below
        # optimize
        optimizer.zero_grad()
        logits = model(seqs.to(device))
        loss = F.cross_entropy(logits, targets)
        loss.backward()
        optimizer.step()
        # accuracy of the current mini-batch (not an epoch-wide average)
        hits = (logits.argmax(dim=1) == targets).float().sum()
        accuracy = hits / len(labels)
        # "\r" rewrites the same console line, so only the last batch of each
        # epoch stays visible once the epoch ends
        print(progress_template.format(epoch + 1, loss.item(), accuracy), end="\r")
    print("")

### Text Generation

In [None]:
# Generate a continuation for each sample prompt with the model trained on the
# retrained tokenizer, printing the sequence after every generated token.
for prompt in (
    "in the united states president",
    "the man has accused by",
    "now he was expected to",
):
    _ = pred_output(prompt, progressive_output=True)