# NMSU CSCI-5435 Assignment 5

## Relevant Information

In [2]:
#Name:               Tianjie Chen
#Email:              tvc5586@nmsu.edu
#File Creation Date: Apr/2/2025
#Purpose of File:    NMSU CSCI-5435 Assignment 5
#Last Edit Date:     Apr/3/2025
#Last Edit Note:     Modify text generation function
#GenAI used:         False

## Load Libraries

In [3]:
import os

# Default this process to physical GPU 1, but respect a device selection that
# the shell or a job scheduler has already exported. The variable has to be in
# place before CUDA is initialised, which is why this cell precedes the torch
# import.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "1")

In [4]:
import pandas as pd
import numpy as np

import torch
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader
from mingpt.utils import set_seed
# Fix the random seed once, up front, so data sampling and training are
# repeatable (presumably seeds the Python/NumPy/PyTorch RNGs -- confirm in
# mingpt.utils).
set_seed(3407)

## Settings

In [5]:
# Pick the compute device: the first visible CUDA device when one is
# available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [6]:
# Path of the news CSV, relative to the notebook's working directory.
DATA_PATH = "news_summary.csv"

## Data Preprocessing

In [7]:
# Load the raw dataset into a DataFrame using pandas' default parsing options.
df = pd.read_csv(DATA_PATH)

In [8]:
# Sanity check: list the columns available in the loaded frame.
print(df.columns)

Index(['headlines', 'text'], dtype='object')


In [9]:
# Average character length of every column, collected into a one-row frame.
# `.str.len()` is used instead of `.apply(len)` so that missing values (NaN)
# are skipped by the mean instead of raising `TypeError: object of type
# 'float' has no len()`.
result = pd.DataFrame([{col: df[col].str.len().mean() for col in df}])

print(result)

   headlines        text
0  57.427109  356.907643


In [10]:
# Build one training string per row: article body first, then the separator,
# then the headline, wrapped in explicit start/end markers.
data = [
    "<start> " + text + " <tl;dr> " + headline + " <end>"
    for headline, text in zip(df['headlines'], df['text'])
]

In [11]:
# Hold out the first five examples for the generation demo; train on the rest.
# NOTE: this rebinds `data`, so re-running the cell drops five more rows and
# changes the held-out set -- run it once per kernel session.
test, data = data[:5], data[5:]

In [12]:
# Show the first training example in its final "<start> ... <tl;dr> ... <end>" form.
print(data[0])

<start> Pakistani singer Rahat Fateh Ali Khan has denied receiving any notice from the Enforcement Directorate over allegedly smuggling foreign currency out of India. "It would have been better if the authorities would have served the notice first if any and then publicised this," reads a press release issued on behalf of Rahat. The statement further called the allegation "bizarre". <tl;dr> Rahat Fateh Ali Khan denies getting notice for smuggling currency <end>


## Vocab

In [13]:
from transformers import BertTokenizerFast

class Vocab:
    """Frequency-ordered token vocabulary with token <-> index lookups.

    Index layout: the entries of ``special_token`` come first (in the order
    given), followed by every distinct token produced by ``tokenization``,
    sorted by descending frequency (ties keep first-seen order).

    Args:
        list_of_sentence: raw sentences the vocabulary is built from; passed
            unchanged to ``tokenization``.
        tokenization: callable that receives ``list_of_sentence`` and returns
            an iterable of token sequences, one per sentence.
        special_token: sequence of tokens placed at the front of the
            vocabulary. ``__call__`` maps unknown tokens to index 0, so the
            first special token acts as the "unknown" token.
        max_tokens: optional cap on the vocabulary size, special tokens
            included. A falsy value (``None`` or ``0``) means no cap.
    """

    def __init__(self, list_of_sentence, tokenization, special_token, max_tokens=None):
        # count how often each token occurs
        vocab_freq = {}
        for sentence_tokens in tokenization(list_of_sentence):
            for token in sentence_tokens:
                vocab_freq[token] = vocab_freq.get(token, 0) + 1

        # most frequent first; `sorted` is stable, so ties keep first-seen order
        by_frequency = sorted(vocab_freq, key=vocab_freq.get, reverse=True)

        # special tokens always occupy the lowest indices
        self.vocabs = list(special_token) + by_frequency
        if max_tokens:
            self.vocabs = self.vocabs[:max_tokens]
        self.stoi = {v: i for i, v in enumerate(self.vocabs)}

    def _get_tokens(self, list_of_sentence):
        """Yield the token list of each sentence.

        Note: this relies on a module-level ``tokenizer`` object rather than
        on anything stored on the instance.
        """
        for sentence in list_of_sentence:
            tokens = tokenizer.tokenize(sentence)
            yield tokens

    def get_itos(self):
        """Return the index-to-token list (the live list, not a copy)."""
        return self.vocabs

    def get_stoi(self):
        """Return the token-to-index dict (the live dict, not a copy)."""
        return self.stoi

    def append_token(self, token):
        """Append ``token`` at the end of the vocabulary.

        The mapping is updated in place, so references obtained earlier from
        ``get_itos()`` / ``get_stoi()`` stay in sync with the vocabulary
        instead of silently going stale. If ``token`` is already present it is
        appended again and its lookup index moves to the new (last) position.
        """
        self.vocabs.append(token)
        self.stoi[token] = len(self.vocabs) - 1

    def __call__(self, list_of_tokens):
        """Map tokens to their indices; unknown tokens map to index 0."""
        lookup = self.stoi
        return [lookup.get(token, 0) for token in list_of_tokens]

    def __len__(self):
        """Number of entries in the vocabulary."""
        return len(self.vocabs)
        
###
# generate Vocab
###
# Upper bound on the vocabulary size (special tokens included) that is
# passed to Vocab as `max_tokens`.
max_word = 50000

# create tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

# Must manually add the start, end and separator markers, otherwise
# the tokenizer will split each of them into several pieces
tokenizer.add_tokens(["<start>", "<end>", "<tl;dr>"])

# define tokenization function
def yield_tokens(data):
    """Lazily yield the token list of every text in ``data``."""
    for text in data:
        yield tokenizer.tokenize(text)

# build vocabulary list
vocab = Vocab(
    data,
    tokenization=yield_tokens,
    special_token=["<unk>"],
    max_tokens=max_word,
)

# Add the <pad> token *before* taking the lookup tables below, so that the
# module-level `stoi` / `itos` include it. It is appended after the size cap
# and therefore always receives the last index.
vocab.append_token("<pad>")

# get list for index-to-word, and word-to-index.
itos = vocab.get_itos()
stoi = vocab.get_stoi()

In [14]:
# Sanity check: each of the three marker tokens should map to a single index.
print(vocab(tokenizer.tokenize("<start> test test test test <tl;dr> ignore ignore <end>")))

[12, 796, 796, 796, 796, 13, 9226, 9226, 14]


## Create Dataset

In [15]:
import pickle

class TextDataset(Dataset):
    """Fixed-length (input, target) index sequences for language-model training.

    Every item is built from one row of ``data``: the row is tokenized, mapped
    to vocabulary indices, truncated to ``length`` and padded. The target is
    the input shifted left by one position.

    Rows are drawn at random *without replacement*; the ``idx`` passed to
    ``__getitem__`` is ignored. Once every row has been served the bookkeeping
    is reset and a new pass starts.

    Args:
        data: sequence of raw text examples.
        vocab: callable mapping a token list to a list of indices; must also
            support ``len()``. The last index (``len(vocab) - 1``) is used as
            the padding index for the inputs.
        length: number of tokens in every returned sequence.
        tokenizer: object with a ``tokenize(text)`` method. When omitted, the
            module-level ``tokenizer`` is used.
    """

    def __init__(self, data, vocab, length=1000, tokenizer=None):
        self.data      = data
        self.length    = length
        self.vocab     = vocab
        self.tokenizer = tokenizer
        # row indices already served during the current pass over the data
        self.hashes    = set()

    def __len__(self):
        """Number of rows in the underlying data."""
        return len(self.data)

    def get_vocab_size(self):
        """Size of the vocabulary, i.e. the number of distinct indices."""
        return len(self.vocab)

    def get_block_size(self):
        """Maximum sequence length the model is configured for.

        NOTE(review): ``__getitem__`` yields sequences of exactly ``length``
        tokens, so ``2 * length - 1`` is more than training needs. The value
        is kept unchanged because the model configuration is derived from it.
        """
        return self.length * 2 - 1

    def _resolve_tokenizer(self):
        """Return the instance tokenizer, falling back to the module-level one."""
        if self.tokenizer is not None:
            return self.tokenizer
        return tokenizer

    def _draw_unused_row(self):
        """Pick a random row index that has not been served in this pass."""
        n_rows = len(self.data)
        if n_rows == 0:
            raise IndexError("cannot draw an example from an empty dataset")

        # Every row has been used: start a new pass instead of spinning
        # forever in the rejection loop below.
        if len(self.hashes) >= n_rows:
            self.hashes.clear()

        while True:
            indx = int(np.random.randint(n_rows))
            if indx not in self.hashes:
                self.hashes.add(indx)
                return indx

    def __getitem__(self, idx):
        """Return one ``(x, y)`` pair of int64 tensors of shape ``(length,)``.

        ``idx`` is ignored; see the class docstring. Both tensors are moved to
        the module-level ``device``.
        """
        indx = self._draw_unused_row()
        tok = self._resolve_tokenizer()

        # tokenize to a list of word indices
        tokens = self.vocab(tok.tokenize(self.data[indx]))

        # index of the end marker, used to pad the targets
        stop_token = self.vocab(tok.tokenize("<end>"))[0]
        # the padding token is the last vocabulary entry
        pad_token = len(self.vocab) - 1

        # targets are the inputs shifted by one; both limited to `length`
        x = tokens[:self.length]
        y = tokens[1:self.length + 1]

        # pad features and labels up to `length`
        x = x + [pad_token] * (self.length - len(x))
        y = y + [stop_token] * (self.length - len(y))

        # convert to tensor
        x = torch.tensor(x, dtype=torch.int64).to(device)
        y = torch.tensor(y, dtype=torch.int64).to(device)

        return x, y


In [16]:
# Build the training dataset from the examples that were not held out;
# the sequence length is left at the class default.
train_dataset = TextDataset(data = data, vocab = vocab)

## Train Model

### Define Model & Trainer

In [17]:
# create a GPT instance
from mingpt.model import GPT

# Start from minGPT's default configuration and override only the fields
# that depend on this notebook's data.
model_config = GPT.get_default_config()
# Named architecture preset (the cell output reports the parameter count).
model_config.model_type = 'openai-gpt'
# Vocabulary size and maximum sequence length are taken from the dataset so
# that the model and the data stay consistent.
model_config.vocab_size = train_dataset.get_vocab_size()
model_config.block_size = train_dataset.get_block_size()
model = GPT(model_config)

number of parameters: 106.16M


In [18]:
# create a Trainer object
from mingpt.trainer import Trainer

# Start from the trainer defaults and override the run-specific settings.
train_config = Trainer.get_default_config()
train_config.batch_size = 4            # default is 64
train_config.learning_rate = 5e-5
# Iteration budget for `trainer.run()`.
train_config.max_iters = 10000
# Presumably forwarded to the DataLoader, where 0 keeps batch loading in the
# main process -- relevant because TextDataset returns tensors that are
# already placed on `device`. TODO confirm against mingpt.trainer.
train_config.num_workers = 0
trainer = Trainer(train_config, model, train_dataset)

running on device cuda


### Train Model

In [19]:
def batch_end_callback(trainer):
    """Print timing and training loss on every 100th iteration.

    Args:
        trainer: object exposing ``iter_num``, ``iter_dt`` (scaled by 1000 and
            reported as milliseconds) and ``loss`` (read via ``.item()``).
    """
    step = trainer.iter_num
    if step % 100 != 0:
        return

    elapsed_ms = trainer.iter_dt * 1000
    loss_value = trainer.loss.item()
    print(f"iter_dt {elapsed_ms:.2f}ms; iter {step}: train loss {loss_value:.5f}")

# Register the logger under the 'on_batch_end' event, then start training.
trainer.set_callback('on_batch_end', batch_end_callback)

trainer.run()

iter_dt 0.00ms; iter 0: train loss 10.29015
iter_dt 594.38ms; iter 100: train loss 0.77701
iter_dt 601.60ms; iter 200: train loss 0.69620
iter_dt 603.28ms; iter 300: train loss 0.68510
iter_dt 602.63ms; iter 400: train loss 0.69366
iter_dt 607.23ms; iter 500: train loss 0.59705
iter_dt 604.78ms; iter 600: train loss 0.72141
iter_dt 604.07ms; iter 700: train loss 0.72921
iter_dt 605.06ms; iter 800: train loss 0.69311
iter_dt 606.34ms; iter 900: train loss 0.66488
iter_dt 608.55ms; iter 1000: train loss 0.58155
iter_dt 606.07ms; iter 1100: train loss 0.60268
iter_dt 607.13ms; iter 1200: train loss 0.58757
iter_dt 606.14ms; iter 1300: train loss 0.54505
iter_dt 608.21ms; iter 1400: train loss 0.61567
iter_dt 608.73ms; iter 1500: train loss 0.59274
iter_dt 607.03ms; iter 1600: train loss 0.66253
iter_dt 605.34ms; iter 1700: train loss 0.56279
iter_dt 606.15ms; iter 1800: train loss 0.54145
iter_dt 603.77ms; iter 1900: train loss 0.58959
iter_dt 604.78ms; iter 2000: train loss 0.58182
iter_

## Text Generation

### Generation Function

In [20]:
# Vocabulary index of the "<tl;dr>" separator token.
tldr_index = stoi["<tl;dr>"]

def generate(vocab, tokenizer, prompt, num_samples=1, steps=50, do_sample=True):
    """Generate headline(s) for ``prompt`` with the trained model and print them.

    The prompt is tokenized and cut right after its ``<tl;dr>`` separator, so
    any reference headline contained in ``prompt`` is never shown to the
    model. The model then continues the sequence for ``steps`` tokens; the
    text up to the first ``<end>`` marker is reported as the headline.

    Uses the module-level ``model`` and ``device``.

    Args:
        vocab: vocabulary; called to map tokens to indices and queried through
            ``get_stoi()`` / ``get_itos()`` for the reverse direction.
        tokenizer: object with a ``tokenize(text)`` method.
        prompt: text containing the ``<tl;dr>`` separator, normally of the
            form ``"<start> article <tl;dr> headline <end>"``.
        num_samples: number of independent continuations to generate.
        steps: number of new tokens generated per sample.
        do_sample: sample from the predicted distribution when true,
            otherwise the model's non-sampling decoding is used.

    Returns:
        None. One "Text / Headline" report is printed per sample.

    Raises:
        ValueError: if ``num_samples`` is smaller than 1 or the tokenized
            prompt does not contain the ``<tl;dr>`` separator.
    """
    if num_samples < 1:
        raise ValueError("num_samples must be at least 1")

    # Take the lookup tables from the vocabulary that was passed in rather
    # than from module-level copies.
    separator = vocab.get_stoi()["<tl;dr>"]
    index_to_token = vocab.get_itos()

    test_seq = vocab(tokenizer.tokenize(prompt))
    if separator not in test_seq:
        raise ValueError('prompt does not contain the "<tl;dr>" separator')

    # keep everything up to and including the separator
    prefix = test_seq[: test_seq.index(separator) + 1]
    input_seq = torch.tensor([prefix], dtype=torch.int64).to(device)
    # one identical row per requested sample
    input_seq = input_seq.expand(num_samples, -1)

    # Generation must run in eval mode (dropout off); the previous mode is
    # restored afterwards so further training is unaffected.
    was_training = model.training
    model.eval()
    try:
        # forward the model `steps` times to get samples, in a batch
        y = model.generate(input_seq, max_new_tokens=steps, do_sample=do_sample, top_k=10)
    finally:
        if was_training:
            model.train()

    for sample_no, row in enumerate(y[:num_samples].tolist()):
        decoded = "".join(index_to_token[j] + " " for j in row) + "\n"
        # glue word-piece continuations back onto the preceding piece
        decoded = decoded.replace(" ##", "")

        parts = decoded.split("<tl;dr>")
        headline = parts[1].split("<end>", 1)[0]
        before = parts[0].split("<start>")
        # tolerate prompts that lack the "<start>" marker
        text = before[1] if len(before) > 1 else before[0]

        if sample_no:
            print()
        print(f"Text: {text}\n\nHeadline: {headline}")

### Generating Headlines

In [21]:
# Held-out example 0: one sample, 50 new tokens.
generate(vocab, tokenizer, prompt=test[0], num_samples=1, steps=50)

Text:  Saurav Kant , an alumnus of upGrad and IIIT - B ' s PG Program in Machine learning and Artificial Intelligence , was a Sr Systems Engineer at Infosys with almost 5 years of work experience . The program and upGrad ' s 360 - degree career support helped him transition to a Data Scientist at Tech Mahindra with 90 % salary hike . upGrad ' s Online Power Learning has powered 3 lakh + careers . 

Headline:  Manj Mahaan PNo ' scam ' luru ' s death , PNB 


In [22]:
# Held-out example 1: one sample, 50 new tokens.
generate(vocab, tokenizer, prompt=test[1], num_samples=1, steps=50)

Text:  Kunal Shah ' s credit card bill payment platform , CRED , gave users a chance to win free food from Swiggy for one year . Pranav Kaushik , a Delhi techie , bagged this reward after spending 2000 CRED coins . Users get one CRED coin per rupee of bill paid , which can be used to avail rewards from brands like Ixigo , BookMyShow , UberEats , Cult . Fit and more . 

Headline:  I - in Ideal Bank to invests I - in Uj ' s Uber 


In [23]:
# Held-out example 2: one sample, 50 new tokens.
generate(vocab, tokenizer, prompt=test[2], num_samples=1, steps=50)

Text:  New Zealand defeated India by 8 wickets in the fourth ODI at Hamilton on Thursday to win their first match of the five - match ODI series . India lost an international match under Rohit Sharma ' s captaincy after 12 consecutive victories dating back to March 2018 . The match witnessed India getting all out for 92 , their seventh lowest total in ODI cricket history . 

Headline:  India ' s top Test cricket team to score 100 - 1 win in ODI cricket ? 


In [24]:
# Held-out example 3: one sample, 50 new tokens.
generate(vocab, tokenizer, prompt=test[3], num_samples=1, steps=50)

Text:  With Aegon Life iTerm Insurance plan , customers can enjoy tax benefits on your premiums paid and save up to â¹46 , 800 ^ on taxes . The plan provides life cover up to the age of 100 years . Also , customers have options to insure against Critical Illnesses , Disability and Accidental Death Benefit Rider with a life cover up to the age of 80 years . 

Headline:  WhatsAppi , AccelsApp to be used to â¹3 , 600 crore 


In [25]:
# Held-out example 4: one sample, 50 new tokens.
generate(vocab, tokenizer, prompt=test[4], num_samples=1, steps=50)

Text:  Speaking about the sexual harassment allegations against Rajkumar Hirani , Sonam Kapoor said , " I ' ve known Hirani for many years . . . What if it ' s not true , the [ # MeToo ] movement will get derailed . " " In the # MeToo movement , I always believe a woman . But in this case , we need to reserve our judgment , " she added . Hirani has been accused by an assistant who worked in ' Sanju ' . 

Headline:  When I don ' t have been an event to play : Shahidu 
