IMPORTING LIBRARIES

In [1]:
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling, AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
import torch
import matplotlib.pyplot as plt
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


CODE TO CHECK IF THE GPU IS DETECTED

In [2]:
import torch
torch.cuda.empty_cache()

assert torch.cuda.is_available()

#Get the GPU device name.
device_name = torch.cuda.get_device_name()
n_gpu = torch.cuda.device_count()
print(f"Found device: {device_name}, n_gpu: {n_gpu}")
device = torch.device("cuda")

Found device: NVIDIA A100-SXM-80GB, n_gpu: 4


LOAD THE DATASET

In [3]:
#how to train the model without a test set.
momentum = load_dataset('json', data_files='MPM.json')
#momentum = load_dataset('json', data_files='MPM_conditioned.json', split = 'train')
#momentum = momentum.train_test_split(test_size=0.2)
flattened_mpm = momentum.flatten()
example = flattened_mpm["train"][0]
example

Found cached dataset json (/home/kkonatha/.cache/huggingface/datasets/json/default-688fdf028d397096/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
100%|██████████| 1/1 [00:00<00:00, 513.25it/s]


{'URL': 'https://accure.ai/docs/momentum-user-guide/getting-started-with-momentum/1-getting-started-with-momentum/',
 'Product_Title': 'Momentum',
 'Section_Title': 'Getting Started with Momentum',
 'Section_Num': 1,
 'Article_Title': 'Accessing Momentum',
 'Article_Num': 1,
 'Article_Body': 'Momentum is a web-based system that is accessible via a web browser. To launch Momentum, point your browser address to: \nhttp://<public-ip-or-domain>:8800/mv-admin \nIf you installed Momentum from AWS marketplace, the default port to access Momentum is 8800. \nThe above URL will launch the login page.'}

TOKENIZE THE DATA AND ADD THE PAD TOKENS TO THE DATA

In [4]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

PREPROCESS THE DATA WITH THE REQUIRED FIELDS

In [5]:
def preprocess_function(examples):
    return tokenizer([" ".join(x) for x in examples["Article_Body"]], truncation=True)

MAP THE TOKENIZED DATA

In [6]:
tokenized_mpm = flattened_mpm.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=momentum["train"].column_names,
)

Loading cached processed dataset at /home/kkonatha/.cache/huggingface/datasets/json/default-688fdf028d397096/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e/cache-29a5e2ba64eee4e9_*_of_00004.arrow


ENCODE THE DATA WITH THE FIXED BLOCK SIZE

In [7]:
block_size = 60

def group_texts(examples):
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    total_length = (total_length // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

PREPROCESSED DATA FOR FINE TUNING

In [8]:
mpm_dataset = tokenized_mpm.map(group_texts, batched=True, num_proc=4)

Loading cached processed dataset at /home/kkonatha/.cache/huggingface/datasets/json/default-688fdf028d397096/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e/cache-492df3d1432d8d5e_*_of_00004.arrow


In [9]:
mpm_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1025
    })
})

In [10]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [11]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
import torch

mpm_model = AutoModelForCausalLM.from_pretrained("gpt2")

In [12]:
import torch

# Get the underlying PyTorch model
model = mpm_model.transformer

# Compute the expected sequence length from the model configuration
seq_length = mpm_model.config.n_positions

# Compute the expected input shape
input_shape = (seq_length, None)

print("Input shape:", input_shape)

Input shape: (1024, None)


TRAINING THE DATA

In [None]:
import matplotlib.pyplot as plt
import numpy as np

training_args = TrainingArguments(
    output_dir="gpt_model",
    overwrite_output_dir=True,
    learning_rate=7e-5,
    weight_decay=0.01,
    num_train_epochs=350,
    logging_steps=50,
    save_total_limit=2,
    per_device_train_batch_size=3,
    #gradient_accumulation_steps=4,
    save_steps=10_000,
    evaluation_strategy='no',
    #load_best_model_at_end=True
)

trainer = Trainer(
    model=mpm_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=mpm_dataset['train'],
)

train_output = trainer.train()

Step,Training Loss
50,1.6153
100,1.4625
150,1.4434
200,1.3481
250,1.3109
300,1.2358
350,1.1842
400,1.1303


In [None]:
train_output

SAVE THE MODEL

In [None]:
mpm_model.save_pretrained('/scratch/kkonatha/test8') #then need to load it to generate a test

PROMPT GENERATION

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model = AutoModelForCausalLM.from_pretrained("/scratch/kkonatha/test8")

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    
prompt = ("Momentum provides SQL-based interface for data transformation"
          " Data cleaning, null removal, datatype conversion, column renaming, mathematical transformation, blending, merging, joining with multiple data sources are some of the transformation tasks that can be performed over data created within Momentum")

input_ids = tokenizer(prompt, return_tensors="pt").input_ids
attention_mask = torch.ones_like(input_ids)

gen_tokens = model.generate(
    input_ids,
    do_sample=True,
    attention_mask=attention_mask,
    num_return_sequences=1,
    #return_dict_in_generate=True,
    temperature=0.3,
    max_length=500,
)

gen_text = tokenizer.batch_decode(gen_tokens)[0]
print(gen_text)

VISUALIZATIONS

In [None]:
train_loss_values = train_output.loss_history["training_loss"]
plt.plot(train_loss_values, label="Training loss")
plt.xlabel("Training steps")
plt.ylabel("Loss")
plt.legend()
plt.ylim(0, 3) # Adjust the y-axis limits as needed
plt.show()

In [None]:
#LOSS CURVES

train_output = trainer.train()
train_loss_values = train_output.metrics['train_loss']
plt.plot(train_loss_values, label="Training loss")
plt.xlabel("Training steps")
plt.ylabel("Loss")
plt.legend()
plt.savefig("training_loss.png")
plt.show()

In [None]:
train_loss_values

In [None]:
#PERPLEXITY SCORE AND CURVE
train_loss = trainer.evaluate(mpm_dataset['train'])['eval_loss']
train_perplexity = torch.exp(torch.tensor(train_loss))

print("Train perplexity:", train_perplexity.item())

plt.plot(train_ppl, label='Training Perplexity')
plt.plot(val_ppl, label='Validation Perplexity')
plt.legend()
plt.show()

In [None]:
#ATTENSION MAP
from transformers import AutoTokenizer, AutoModel
from bertviz import head_view

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('gpt2')
model = AutoModelForCausalLM.from_pretrained("/scratch/kkonatha/test8")

# Define input text
input_text = 'Momentum is a web-based system that is accessible via a web browser'

# Tokenize the input text
input_ids = tokenizer.encode(input_text, return_tensors='pt')

# Get model output and attention weights
output = model(input_ids)
attention = output['attentions']

# Visualize attention map for a specific layer and head
layer_id = 0  # layer id
head_id = 0   # head id
attention_map = attention[layer_id][0][head_id].detach().numpy()
head_view(attention_map, input_tokens=tokenizer.convert_ids_to_tokens(input_ids[0]))