# Memory on this machine

In [1]:
import psutil
import subprocess

# Total physical RAM as reported by psutil, in bytes
total_ram = psutil.virtual_memory().total

# Convert bytes to binary gigabytes (1024 ** 3 bytes each)
total_ram_gb = total_ram / (1024 ** 3)

print(f"Total RAM:\n{total_ram_gb:.2f} GB")

# Ask nvidia-smi for the total memory of each GPU. The tool is missing on
# machines without the NVIDIA driver, so report that instead of crashing the cell.
try:
    result = subprocess.run(
        ['nvidia-smi', '--query-gpu=memory.total', '--format=csv'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        check=True,  # surface a non-zero exit status instead of printing empty output
    )
except FileNotFoundError:
    output = "nvidia-smi not found (no NVIDIA driver installed?)"
except subprocess.CalledProcessError as err:
    details = err.stderr.decode('utf-8', errors='replace').strip()
    output = f"nvidia-smi failed with exit code {err.returncode}: {details}"
else:
    output = result.stdout.decode('utf-8')

# Print the output
print("GPU Memory Capacity:")
print(output)

Total RAM:
62.25 GB
GPU Memory Capacity:
memory.total [MiB]
16384 MiB



# Tuning GPT2

In [2]:
!pip install transformers scikit-learn pandas nltk datasets


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
from sklearn.model_selection import train_test_split
from datasets import load_dataset
import nltk
import pandas as pd
import re
import os
import torch

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TextDataset,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    pipeline
)

In [4]:
from pprint import pprint
# Load the "rahular/simple-wikipedia" dataset and show which splits and columns it has.
dataset = load_dataset(path="rahular/simple-wikipedia")
pprint(dataset)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 769764
    })
})


In [5]:
from pprint import pprint

# Hold out 20% of the corpus for evaluation. The fixed seed makes the shuffle,
# and therefore the train/test membership, identical on every run.
split_dataset = dataset['train'].train_test_split(test_size=0.2, seed=42)

train_dataset = split_dataset['train']
test_dataset = split_dataset['test']

pprint(train_dataset)
pprint(test_dataset)

print(f'Train dataset length: {len(train_dataset)}')
print(f'Test dataset length: {len(test_dataset)}')

Dataset({
    features: ['text'],
    num_rows: 615811
})
Dataset({
    features: ['text'],
    num_rows: 153953
})
Train dataset length: 615811
Test dataset length: 153953


In [6]:
# Name of the pretrained checkpoint to fine-tune,
# e.g. 'distilgpt2', 'gpt2' or 'gpt2-large'.
model_name = 'gpt2'

# Directory used as the training output location and later as the model path for generation.
# NOTE(review): the folder name says "gpt2-xl" although model_name is 'gpt2' — confirm this is intended.
model_output_path_name = './models/gpt2-xl-wikipedia-simple'

In [7]:
# Load the tokenizer for the chosen checkpoint and reuse its end-of-sequence
# token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Load the pretrained causal language model that will be fine-tuned.
model = AutoModelForCausalLM.from_pretrained(model_name)



In [8]:
# Batch collator for language modelling; mlm=False disables masked-language-model
# masking, i.e. the causal (next-token) setup is used.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [9]:
def tokenize_function(dataset):
    """Tokenize one batch of examples.

    Args:
        dataset: A batch supplied by ``Dataset.map(..., batched=True)``; its
            ``'text'`` entry is passed to the tokenizer.

    Returns:
        The tokenizer output for the batch, truncated to the tokenizer's
        maximum length and left unpadded.
    """
    # No padding here: padding inside a batched map pads every example to the
    # longest one in its map batch and stores those pad tokens in the dataset.
    # The data collator passed to the Trainer pads each training batch instead.
    return tokenizer(dataset['text'], truncation=True)


# Tokenize the train and test datasets
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/615811 [00:00<?, ? examples/s]

Map:   0%|          | 0/153953 [00:00<?, ? examples/s]

## Training

In [10]:
# The device is left to the Trainer (use_cpu=False); set use_cpu=True below
# to force CPU training instead.
training_args = TrainingArguments(
    # --- Output ---
    output_dir=model_output_path_name,   # Where checkpoints and the final model are written
    overwrite_output_dir=False,          # Do not overwrite existing content of the output directory
    save_steps=10000,                    # Interval between model saves
    # --- Schedule ---
    num_train_epochs=1,                  # Number of passes over the training set
    warmup_steps=300,                    # Learning-rate warmup steps
    # --- Batching ---
    per_device_train_batch_size=6,       # Training batch size per device
    per_device_eval_batch_size=6,        # Evaluation batch size per device
    #gradient_accumulation_steps=4,      # Optional: accumulate gradients to simulate a larger batch
    # --- Evaluation ---
    # NOTE(review): no evaluation strategy is set, so eval_steps appears to have
    # no effect (the logged table shows training loss only) — confirm against the
    # TrainingArguments documentation.
    eval_steps=1000,                     # Intended interval between evaluations
    # --- Hardware ---
    fp16=False,                          # Mixed precision training is disabled
    use_cpu=False,                       # Do not force CPU
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    data_collator=data_collator,
)

# Train from scratch (no checkpoint resume), then persist the model and trainer state.
trainer.train(resume_from_checkpoint=False)
trainer.save_model()
trainer.save_state()

Step,Training Loss
500,3.6265
1000,3.5179
1500,3.4803
2000,3.4598
2500,3.4567
3000,3.4541
3500,3.4232
4000,3.3908
4500,3.4055
5000,3.4209


## Evaluate
We will now test the trained model by creating a pipeline and entering a prompt.

In [14]:
# Build a text-generation pipeline from the fine-tuned weights saved in
# model_output_path_name, using the tokenizer of the base checkpoint.
text_generation_pipeline = pipeline(
    'text-generation',
    max_length=50,
    model=model_output_path_name,
    tokenizer=model_name,
    truncation=False,
    pad_token_id=50256,  # eos_token_id ensures a closed end
    # Use the first GPU when CUDA is available; otherwise fall back to the CPU (-1)
    # so the cell also runs on machines without a GPU.
    device=0 if torch.cuda.is_available() else -1,
)

# generate text based on the prompt
result = text_generation_pipeline(
    'Quarks are',
)[0]['generated_text']

# and display the result
print(result)

Quarks are also made in the other way of measuring. Quarks are made easily, since they are only formed when air is turned. They are not made by an atom. When a particle is not made, it is called quark. It


In [12]:
# # Initialize a question-answering pipeline
# # NOTE(review): disabled experiment. The 'question-answering' task presumably
# # expects a model with an extractive QA head rather than this causal LM —
# # confirm before re-enabling.
# qa_pipeline = pipeline(
#     'question-answering',
#     model=model_output_path_name,
#     tokenizer=model_name,
#     device=-1  # -1 selects the CPU (a GPU would need its index, e.g. 0)
# )

# # Define the question and context
# question = "How did Arjuna defeat Karna?"
# context = "I will retell you the story, how Arjuna stain that foremost of warriors Karna."

# # Perform question-answering
# result = qa_pipeline(
#     {
#         'question': question,
#         'context': context
#     }
# )

# # Display the result
# print(result['answer'])

In [13]:
# # Load the summarization pipeline
# # NOTE(review): disabled experiment. The 'summarization' task presumably
# # expects a sequence-to-sequence model rather than this causal LM —
# # confirm before re-enabling.
# summarization_pipeline = pipeline(
#     'summarization',
#     model=model_output_path_name,
#     tokenizer=model_name,
#     device=-1  # -1 selects the CPU
# )

# # Define the text (context) to be summarized
# context = """
# Arjuna, one of the Pandavas, was a peerless archer. In the Kurukshetra war, he faced many formidable opponents, one of them being Karna, the great warrior on the Kaurava side. After a long and intense battle, Arjuna finally slew Karna, ending the life of one of the foremost warriors of the time. This event was one of the key moments in the Mahabharata, symbolizing the victory of the Pandavas over the Kauravas. Arjuna's skill with the bow, his determination, and the guidance of Lord Krishna were crucial in this victory.
# """

# # Summarize the text using the pipeline
# result = summarization_pipeline(context, min_length=30, max_length=150)

# # Display the result
# print(f"Summary: {result[0]['summary_text']}")