<a href="https://colab.research.google.com/github/tiro2000/Customs/blob/main/Customs_Guide_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#@title 1. Keep this tab alive to prevent Colab from disconnecting you { display-mode: "form" }

#@markdown Press play on the music player that will appear below:
%%html
<audio src="https://oobabooga.github.io/silence.m4a" controls>

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
!pip  install --upgrade transformers
!pip install datasets
!pip install --upgrade accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m54.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m110.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2
Looking in i

In [7]:
from sklearn.model_selection import train_test_split
from datasets import Dataset
import pandas as pd
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token


counter = 0

def tokenize_function(samples):
    global counter
    output = tokenizer(samples["text"], padding='max_length', truncation=True, max_length=1024) # You can change max_length based on your requirement
    if counter < 5:
        print(f"Input length: {len(output['input_ids'])}") 
        counter += 1
    return output

## Load your data
with open('Customs.txt', 'r') as file:
    data = file.read().split("<END>")
# Add the '<END>' token back after splitting
data = [text.strip() + "<END>" for text in data if text.strip() != ""]
# Split the data into a train set and a temporary set (20% of the data)
train_data, temp_data = train_test_split(data, test_size=0.2)

# Split the temporary set into a validation set and a test set
val_data, test_data = train_test_split(temp_data, test_size=0.5)

# Convert the lists to 'Dataset' objects
train_dataset = Dataset.from_pandas(pd.DataFrame(train_data, columns=["text"]))
val_dataset = Dataset.from_pandas(pd.DataFrame(val_data, columns=["text"]))
test_dataset = Dataset.from_pandas(pd.DataFrame(test_data, columns=["text"]))

# Tokenize the data
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/77 [00:00<?, ? examples/s]

Input length: 77


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Input length: 10


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Input length: 10


In [8]:
from google.colab import drive
from transformers import Trainer, TrainingArguments, GPT2LMHeadModel, GPT2Tokenizer
from transformers import DataCollatorForLanguageModeling, TrainerCallback
from transformers import AdamW,Trainer

drive.mount('/content/gdrive')

model = GPT2LMHeadModel.from_pretrained("distilgpt2")

class EarlyStoppingCallback(TrainerCallback):
    def __init__(self, patience=3):
        self.patience = patience
        self.best_loss = None
        self.check = False
        self.stopped_epoch = 0
        self.wait = 0

    def on_evaluate(self, args, state, control, logs=None, **kwargs):
        if logs is not None and "eval_loss" in logs:
            if self.best_loss is None:
                self.best_loss = logs["eval_loss"]
            elif self.best_loss is not None:
                if logs["eval_loss"] < self.best_loss:
                    self.best_loss = logs["eval_loss"]
                    self.wait = 0
                else:
                    self.wait += 1
                    if self.wait >= self.patience:
                        self.stopped_epoch = state.epoch
                        control.should_training_stop = True

    def on_train_end(self, args, state, control, **kwargs):
        if state.epoch != self.stopped_epoch:
            print(f'Early stopping occurred at epoch {self.stopped_epoch}')

class CustomTrainer(Trainer):
    def create_optimizer_and_scheduler(self, num_training_steps: int):
        named_parameters = list(self.model.named_parameters())

        no_decay = ['bias', 'LayerNorm.weight']

        optimizer_grouped_parameters = [
            {'params': [p for n, p in named_parameters if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in named_parameters if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

        self.optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5)
        self.lr_scheduler = self.create_scheduler(num_training_steps)

from transformers import AdamW

# Define the optimizer with weight decay
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.03)

callbacks = [EarlyStoppingCallback(patience=3)]

training_args = TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    gradient_clipping=1.0,
    warmup_steps=100,
    max_steps=800,
    num_train_epochs=30,  # increased for your small dataset
    learning_rate=5e-5,
    fp16=True,
    logging_steps=10,
    save_steps=100,
    eval_steps=100,  # evaluate every 100 steps
    evaluation_strategy="steps",  # evaluate during training
    output_dir="/content/gdrive/MyDrive/ModelCheckpoints/",
    save_total_limit=1,  # only keep the latest checkpoint
)

# Increase dropout rates
model.config.dropout = 0.2
model.config.attn_pdrop = 0.2

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    callbacks=callbacks,
    optimizers=(optimizer, None)  # None as we're not using a learning rate scheduler
)
trainer.train()
trainer.evaluate(test_dataset)
model_save_path = "/content/gdrive/MyDrive/my_model"
# Save the model
trainer.save_model(model_save_path)
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).




Step,Training Loss,Validation Loss
100,2.3483,2.518447
200,1.8575,2.264706
300,1.8065,2.147223
400,1.423,2.098801
500,1.4772,2.070011
600,1.5412,2.052093
700,1.1813,2.044354
800,1.2855,2.045373


Early stopping occurred at epoch 0


('/content/gdrive/MyDrive/my_model/tokenizer_config.json',
 '/content/gdrive/MyDrive/my_model/special_tokens_map.json',
 '/content/gdrive/MyDrive/my_model/vocab.json',
 '/content/gdrive/MyDrive/my_model/merges.txt',
 '/content/gdrive/MyDrive/my_model/added_tokens.json')

In [6]:
model_save_path = "/content/gdrive/MyDrive/my_model"
# Save the model
trainer.save_model(model_save_path)
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

('/content/gdrive/MyDrive/my_model/tokenizer_config.json',
 '/content/gdrive/MyDrive/my_model/special_tokens_map.json',
 '/content/gdrive/MyDrive/my_model/vocab.json',
 '/content/gdrive/MyDrive/my_model/merges.txt',
 '/content/gdrive/MyDrive/my_model/added_tokens.json')

In [11]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model_path = "/content/gdrive/MyDrive/ModelCheckpoints/checkpoint-700"  # Change this path to the one where your model is saved.

model = GPT2LMHeadModel.from_pretrained("/content/gdrive/MyDrive/my_model")
tokenizer = GPT2Tokenizer.from_pretrained("/content/gdrive/MyDrive/my_model")

# Load the tokenizer and model
#tokenizer = GPT2Tokenizer.from_pretrained(model_path)
#model = GPT2LMHeadModel.from_pretrained(model_path)

# Test the model
# Adjust the max_length and temperature as needed
input_text = "What are requirements registering a business ?"  # Enter the beginning of your text here
input_ids = tokenizer.encode(input_text, return_tensors='pt')
# Generate 5 different sequences
output = model.generate(
    input_ids, 
    max_length=150, 
    num_return_sequences=2, 
    do_sample=True, 
    temperature=0.7
)


# Decode and print each sequence
for i, sequence in enumerate(output):
    decoded_output = tokenizer.decode(sequence, skip_special_tokens=True)
    decoded_output = decoded_output.replace("<END>", "")
    print(f"Generated Text {i+1}:")
    print(decoded_output)
    print()


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Text 1:
What are requirements registering a business?
The following table documents the requirements for a business that is registered as a Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted Restricted

Generated Text 2:
What are requirements registering a business?

If you have any questions, please email us at sarah.cotton@theguardian.com or ca

In [15]:
!zip -r /content/my_model.zip /content/gdrive/MyDrive/ModelCheckpoints/checkpoint-700

  adding: content/gdrive/MyDrive/ModelCheckpoints/checkpoint-700/ (stored 0%)
  adding: content/gdrive/MyDrive/ModelCheckpoints/checkpoint-700/config.json (deflated 52%)
  adding: content/gdrive/MyDrive/ModelCheckpoints/checkpoint-700/generation_config.json (deflated 24%)
  adding: content/gdrive/MyDrive/ModelCheckpoints/checkpoint-700/pytorch_model.bin (deflated 9%)
  adding: content/gdrive/MyDrive/ModelCheckpoints/checkpoint-700/training_args.bin (deflated 49%)
  adding: content/gdrive/MyDrive/ModelCheckpoints/checkpoint-700/optimizer.pt (deflated 7%)
  adding: content/gdrive/MyDrive/ModelCheckpoints/checkpoint-700/scheduler.pt (deflated 48%)
  adding: content/gdrive/MyDrive/ModelCheckpoints/checkpoint-700/scaler.pt (deflated 55%)
  adding: content/gdrive/MyDrive/ModelCheckpoints/checkpoint-700/trainer_state.json (deflated 86%)
  adding: content/gdrive/MyDrive/ModelCheckpoints/checkpoint-700/rng_state.pth (deflated 28%)


In [21]:
model_save_path = "/content/gdrive/MyDrive/my_model"
#model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

('/content/gdrive/MyDrive/my_model/tokenizer_config.json',
 '/content/gdrive/MyDrive/my_model/special_tokens_map.json',
 '/content/gdrive/MyDrive/my_model/vocab.json',
 '/content/gdrive/MyDrive/my_model/merges.txt',
 '/content/gdrive/MyDrive/my_model/added_tokens.json')

In [23]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained('/content/gdrive/MyDrive/ModelCheckpoints/checkpoint-700')
model = GPT2LMHeadModel.from_pretrained('/content/gdrive/MyDrive/ModelCheckpoints/checkpoint-700')