# Installing Dependencies

In [23]:
!pip install transformers datasets torch accelerate pandas



# Import Libraries

In [17]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import TrainingArguments, Trainer
from datasets import load_dataset

# Import Model

In [18]:
# Load pre-trained GPT-2 model and tokenizer
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Importing and Processing Dataset

In [22]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Set the pad_token for the tokenizer
tokenizer.pad_token = tokenizer.eos_token

# Load your dataset in a CSV file
dataset = load_dataset("csv", data_files="data.csv")

import pandas as pd
data = pd.read_csv('data.csv')
print(data.head())

# Split the dataset into training and validation sets
train_data = dataset["train"].select([i for i in range(len(dataset["train"])) if i % 10 != 0])  # Use 90% of the data for training
val_data = dataset["train"].select([i for i in range(len(dataset["train"])) if i % 10 == 0])  # Use 10% of the data for validation

# Tokenize the input and target sequences
def tokenize_function(examples):
    inputs = tokenizer(examples['Bad_Practices'], return_tensors='pt', padding='max_length', max_length=512, truncation=True)
    labels = tokenizer(examples['Good_Practices'], return_tensors='pt', padding='max_length', max_length=512, truncation=True)
    return {'input_ids': inputs['input_ids'], 'labels': labels['input_ids']}

# Apply tokenization to the datasets
train_data = train_data.map(tokenize_function, batched=True)
val_data = val_data.map(tokenize_function, batched=True)

                     Bad_Practices                      Good_Practices
0  <table alt=header>Title</table>   <table alt='header'>Title</table>
1                      <tr>Content                    <tr>Content</tr>
2    <h2 src='description'>Content  <h2 src='description'>Content</h2>
3                      <table>Link                 <table>Link</table>
4          <img src='description'>      <img src='description' alt=''>


# Train Model

In [24]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./model',
    overwrite_output_dir=True,
    num_train_epochs=0.5,
    per_device_train_batch_size=2,
    eval_strategy="steps",
    eval_steps=100,
    save_steps=500,
    logging_steps=100,
    logging_dir='./logs',
    report_to=None
)

# Create a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
)

# Fine-tune the model
trainer.train()

Step,Training Loss,Validation Loss
100,0.0199,0.019064
200,0.0196,0.017813
300,0.018,0.01817
400,0.0186,0.017858
500,0.0188,0.017333
600,0.0182,0.017137
700,0.0176,0.016685
800,0.0179,0.01764
900,0.0182,0.016867
1000,0.0177,0.016806


TrainOutput(global_step=1510, training_loss=0.01792855885447256, metrics={'train_runtime': 341.7734, 'train_samples_per_second': 8.836, 'train_steps_per_second': 4.418, 'total_flos': 789101936640000.0, 'train_loss': 0.01792855885447256, 'epoch': 0.5})

# Save Model

In [25]:
trainer.save_model()

# Running Inference

In [32]:
# Inference Example
example_input = "<h1>Heading"
input_ids = tokenizer(example_input, return_tensors="pt")["input_ids"].to(device)
attention_mask = torch.ones(input_ids.shape, device=device)
output_ids = model.generate(input_ids, 
                            max_length=512, 
                            num_return_sequences=1, 
                            top_k=50, 
                            top_p=0.95, 
                            attention_mask=attention_mask,
                            do_sample=True,)

# Decode and print the corrected HTML code
decoded_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print("Original HTML code:", example_input)
print("Corrected HTML code:", decoded_output)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Original HTML code: <h1>Heading
Corrected HTML code: <h1>Heading style</h1>


In [36]:
# Inference Example
example_input = "</table>"
input_ids = tokenizer(example_input, return_tensors="pt")["input_ids"].to(device)
attention_mask = torch.ones(input_ids.shape, device=device)
output_ids = model.generate(input_ids, 
                            max_length=512, 
                            num_return_sequences=1, 
                            top_k=50, 
                            top_p=0.95, 
                            attention_mask=attention_mask,
                            do_sample=True,)

# Decode and print the corrected HTML code
decoded_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print("Original HTML code:", example_input)
print("Corrected HTML code:", decoded_output)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Original HTML code: </table>
Corrected HTML code: </table>Hello</table>
