In [None]:
pip install transformers[torch]

In [None]:
pip install transformers


In [None]:
!pip install accelerate

In [3]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

In [4]:
# Data Preprocessing and creating a text file for training
def preprocess_data(file_path):
    df = pd.read_csv(file_path)
    df['description'] = df['description'].str.replace(r"[^a-zA-Z0-9\s]", "", regex=True).str.lower().str.strip()
    df.fillna('Unknown', inplace=True)
    df['prompt'] = df['description']
    df['response'] = df['Book_title'] + " by " + df['author_name(s)']
    df['train_data'] = df['prompt'] + " <SEP> " + df['response']
    df['train_data'].to_csv('train_data.txt', header=False, index=False)
    return df

In [5]:
# Model Setup and Training
def train_model():
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')

    train_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path='train_data.txt',
        block_size=128
    )

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False
    )

    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=4,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()

    # Save model and tokenizer
    model.save_pretrained('./saved_model')
    tokenizer.save_pretrained('./saved_model')

    return tokenizer, model

In [12]:
# Load Model and Tokenizer
def load_model():
    tokenizer = GPT2Tokenizer.from_pretrained('./saved_model')
    model = GPT2LMHeadModel.from_pretrained('./saved_model')
    return tokenizer, model

In [21]:
# Recommendation Function
def get_recommendation(description, tokenizer, model):
    input_ids = tokenizer.encode(description, return_tensors="pt")
    output_ids = model.generate(input_ids, max_length=100, eos_token_id=50256)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

In [8]:
# Load data, train model, and prepare tokenizer and model for use
preprocess_data('/content/df_for_gpt2.csv')
tokenizer, model = train_model()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,4.1825
1000,3.6875
1500,3.5443
2000,3.4433
2500,3.3869
3000,3.365
3500,3.2775
4000,3.1611
4500,3.1497
5000,3.1213


Step,Training Loss
500,4.1825
1000,3.6875
1500,3.5443
2000,3.4433
2500,3.3869
3000,3.365
3500,3.2775
4000,3.1611
4500,3.1497
5000,3.1213


In [20]:
tokenizer, model = load_model()
description = "This book for learning python."
recommendation = get_recommendation(description, tokenizer, model)
print("Recommended Book:", recommendation)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Recommended Book: This book for learning python.com is a comprehensive guide to the language and its latest features and features that are available in python 3 and beyond this book is a comprehensive guide to the language and its latest features and features that are available in python 3 and beyond this book is a comprehensive guide to the language and its latest features and features that are available in python 3 and beyond this book is a comprehensive guide to the language and its latest features and features that are available in python 3 and beyond this book is


In [16]:
!zip -r gpt2_finetuned.zip /content/saved_model


  adding: content/saved_model/ (stored 0%)
  adding: content/saved_model/generation_config.json (deflated 24%)
  adding: content/saved_model/vocab.json (deflated 68%)
  adding: content/saved_model/merges.txt (deflated 53%)
  adding: content/saved_model/config.json (deflated 51%)
  adding: content/saved_model/tokenizer_config.json (deflated 54%)
  adding: content/saved_model/special_tokens_map.json (deflated 74%)
  adding: content/saved_model/model.safetensors (deflated 7%)


In [18]:
from google.colab import files

files.download('gpt2_finetuned.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>