In [1]:
#importing necessary libraries
import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
from transformers import GPT2Tokenizer,TFGPT2LMHeadModel
from transformers import TFTrainer,TFTrainingArguments
import wandb
from datasets import load_metric

ModuleNotFoundError: No module named 'transformers'

In [None]:
#loading csv containing all features
# 'Dataset.csv' is a relative path, so it is resolved against the notebook's working directory.
data=pd.read_csv('Dataset.csv')
# Bare last expression: the notebook renders the first rows as a quick preview of the raw frame.
data.head()

In [None]:
#removing brackets from description feature and dropping missing values and duplicates from data
# regex=False makes both replacements literal: '(' and ')' are regex metacharacters, and the
# default value of `regex` differs between pandas versions, so it is spelled out explicitly.
data['Description']=(
    data['Description']
    .str.replace('(','',regex=False)
    .str.replace(')','',regex=False)
)
# Rows with a missing value in any column are dropped first, then exact duplicate rows.
data=data.dropna().drop_duplicates()

In [None]:
#appending the <EOS> end of sentence tag to every description so that the model knows where to stop generating text
overview=[description + '<EOS>' for description in data.Description]

In [None]:
#the first 80% of the descriptions (in their existing order) become the train set, the remaining 20% the test set
split=int(len(overview) * 0.8)
train_data,test_data=overview[:split],overview[split:]

In [None]:
#GPT2 tokenizer used to turn text into token ids; '<EOS>' is registered as the
#end-of-sequence token and '<P>' as the padding token
# NOTE(review): use_fast is normally an AutoTokenizer option; it presumably has no
# effect on the GPT2Tokenizer class itself — confirm before relying on it.
tokenizer = GPT2Tokenizer.from_pretrained(
    "gpt2",
    eos_token='<EOS>',
    pad_token='<P>',
    use_fast=True,
)

In [None]:
"""
encoding test and train data(converting words into tokens or ids which contain input_ids(matrix of id assigned to each word)
and attention_mask(matrix of boolean values where 1 means the model will focus on that word and 0 means it will not focus on it)
"""
# Both splits are tokenized with padding and truncation enabled, each in its own call.
# NOTE(review): because the splits are padded separately, their padded sequence lengths
# may differ — confirm this is acceptable for evaluation.
train_encodings=tokenizer(train_data,padding=True,truncation=True)
test_encodings=tokenizer(test_data,padding=True,truncation=True)

In [None]:
#creating labels (data to predict) so that text predicted can be generated for supervised learning
def create_labels(inputs):
    """Attach language-modelling labels to a batch of encodings, in place.

    For every sequence the labels are a copy of its input ids in which each
    position whose attention mask is 0 (padding) is replaced by -100. The mask
    is applied position by position, so the result is correct whether the
    padding sits at the end or at the start of the sequence.

    Args:
        inputs: mapping with 'input_ids' and 'attention_mask', each a sequence
            of per-example sequences of equal length. A 'labels' entry is
            added (or overwritten) on this same object.

    Returns:
        None. `inputs` is modified in place; 'input_ids' is left untouched.
    """
    labels=[]
    for ids,attention_mask in zip(inputs['input_ids'],inputs['attention_mask']):
        # keep the token id where the mask is set, mark padded positions with -100
        labels.append([token_id if keep else -100
                       for token_id,keep in zip(ids,attention_mask)])
    inputs['labels']=labels
#applying the function to train and test encodings
# create_labels returns nothing: it stores the result under the 'labels' key of the
# object it receives, so both encodings are modified in place here.
create_labels(train_encodings)
create_labels(test_encodings)

In [None]:
#converting train and test dataset into tensorflow dataset slices to fit transformer in tensorflow
# Each dataset element is a (features, labels) pair: features is the encodings converted
# to a plain dict, labels is the 'labels' entry added by create_labels.
# NOTE(review): dict(...) is taken after create_labels ran, so the features dict presumably
# also carries the 'labels' key — confirm that passing labels in both places is intended.
train_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(train_encodings),train_encodings['labels']))

test_dataset = tf.data.Dataset.from_tensor_slices(
   (dict(test_encodings),test_encodings['labels']))

In [None]:
#training configuration handed to the trainer
training_args=TFTrainingArguments(
    # what to run and where results are written
    do_train=True,
    output_dir='results',
    overwrite_output_dir=True,
    # training length and batch sizes
    num_train_epochs=2,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=4,
    # checkpointing
    save_steps=-1,
    # logging / experiment tracking
    report_to='wandb',
    logging_dir='./logs',
    logging_strategy='epoch',
    # NOTE(review): with logging_strategy='epoch' the step interval below is
    # presumably not used — confirm which of the two is intended.
    logging_steps=5000,
    # NOTE(review): 'accuracy' must be a key produced by the metrics callback — confirm.
    metric_for_best_model = 'accuracy',
)

In [None]:
#building the TensorFlow GPT2 language model inside the distribution strategy
#scope provided by the training arguments
with training_args.strategy.scope():
    model = TFGPT2LMHeadModel.from_pretrained(
        'gpt2',
        from_pt=True,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )
    # the tokenizer was created with custom '<EOS>' / '<P>' tokens, so the embedding
    # matrix is resized to the tokenizer's current vocabulary size
    model.resize_token_embeddings(len(tokenizer))

In [None]:
#for logging results
# Switches the transformers library logger to INFO level; since this cell runs after
# the model was loaded, it only affects the cells that follow (e.g. training).
transformers.logging.set_verbosity_info()

In [None]:
#metric callback for the trainer: it is passed as compute_metrics below but was never
#defined in any earlier cell, which made this cell fail with a NameError
def compute_metric(eval_pred):
    """Compute next-token accuracy for a causal language model.

    Args:
        eval_pred: object exposing `predictions` (the logits, or a tuple/list
            whose first item is the logits) and `label_ids`.
            Assumes logits are (examples, seq_len, vocab) and label_ids are
            (examples, seq_len) — TODO confirm against the trainer's output.

    Returns:
        dict with the single key 'accuracy': the fraction of positions whose
        predicted token equals the label, ignoring labels equal to -100
        (the value create_labels uses for padding). 0.0 when nothing is left
        to score.
    """
    logits = eval_pred.predictions
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    predicted_ids = np.argmax(np.asarray(logits), axis=-1)
    label_ids = np.asarray(eval_pred.label_ids)
    # causal LM: the logits at position t predict the token at position t+1,
    # so predictions are compared with the labels shifted by one
    predicted_ids = predicted_ids[..., :-1]
    targets = label_ids[..., 1:]
    scored = targets != -100
    if not scored.any():
        return {'accuracy': 0.0}
    return {'accuracy': float((predicted_ids[scored] == targets[scored]).mean())}

#initializing the trainer API for training GPT2
trainer=TFTrainer(model=model,
                  args=training_args,
                  train_dataset=train_dataset,
                  eval_dataset=test_dataset,
                  compute_metrics=compute_metric)
#initiating training
trainer.train()

In [None]:
#finishing the monitoring and outputting the metrics
# Ends the Weights & Biases run; the training arguments were created with report_to='wandb'.
wandb.finish()

In [None]:
#dir where trained model will be saved for text generation
# NOTE(review): the leading '/' makes this an absolute path at the filesystem root, unlike
# the relative 'results' and './logs' used for training — confirm this location is intended
# and writable in the target environment.
output_dir = '/saved_model/'

In [None]:
# save model and model configs for future use
model.save_pretrained(output_dir)
# save tokenizer for future use
# The tokenizer is written to the same directory as the model so both can be reloaded together.
# NOTE(review): legacy_format=True presumably keeps the slow tokenizer's separate
# vocabulary files — confirm this is the format the text-generation code expects.
tokenizer.save_pretrained(output_dir,legacy_format=True)