Reading and Preprocessing the Data

In [1]:
import pandas as pd

videos_df = pd.read_csv('videos-stats.csv')
comments_df = pd.read_csv('comments.csv')


In [2]:
merged_df = pd.merge(videos_df, comments_df, on='Video ID')

In [3]:
import re

merged_df['Comment'] = merged_df['Comment'].astype(str).str.replace(r'<.*?>', '', regex=True)
merged_df['Comment'] = merged_df['Comment'].astype(str).str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)
merged_df['Comment'] = merged_df['Comment'].astype(str).str.lower()

merged_df = merged_df.drop_duplicates(subset=['Comment'])
merged_df = merged_df[merged_df['Comment'].astype(str).str.len() > 5]

# Taking 100 comments for training because the model's training phase is too long
merged_df = merged_df.iloc[:100]

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=5000, min_df=5, max_df=0.7)


X = vectorizer.fit_transform(merged_df['Comment'])


Generating an example output with pre-trained GPT-2

In [5]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer


tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

input_text = "I hope that"
input_ids = tokenizer.encode(input_text, return_tensors='pt')
output = model.generate(input_ids, max_length=50, num_return_sequences=1)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(generated_text)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


I hope that you will join us in our efforts to bring about a new era of peace and prosperity for all of us."

The event was held at the University of California, Berkeley, where the event was held.

The event was


In [6]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Padding token
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model = GPT2LMHeadModel.from_pretrained('gpt2')

special_tokens_dict = {'bos_token': '<BOS>', 'eos_token': '<EOS>', 'pad_token': '[PAD]'}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))  


# Split into train and validation sets
train_df, val_df = train_test_split(merged_df, test_size=0.2, random_state=42)

# Convert texts into numbers with tokenizer
train_encodings = tokenizer(train_df['Comment'].tolist(), truncation=True, padding=True)
val_encodings = tokenizer(val_df['Comment'].tolist(), truncation=True, padding=True)


In [13]:
import torch

class CommentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Add the labels (same as input_ids for language modeling)
        item['labels'] = item['input_ids']
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])


train_dataset = CommentDataset(train_encodings)
val_dataset = CommentDataset(val_encodings)


In [14]:
!pip install transformers[torch]



In [15]:
!pip install accelerate -U



Training the GPT-2 with 100 comments

In [17]:
from transformers import GPT2LMHeadModel, Trainer, TrainingArguments, DefaultFlowCallback, TrainerCallback

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    gradient_checkpointing=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[DefaultFlowCallback], 
)


trainer.train()

You are adding a <class 'transformers.trainer_callback.DefaultFlowCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
TensorBoardCallback


Epoch,Training Loss,Validation Loss
1,No log,9.524786
2,No log,4.40402
3,No log,3.596179




TrainOutput(global_step=30, training_loss=7.278282674153646, metrics={'train_runtime': 1915.2687, 'train_samples_per_second': 0.125, 'train_steps_per_second': 0.016, 'total_flos': 23026360320000.0, 'train_loss': 7.278282674153646, 'epoch': 3.0})

Load the trained model

In [25]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer.save_pretrained("results/checkpoint-30")

tokenizer = GPT2Tokenizer.from_pretrained(r"results/checkpoint-30")
model = GPT2LMHeadModel.from_pretrained(r"results/checkpoint-30")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Generate meaningful and well learned(?) outputs from trained model

In [26]:

input_text = "I hope that"

input_ids = tokenizer.encode(input_text, return_tensors='pt')

output_sequences = model.generate(
    input_ids=input_ids,
    max_length=100, 
    num_return_sequences=3,
    no_repeat_ngram_size=2,
    temperature=0.7, 
    top_k=50,
    top_p=0.95, 
    do_sample=True, 
)

for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
    print("Generated sequence", generated_sequence_idx + 1, ":")
    generated_text = tokenizer.decode(generated_sequence, skip_special_tokens=True)
    print(generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated sequence 1 :
I hope that
 the the a the an a
 the
 athe

the aa
The a an
A aThe
IThea a TheaTheA
THE_S
TH
N
R
M
S,M
Generated sequence 2 :
I hope that the thethethe the, theThesons of thes,The the Theson theIth.
Generated sequence 3 :
I hope that $s.

4. The sine of the, the sinity ofthe,
 sini-
sineofthe.


Failure :(