In [1]:
import torch
import torch.nn as nn
from torchtext.data import Field, TabularDataset, BucketIterator
from torch.utils.data import Dataset, DataLoader, RandomSampler
from torch.optim import AdamW

from torch.nn.utils.rnn import pad_sequence
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config, get_linear_schedule_with_warmup

import os, csv
import pandas as pd

from sklearn import preprocessing 
from sklearn.preprocessing import LabelEncoder

#model hyperparameters
device = torch.device('cuda:2')
learning_rate = 1e-4
eps = 1e-8
warmup_steps = 50

max_len = 512

batch_size = 5
epochs = 40

2023-11-01 21:44:11.124769: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
path1 = "archive/topics"
path2 = "archive/forms"

def remove_ending(text):
    lines = text.split('\n')
    filtered_lines = [line for line in lines if not line.strip().startswith(("Copyright", "©", "copyright"))]
    cleaned_text = '\n'.join(filtered_lines)
    return cleaned_text

def find_topics_forms(path, name):
    list_of_dict_of_poems = []
    for root, _, files in os.walk(path):
        for text in files:
            fullpath = root + "/" + text
            topic_or_form = os.path.basename(root)
            with open(fullpath, "r") as myfile:
                poem = myfile.read()
                poem = remove_ending(poem)
                #poem = poem.replace("\n", " <newline> ")
                title = text
                dictionary = {"title" : title , "poem" : poem, name : topic_or_form}
                list_of_dict_of_poems.append(dictionary)
    
    return list_of_dict_of_poems

poem_topic_dictionary = find_topics_forms(path1, "topic")
poem_form_dictionary = find_topics_forms(path2, "form")
print(len(poem_form_dictionary))

6322


In [3]:
def create_data_table(dictionary, name):
    column_names=["title","poem", name]
    
    df = pd.DataFrame(dictionary, columns=column_names)
    filepath = name + "s_dataset.csv"
    file = df.to_csv(filepath, index=False)
    read_file = pd.read_csv(filepath)

def compare_tables(forms, topics):
    forms_f = pd.read_csv(forms)
    topics_f = pd.read_csv(topics)
    poems_in_forms = forms_f['poem'].tolist()
    poems_in_topics = topics_f['poem'].tolist()
    for poem in poems_in_forms:
        if poem in topics_f['poem'].values:
            topic_value = topics_f[topics_f['poem'] == poem]['topic'].values[0]
            forms_f.loc[forms_f['poem'] == poem, 'topic'] = topic_value
    poems_to_add = [poem for poem in poems_in_topics if poem not in poems_in_forms]
    rows_to_add = topics_f[topics_f['poem'].isin(poems_to_add)]
    merged_df = pd.concat([forms_f, rows_to_add], ignore_index=True)
    merged_df.to_csv('merged_dataset.csv', index=False)

create_data_table(poem_form_dictionary, "form")
create_data_table(poem_topic_dictionary, "topic")
compare_tables('forms_dataset.csv', 'topics_dataset.csv')

In [4]:
df = pd.read_csv("merged_dataset.csv")
df

Unnamed: 0,title,poem,form,topic
0,AbcPoems2AbcHkAndChinaV2Cauchy3Poembycheungshu...,2 ABC of H.k. and China revised vision.\nBarre...,abc,
1,AbcPoems887LiveWithLoveAnAbcPoemPoembyMelvinaG...,"Apparently life without love, is no life at al...",abc,
2,AbcPoemsAAbcAnglesOnAngelsPoemByCauchy3Poembyc...,A abc angles on angels flaws (poem)\nMix with ...,abc,
3,AbcPoemsAAbcBrazilDancePoemByCauchy3Poembycheu...,A abc Brazil dance (poem)\nJack of crack in po...,abc,
4,AbcPoemsAbc123PoembyGabriellaFranco.txt,ABC... I can't go on\n123... what's the next o...,abc,
...,...,...,...,...
8133,ChildrenPoemsMyImaginaryFriendChildrenPoembyCJ...,"My friend, Sherla, was hard to see\nfor Mom an...",,children
8134,ChildrenPoemsMyShadowChildrenPoembyCJHeck.txt,I have a shadow hooked to me.\nSometimes he's ...,,children
8135,ChildrenPoemsMyToothChildrenPoembyCJHeck.txt,The toothfairy took my tooth lath night\nand l...,,children
8136,ChildrenPoemsNightNightChildrenPoembyCJHeck.txt,Night-night moon\nNight-night stars\nNight-nig...,,children


In [5]:
def encode_labels(label_set):
    label_encoder = preprocessing.LabelEncoder()
    input_classes = label_set
    label_encoder.fit(input_classes)
    int_labels = label_encoder.transform(input_classes)
    return int_labels

In [6]:
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
special_tokens_dict = {
    'bos_token': '<BOS>', 
    'eos_token': '<EOS>', 
    'pad_token': '<PAD>'}
num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)

In [18]:
class GPT2_Dataset(Dataset):
    def __init__(self, file, tokenizer, max_length = max_len):
        self.file = file
        self.tokenizer = tokenizer
        
        self.data = pd.read_csv(file)
        self.poems = [poem for poem in self.data['poem'].tolist() if len(poem) <= 300]
        self.labels = self.data['topic'] if 'topic' in self.data else None

        self.input_ids = []
        self.attention_masks = []
        for poem in self.poems:
            encodings_dict = tokenizer('<BOS>' + poem + '<EOS>',
                                         truncation=True,
                                         max_length=max_length,
                                         padding='max_length')
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attention_masks.append(torch.tensor(encodings_dict['attention_mask']))

        self.int_labels = encode_labels(self.labels)
        
    def __len__(self):
         return len(self.input_ids)
    
    def __getitem__(self, idx):
        return self.input_ids[idx], self.attention_masks[idx], self.int_labels[idx]

In [19]:
topics_dataset = GPT2_Dataset("topics_dataset.csv", tokenizer=tokenizer)
dataloader = DataLoader(topics_dataset, sampler=RandomSampler(topics_dataset), batch_size=batch_size)

In [20]:
print('pad_index =', tokenizer.pad_token_id)
for i, batch in enumerate(dataloader):
    input_ids = batch[0]
    attention_mask = batch[1]
    labels = batch[2]
    print(batch)
    break

pad_index = 50259
[tensor([[50257,  3629,   521,  ..., 50259, 50259, 50259],
        [50257,    67,  2265,  ..., 50259, 50259, 50259],
        [50257, 26807,   262,  ..., 50259, 50259, 50259],
        [50257,    50,  9417,  ..., 50259, 50259, 50259],
        [50257, 10725,  5357,  ..., 50259, 50259, 50259]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([1, 2, 2, 2, 0])]


In [21]:
#for poem in topics_dataset.poems:
#   print(len(poem))
len(topics_dataset)

295

In [22]:
model = GPT2LMHeadModel.from_pretrained('gpt2')
model = model.to(device)

In [23]:
configuration = GPT2Config(vocab_size=len(tokenizer)).from_pretrained('gpt2', output_hidden_states=True)
poem_model = GPT2LMHeadModel.from_pretrained('gpt2', config=configuration)
poem_model.resize_token_embeddings(len(tokenizer))

Embedding(50260, 768)

In [24]:
optimizer = AdamW(poem_model.parameters(), lr=learning_rate, eps=eps)

In [25]:
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

In [26]:
warmup_steps = 50
total_steps = len(dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=warmup_steps,
                                            num_training_steps=total_steps)

In [27]:
#before training
raw_model =  GPT2LMHeadModel.from_pretrained('gpt2')
prompt = "butterfly"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
outputs = raw_model.generate(input_ids, do_sample=True, max_length=30)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['butterfly-tape-laser", "ItemImage467" : "/wcsstore//wcsstore/null/Set-F15662-10323']

In [28]:
model = poem_model.to(device)

#model = nn.DataParallel(model)

for epoch in range(epochs):
    print(f'Epoch {epoch + 1} of {epochs}')
    total_loss = 0
    model.train()
    for i, batch in enumerate(dataloader): 
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)
        
        model.zero_grad()   
        
        outputs = model(input_ids, 
                        attention_mask=attention_mask,
                        token_type_ids=None)
        #print(outputs[0])
        loss = outputs[0]
        mean_loss = torch.mean(loss)
        #print(f"mean loss: {mean_loss}")
        batch_loss = loss.detach().cpu().numpy()
        total_loss += batch_loss
        
        loss.sum().backward()
        optimizer.step()
        scheduler.step()
        
        optimizer.zero_grad()

Epoch 1 of 40
Epoch 2 of 40
Epoch 3 of 40
Epoch 4 of 40
Epoch 5 of 40
Epoch 6 of 40
Epoch 7 of 40
Epoch 8 of 40
Epoch 9 of 40
Epoch 10 of 40
Epoch 11 of 40
Epoch 12 of 40
Epoch 13 of 40
Epoch 14 of 40
Epoch 15 of 40
Epoch 16 of 40
Epoch 17 of 40
Epoch 18 of 40
Epoch 19 of 40
Epoch 20 of 40
Epoch 21 of 40
Epoch 22 of 40
Epoch 23 of 40
Epoch 24 of 40
Epoch 25 of 40
Epoch 26 of 40
Epoch 27 of 40
Epoch 28 of 40
Epoch 29 of 40
Epoch 30 of 40
Epoch 31 of 40
Epoch 32 of 40
Epoch 33 of 40
Epoch 34 of 40
Epoch 35 of 40
Epoch 36 of 40
Epoch 37 of 40
Epoch 38 of 40
Epoch 39 of 40
Epoch 40 of 40


In [29]:
#after fine-tuning
model.eval()
prompt = "butterfly"

input_ids = tokenizer(prompt, return_tensors="pt").input_ids
outputs = model.generate(input_ids.to(device), do_sample=True, max_length=30)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['butterfly']