In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, RandomSampler
from torch.optim import AdamW

from torch.utils.data import Subset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config, get_linear_schedule_with_warmup

import os, csv
import pandas as pd

from sklearn import preprocessing 
from sklearn.preprocessing import LabelEncoder

#model hyperparameters
device = torch.device('cuda:2')
learning_rate = 1e-4
eps = 1e-8
warmup_steps = 50

max_len = 512

batch_size = 4
epochs = 40

2023-11-30 18:39:17.098104: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
path1 = "archive/topics"
path2 = "archive/forms"

def remove_ending(text):
    lines = text.split('\n')
    filtered_lines = [line for line in lines if not line.strip().startswith(("Copyright", "©", "copyright"))]
    cleaned_text = '\n'.join(filtered_lines)
    return cleaned_text

def find_topics_forms(path, name):
    list_of_dict_of_poems = []
    for root, _, files in os.walk(path):
        for text in files:
            fullpath = root + "/" + text
            topic_or_form = os.path.basename(root)
            with open(fullpath, "r") as myfile:
                poem = myfile.read()
                poem = remove_ending(poem)
                #poem = poem.replace("\n", " <newline> ")
                title = text
                dictionary = {"title" : title , "poem" : poem, name : topic_or_form}
                list_of_dict_of_poems.append(dictionary)
    
    return list_of_dict_of_poems

poem_topic_dictionary = find_topics_forms(path1, "topic")
poem_form_dictionary = find_topics_forms(path2, "form")
print(len(poem_topic_dictionary))

1854


In [3]:
def create_data_table(dictionary, name):
    column_names=["title","poem", name]
    
    df = pd.DataFrame(dictionary, columns=column_names)
    filepath = name + "s_dataset.csv"
    file = df.to_csv(filepath, index=False)
    read_file = pd.read_csv(filepath)

def compare_tables(forms, topics):
    forms_f = pd.read_csv(forms)
    topics_f = pd.read_csv(topics)
    poems_in_forms = forms_f['poem'].tolist()
    poems_in_topics = topics_f['poem'].tolist()
    for poem in poems_in_forms:
        if poem in topics_f['poem'].values:
            topic_value = topics_f[topics_f['poem'] == poem]['topic'].values[0]
            forms_f.loc[forms_f['poem'] == poem, 'topic'] = topic_value
    poems_to_add = [poem for poem in poems_in_topics if poem not in poems_in_forms]
    rows_to_add = topics_f[topics_f['poem'].isin(poems_to_add)]
    merged_df = pd.concat([forms_f, rows_to_add], ignore_index=True)
    merged_df.to_csv('merged_dataset.csv', index=False)

create_data_table(poem_form_dictionary, "form")
create_data_table(poem_topic_dictionary, "topic")
compare_tables('forms_dataset.csv', 'topics_dataset.csv')

In [4]:
df = pd.read_csv("topics_dataset.csv")
df

Unnamed: 0,title,poem,topic
0,AlonePoems06HaikuAlonePoembyjohntiongchunghoo.txt,loneliness\nhis childhood\nwarms him up\nlonel...,alone
1,AlonePoems1999AlonePoembyDeanaRode.txt,Sit in a chair\nalone\nPeople I thought were\n...,alone
2,AlonePoems2006AlonePoembyDeanaRode.txt,Tell me why it has to be this way\nwhy I must ...,alone
3,AlonePoemsADebtorToMercyAlonePoembyAugustusMon...,"A debtor to mercy alone, of covenant mercy I s...",alone
4,AlonePoemsAgainLeftAlonePoembyRaviSathasivam.txt,A year has been passed since I left home\nAgai...,alone
...,...,...,...
1849,ChildrenPoemsMyShadowChildrenPoembyCJHeck.txt,I have a shadow hooked to me.\nSometimes he's ...,children
1850,ChildrenPoemsMyToothChildrenPoembyCJHeck.txt,The toothfairy took my tooth lath night\nand l...,children
1851,ChildrenPoemsNightNightChildrenPoembyCJHeck.txt,Night-night moon\nNight-night stars\nNight-nig...,children
1852,ChildrenPoemsOdeToBeingFiveChildrenPoembyCJHec...,I made a paper Valentine\nall red and edged wi...,children


In [5]:
def split_dataset(data_dictionary):
    
    df = pd.DataFrame(data_dictionary)
    filename = "dataset.csv"
    file = df.to_csv(filename, index=False)
    read_filename = pd.read_csv(filename)

    
    print("total poems:",len(read_filename))
    randomized_data = read_filename.sample(frac=1, random_state=42)
    print("total randomized poems:",len(randomized_data))

    train = int(len(randomized_data) * 0.8)

    train_data = randomized_data[:train]
    print("total training set:",len(train_data))

    test_data = randomized_data[train:]
    print("total testing set:",len(test_data))

    train_filepath = "train_dataset.csv"
    test_filepath = "test_dataset.csv"
    train_data.to_csv(train_filepath, index=False)
    test_data.to_csv(test_filepath, index=False)

In [6]:
split_dataset(data_dictionary=poem_topic_dictionary)

df = pd.read_csv("train_dataset.csv")

total poems: 1854
total randomized poems: 1854
total training set: 1483
total testing set: 371


In [7]:
df = pd.read_csv("train_dataset.csv")
df

Unnamed: 0,title,poem,topic
0,AngelPoemsBoyAndTheAngelThePoembyRobertBrownin...,"Morning, evening, noon and night,\n``Praise Go...",angel
1,AnimalPoemsEarthlyAnimalPoembyHasmukhAmathalal...,It is beautiful wish\nThat finishes\nWith welf...,animal
2,ButterflyPoemsButterflyPoembyMaryWismer.txt,Went down to the garden\nand see a Caterppilar...,butterfly
3,ChildhoodPoemsAChildhoodFriendComesVisitingHai...,"Hear someone giggle,\nPeer out, there is no on...",childhood
4,AnimalPoemsAnimalCrackersPoembyChristineKerr.txt,"Can you enjoy a meal,\nIf it ain't your Cat.\n...",animal
...,...,...,...
1478,AlonePoemsAloneICannotBePoembyEmilyDickinson.txt,"298\nAlone, I cannot be—\nFor Hosts—do visit m...",alone
1479,BirthPoemsSonnetOnTheMorningOfChristsBirthPoem...,This starry dawn – the wise men yet afar –\nth...,birth
1480,ChildrenPoemsGladChildrenPoembyCJHeck.txt,Being glad is simple\nand sometimes glad is gr...,children
1481,BabyPoemsCarolineJustTheirBabyGirlPoembyScarle...,hello world\nyou may not know me\nbut I know y...,baby


In [8]:
def encode_labels(label_set):
    label_encoder = preprocessing.LabelEncoder()
    input_classes = label_set
    label_encoder.fit(input_classes)
    int_labels = label_encoder.transform(input_classes)
    return int_labels

In [9]:
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
special_tokens_dict = {
    'bos_token': '<BOS>', 
    'eos_token': '<EOS>', 
    'pad_token': '<PAD>'}
num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)

In [10]:
class GPT2_Dataset(Dataset):
    def __init__(self, file, tokenizer, max_length = max_len):
        self.file = file
        self.tokenizer = tokenizer
        
        self.data = pd.read_csv(file)
        self.poems = [poem for poem in self.data['poem'].tolist() if len(poem) <= 300]
        self.labels = self.data['topic'] if 'topic' in self.data else None

        self.input_ids = []
        self.attention_masks = []
        for poem in self.poems:
            encodings_dict = tokenizer('<BOS>' + poem + '<EOS>',
                                         truncation=True,
                                         max_length=max_length,
                                         padding='max_length')
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attention_masks.append(torch.tensor(encodings_dict['attention_mask']))

        self.int_labels = encode_labels(self.labels)
        
    def __len__(self):
         return len(self.input_ids)
    
    def __getitem__(self, idx):
        return self.input_ids[idx], self.attention_masks[idx], self.int_labels[idx]

In [11]:
train_dataset = GPT2_Dataset("train_dataset.csv", tokenizer=tokenizer)
test_dataset = GPT2_Dataset("test_dataset.csv", tokenizer=tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

#dataloader = DataLoader(topics_dataset, sampler=RandomSampler(topics_dataset), batch_size=batch_size)

In [12]:
print('pad_index =', tokenizer.pad_token_id)
for i, batch in enumerate(train_dataloader):
    print(len(train_dataloader))
    input_ids = batch[0]
    attention_mask = batch[1]
    labels = batch[2]
    print(batch)
    break

pad_index = 50259
58
[tensor([[50257,    67,  2265,  ..., 50259, 50259, 50259],
        [50257,  2514,   307,  ..., 50259, 50259, 50259],
        [50257, 39079,   510,  ..., 50259, 50259, 50259],
        [50257, 48787,   262,  ..., 50259, 50259, 50259]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([ 7,  4,  9, 18])]


In [13]:
#for poem in topics_dataset.poems:
#   print(len(poem))
len(train_dataset)

229

In [14]:
model = GPT2LMHeadModel.from_pretrained('gpt2')
model = model.to(device)

In [15]:
configuration = GPT2Config(vocab_size=len(tokenizer)).from_pretrained('gpt2', output_hidden_states=True)
poem_model = GPT2LMHeadModel.from_pretrained('gpt2', config=configuration)
poem_model.resize_token_embeddings(len(tokenizer))

Embedding(50260, 768)

In [16]:
optimizer = AdamW(poem_model.parameters(), lr=learning_rate, eps=eps)

In [17]:
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

In [18]:
warmup_steps = 50
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=warmup_steps,
                                            num_training_steps=total_steps)

In [19]:
#before training
if tokenizer.pad_token is None:
           tokenizer.pad_token = tokenizer.eos_token
raw_model =  GPT2LMHeadModel.from_pretrained('gpt2')
prompt = "butterfly"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
outputs = raw_model.generate(input_ids, pad_token_id=tokenizer.pad_token_id, do_sample=True, max_length=30)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

["butterfly\n\nChrysler's Jet is an upmarket car from a small, locally owned company that's already producing pretty good cars:"]

In [20]:
model = poem_model.to(device)

#model = nn.DataParallel(model)

for epoch in range(epochs):
    print(f'Epoch {epoch + 1} of {epochs}')
    total_loss = 0
    model.train()
    for i, batch in enumerate(train_dataloader): 
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)
        
        model.zero_grad()   
        
        outputs = model(input_ids, 
                        attention_mask=attention_mask,
                        token_type_ids=None)
        #print(outputs[0])
        loss = outputs[0]
        mean_loss = torch.mean(loss)
        #print(f"mean loss: {mean_loss}")
        batch_loss = loss.detach().cpu().numpy()
        total_loss += batch_loss
        
        loss.sum().backward()
        optimizer.step()
        scheduler.step()
        
        optimizer.zero_grad()

Epoch 1 of 40
Epoch 2 of 40
Epoch 3 of 40
Epoch 4 of 40
Epoch 5 of 40
Epoch 6 of 40
Epoch 7 of 40
Epoch 8 of 40
Epoch 9 of 40
Epoch 10 of 40
Epoch 11 of 40
Epoch 12 of 40
Epoch 13 of 40
Epoch 14 of 40
Epoch 15 of 40
Epoch 16 of 40
Epoch 17 of 40
Epoch 18 of 40
Epoch 19 of 40
Epoch 20 of 40
Epoch 21 of 40
Epoch 22 of 40
Epoch 23 of 40
Epoch 24 of 40
Epoch 25 of 40
Epoch 26 of 40
Epoch 27 of 40
Epoch 28 of 40
Epoch 29 of 40
Epoch 30 of 40
Epoch 31 of 40
Epoch 32 of 40
Epoch 33 of 40
Epoch 34 of 40
Epoch 35 of 40
Epoch 36 of 40
Epoch 37 of 40
Epoch 38 of 40
Epoch 39 of 40
Epoch 40 of 40


In [21]:
#after fine-tuning
model.eval()
prompt = "children"
if tokenizer.pad_token is None:
           tokenizer.pad_token = tokenizer.eos_token

input_ids = tokenizer(prompt, return_tensors="pt").input_ids
outputs = model.generate(input_ids.to(device), pad_token_id=tokenizer.pad_token_id, do_sample=True, max_length=30)
tokenizer.batch_decode(outputs, skip_special_tokens=True)



['children']