In [8]:
# import tensorflow as tf
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
from string import punctuation
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
import torch
print(torch.cuda.is_available())
torch.cuda.get_device_name(0)

ModuleNotFoundError: No module named 'numpy'

In [15]:
from transformers import GPT2Tokenizer, GPT2Model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')
# if torch.cuda.is_available():
#     model = model.cuda()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)

In [16]:
from sklearn.model_selection import train_test_split

# Read the data from the text file
with open('drake-lyrics.txt', 'r') as file:
    data = file.read().splitlines()


train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)


# To see the result
print(f"Training set size: {len(train_data)}")
print(f"Test set size: {len(test_data)}")

# Optionally, you can save these to new files
with open('train_lyrics.txt', 'w') as file:
    file.write('\n'.join(train_data))

with open('test_lyrics.txt', 'w') as file:
    file.write('\n'.join(test_data))

Training set size: 6240
Test set size: 1561


In [17]:
with open('train_lyrics.txt', 'w') as file:
    for line in train_data:
        file.write("%s\n" % line)

with open('test_lyrics.txt', 'w') as file:
    for line in test_data:
        file.write("%s\n" % line)

print("Training and test data have been saved to 'train_lyrics.txt' and 'test_lyrics.txt', respectively.")


Training and test data have been saved to 'train_lyrics.txt' and 'test_lyrics.txt', respectively.


In [20]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

def load_dataset(train_path, test_path, tokenizer):
    train_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=train_path,
        block_size=128
    )

    test_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=test_path,
        block_size=128
    )

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset, test_dataset, data_collator

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

train_path = 'train_lyrics.txt'
test_path = 'test_lyrics.txt'

train_dataset, test_dataset, data_collator = load_dataset(train_path, test_path, tokenizer)

training_args = TrainingArguments(
    output_dir="./gpt2-lyrics",
    overwrite_output_dir=True,
    num_train_epochs=300, #
    per_device_train_batch_size=12, # Adjust based on your GPU memory
    per_device_eval_batch_size=8,
    eval_steps=100,
    save_steps=800,
    warmup_steps=500,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

In [21]:
trainer.train()

                                                     
  1%|          | 132/12900 [1:00:06<9:01:27,  2.54s/it]

{'loss': 3.8867, 'grad_norm': 10.524933815002441, 'learning_rate': 5e-05, 'epoch': 11.63}


                                                       
  1%|          | 132/12900 [1:54:15<9:01:27,  2.54s/it]  

{'loss': 2.5482, 'grad_norm': 6.154247760772705, 'learning_rate': 4.7983870967741937e-05, 'epoch': 23.26}


                                                       
  1%|          | 132/12900 [2:48:24<9:01:27,  2.54s/it]  

{'loss': 1.3685, 'grad_norm': 3.7633893489837646, 'learning_rate': 4.596774193548387e-05, 'epoch': 34.88}


                                                       
  1%|          | 132/12900 [3:42:31<9:01:27,  2.54s/it]  

{'loss': 0.7909, 'grad_norm': 2.8142669200897217, 'learning_rate': 4.395161290322581e-05, 'epoch': 46.51}


                                                       
  1%|          | 132/12900 [4:36:46<9:01:27,  2.54s/it]  

{'loss': 0.4268, 'grad_norm': 1.907772183418274, 'learning_rate': 4.1935483870967746e-05, 'epoch': 58.14}


                                                       
  1%|          | 132/12900 [5:30:53<9:01:27,  2.54s/it]  

{'loss': 0.1961, 'grad_norm': 1.4882735013961792, 'learning_rate': 3.991935483870968e-05, 'epoch': 69.77}


                                                       
  1%|          | 132/12900 [6:25:00<9:01:27,  2.54s/it]  

{'loss': 0.1105, 'grad_norm': 1.2129045724868774, 'learning_rate': 3.7903225806451614e-05, 'epoch': 81.4}


                                                       
  1%|          | 132/12900 [7:19:05<9:01:27,  2.54s/it]  

{'loss': 0.0764, 'grad_norm': 0.763961911201477, 'learning_rate': 3.5887096774193555e-05, 'epoch': 93.02}


                                                       
  1%|          | 132/12900 [8:13:14<9:01:27,  2.54s/it]  

{'loss': 0.0591, 'grad_norm': 0.7951975464820862, 'learning_rate': 3.387096774193548e-05, 'epoch': 104.65}


                                                       
  1%|          | 132/12900 [9:07:22<9:01:27,  2.54s/it]  

{'loss': 0.0488, 'grad_norm': 0.7560718059539795, 'learning_rate': 3.185483870967742e-05, 'epoch': 116.28}


                                                       
  1%|          | 132/12900 [10:01:29<9:01:27,  2.54s/it] 

{'loss': 0.0433, 'grad_norm': 0.6641001105308533, 'learning_rate': 2.9838709677419357e-05, 'epoch': 127.91}


                                                        
  1%|          | 132/12900 [10:55:36<9:01:27,  2.54s/it]  

{'loss': 0.038, 'grad_norm': 0.5545201301574707, 'learning_rate': 2.7822580645161288e-05, 'epoch': 139.53}


                                                        
  1%|          | 132/12900 [11:49:44<9:01:27,  2.54s/it]  

{'loss': 0.0356, 'grad_norm': 0.5779292583465576, 'learning_rate': 2.5806451612903226e-05, 'epoch': 151.16}




KeyboardInterrupt: 

In [22]:
# After training is complete
model.save_pretrained('./gpt2-lyrics')
tokenizer.save_pretrained('./gpt2-lyrics')

('./gpt2-lyrics\\tokenizer_config.json',
 './gpt2-lyrics\\special_tokens_map.json',
 './gpt2-lyrics\\vocab.json',
 './gpt2-lyrics\\merges.txt',
 './gpt2-lyrics\\added_tokens.json')

In [6]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# model_path = './gpt2-lyrics'
model_path = './gpt2-lyrics'  # Updated path to the most recent checkpoint


# Load pre-trained model tokenizer (vocabulary) and model
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# Encode text input to tensor
prompt = "some hot head"
input_ids = tokenizer.encode(prompt, return_tensors='pt')

# Generate text
# Note: You might want to adjust the generation parameters like max_length, num_beams, etc., based on your needs.
max_length = 100
# Ensure model is in evaluation mode for inference
model.eval()

# Move model to GPU if available
if torch.cuda.is_available():
    model = model.cuda()
    input_ids = input_ids.cuda()

with torch.no_grad():
    output = model.generate(input_ids, max_length=max_length)

# Decode text output
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(output[0])


ModuleNotFoundError: No module named 'torch'

In [8]:
print(input_ids)

tensor([[11246,  3024,  1182]], device='cuda:0')


In [14]:
print(output.shape[1])

100


In [41]:
import re

output_tokens = tokenizer.convert_ids_to_tokens(output[0].tolist(), skip_special_tokens=True)

pattern = re.compile(r'^[^a-zA-Z0-9]')

cleaned_output_tokens = [pattern.sub('', token) for token in output_tokens]

['some', 'hot', 'head', 'for', 'someone', 'else', '', 'I', 'say', '', '"', 'fuck', 'that', 'nig', 'ga', 'that', 'you', 'think', 'you', 'found', '', 'G', 'odd', 'amn', '', 'we', 'ain', 't', 'even', 'gotta', 'scam', '', 'N', 'aked', 'women', 'swimming', 'that', 's', 'just', 'how', 'I', 'm', 'living', '', 'I', 'mean', 'for', 'tonight', 'ho', '', 'Every', 'single', 'show', 'she', 'out', 'there', 're', 'pp', 'in', '', 'like', 'a', 'mascot', '', 'You', 'may', 'not', 'hear', 'from', 'them', 'ever', 'again', '', 'No', 'I', 'do', 'not', 'have', 'a', 'ch', 'aper', 'one', '', 'Talking', 'bullshit', 'as', 'if', 'it', 'was', 'for', 'you', 'to', 'know', '', 'who', 'told', 'the', 'cops', 'where', 'the']


In [51]:
import nltk
from nltk.translate.bleu_score import sentence_bleu

# Ensure you have the necessary NLTK data
nltk.download('punkt')

# Load tokenizer for GPT-2
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

def file_to_token_lists_and_decode(file_path):
    token_lists = []  # This will store the lists of tokens
    decoded_texts = []  # This will store the decoded texts for verification
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()  # Remove any extraneous whitespace
            if line:  # Check if line is not empty
                tokens = tokenizer.encode(line)
                decoded_text = tokenizer.decode(tokens)
                token_lists.append(tokens)
                decoded_texts.append(decoded_text.split())  # Split into words for BLEU computation
    return token_lists, decoded_texts

def compute_bleu(model_output, reference_texts):
    # Assume model_output is a single string of generated text
    model_output_tokens = model_output.split()  # Split into words
    # Calculate BLEU score
    bleu_score = sentence_bleu(reference_texts, model_output_tokens)
    return bleu_score

# Example usage
file_path = 'drake-lyrics.txt'
token_lists, decoded_texts = file_to_token_lists_and_decode(file_path)

# Example model output
model_output = generated_text

# Compute BLEU score
bleu_score = compute_bleu(model_output, decoded_texts)
print("BLEU Score:", bleu_score)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Max\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


TypeError: Fraction.__new__() got an unexpected keyword argument '_normalize'

ImportError: cannot import name '__version__' from 'os' (c:\Users\Max\anaconda3\envs\aiTorch\Lib\os.py)

In [12]:
len(generated_text.split())

76

In [8]:
# from transformers import pipeline

# model_path = './gpt2-lyrics'

# # Initialize the pipeline with your model
# lyrics_generator = pipeline('text-generation', model=model_path, tokenizer=model_path)

# # Generate text
# prompt = "I need a one dance! "
# results = lyrics_generator(prompt, max_length=50)[0]['generated_text']

# print(results)

RuntimeError: Failed to import transformers.models.gpt2.modeling_tf_gpt2 because of the following error (look up to see its traceback):
Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.

In [7]:
emb = model.transformer.wte.weight.data.shape

torch.Size([50257, 768])


In [20]:
model.transformer.wte(input_ids).squeeze(0).shape

torch.Size([3, 768])

In [24]:
model.transformer.wte(output[input_ids.shape[1]:]).squeeze(0).shape

torch.Size([0, 100, 768])

In [22]:
input_ids

tensor([[11246,  3024,  1182]], device='cuda:0')

In [28]:
output.shape[:, input_ids.shape[1]:]

TypeError: tuple indices must be integers or slices, not tuple

In [26]:
input_ids.shape

torch.Size([1, 3])