In [2]:
from transformers import XLNetTokenizer, XLNetLMHeadModel

model_name = 'xlnet-base-cased'  # You can choose a different model size based on your needs

tokenizer = XLNetTokenizer.from_pretrained(model_name)
model = XLNetLMHeadModel.from_pretrained(model_name)


In [4]:
import torch
print(torch.cuda.is_available())
torch.cuda.get_device_name(0)

True


'NVIDIA GeForce RTX 3050 Laptop GPU'

In [7]:
import re
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import XLNetTokenizer, XLNetLMHeadModel

class LyricsDataset(Dataset):
    def __init__(self, tokenizer, filename, block_size=128):
        self.tokenizer = tokenizer
        self.block_size = block_size
        self.examples = []
        
        with open(filename, encoding="utf-8") as f:
            for line in f.read().splitlines():
                if len(line) > 0 and not line.isspace():
                    # Remove punctuation and special characters
                    cleaned_line = re.sub(r'[^A-Za-z0-9\s]+', '', line.lower())
                    tokenized_line = tokenizer.encode(cleaned_line, add_special_tokens=True,max_length=self.block_size,padding='max_length')
                    for i in range(0, len(tokenized_line), block_size):
                        self.examples.append(tokenized_line[i:i+block_size])

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return torch.tensor(self.examples[i], dtype=torch.long)

# Example usage
dataset = LyricsDataset(tokenizer, "drake-lyrics.txt")
data_loader = DataLoader(dataset, batch_size=20, shuffle=True)


In [13]:
from transformers import AdamW
from tqdm import tqdm
import numpy as np

#CAN LOAD BEST MODEL
model.load_state_dict(torch.load('best_model_state_dict.pth'))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-05)

model.train()
best_loss = 0.5606599689194087
for epoch in range(100):
    progress_bar = tqdm(data_loader, desc="Epoch {:1d}".format(epoch), leave=False, disable=False)
    total_loss = 0
    for batch in progress_bar:
        inputs = batch.to(device)
        outputs = model(inputs, labels=inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        total_loss += loss.item()
    average_loss = total_loss / len(data_loader)
    if average_loss < best_loss:                   
        best_loss = average_loss
        torch.save(model.state_dict(), 'best_model_state_dict.pth')  # Save the state_dict to a file
        print(f"New best model saved at epoch {epoch} with loss: {best_loss}")
    print(f"Epoch {epoch}: Average Loss: {average_loss}")

                                                          

New best model saved at epoch 0 with loss: 0.5719664131698502
Epoch 0: Average Loss: 0.5719664131698502


                                                          

New best model saved at epoch 1 with loss: 0.5698214511187296
Epoch 1: Average Loss: 0.5698214511187296


                                                          

New best model saved at epoch 2 with loss: 0.56962698615029
Epoch 2: Average Loss: 0.56962698615029


                                                          

New best model saved at epoch 3 with loss: 0.5686038627099858
Epoch 3: Average Loss: 0.5686038627099858


                                                          

Epoch 4: Average Loss: 0.5686482434319254


                                                          

Epoch 5: Average Loss: 0.5686221803463269


                                                          

New best model saved at epoch 6 with loss: 0.5680861846650211
Epoch 6: Average Loss: 0.5680861846650211


                                                          

New best model saved at epoch 7 with loss: 0.5672528997769263
Epoch 7: Average Loss: 0.5672528997769263


                                                          

New best model saved at epoch 8 with loss: 0.567202661445878
Epoch 8: Average Loss: 0.567202661445878


                                                          

Epoch 9: Average Loss: 0.567256504031609


                                                           

New best model saved at epoch 10 with loss: 0.5669761946772467
Epoch 10: Average Loss: 0.5669761946772467


                                                           

New best model saved at epoch 11 with loss: 0.5665081394083985
Epoch 11: Average Loss: 0.5665081394083985


                                                           

Epoch 12: Average Loss: 0.5671756651076102


                                                           

New best model saved at epoch 13 with loss: 0.5659594774080187
Epoch 13: Average Loss: 0.5659594774080187


                                                           

New best model saved at epoch 14 with loss: 0.5656721594439907
Epoch 14: Average Loss: 0.5656721594439907


                                                           

New best model saved at epoch 15 with loss: 0.5654994080823776
Epoch 15: Average Loss: 0.5654994080823776


                                                           

Epoch 16: Average Loss: 0.5657414580620099


                                                           

New best model saved at epoch 17 with loss: 0.5651421903899785
Epoch 17: Average Loss: 0.5651421903899785


                                                           

New best model saved at epoch 18 with loss: 0.5650768007740669
Epoch 18: Average Loss: 0.5650768007740669


                                                           

New best model saved at epoch 19 with loss: 0.5644555938609132
Epoch 19: Average Loss: 0.5644555938609132


                                                           

New best model saved at epoch 20 with loss: 0.5643298589419522
Epoch 20: Average Loss: 0.5643298589419522


                                                           

New best model saved at epoch 21 with loss: 0.5639347429062995
Epoch 21: Average Loss: 0.5639347429062995


                                                           

Epoch 22: Average Loss: 0.5640960074234805


                                                           

New best model saved at epoch 23 with loss: 0.5636374287120479
Epoch 23: Average Loss: 0.5636374287120479


                                                           

New best model saved at epoch 24 with loss: 0.563217020051393
Epoch 24: Average Loss: 0.563217020051393


                                                           

Epoch 25: Average Loss: 0.5635982479557685


                                                           

New best model saved at epoch 26 with loss: 0.5626015327767078
Epoch 26: Average Loss: 0.5626015327767078


                                                           

Epoch 27: Average Loss: 0.5630496315305279


                                                           

Epoch 28: Average Loss: 0.5628218965444061


                                                           

Epoch 29: Average Loss: 0.5632692993019285


                                                           

Epoch 30: Average Loss: 0.5633524990845523


                                                           

Epoch 31: Average Loss: 0.5632336738215847


                                                           

Epoch 32: Average Loss: 0.5626646705324604


                                                           

New best model saved at epoch 33 with loss: 0.5624514493603561
Epoch 33: Average Loss: 0.5624514493603561


                                                           

New best model saved at epoch 34 with loss: 0.5618464186331024
Epoch 34: Average Loss: 0.5618464186331024


                                                           

New best model saved at epoch 35 with loss: 0.561660393200877
Epoch 35: Average Loss: 0.561660393200877


                                                           

Epoch 36: Average Loss: 0.5619046810111628


                                                           

New best model saved at epoch 37 with loss: 0.5606599689194087
Epoch 37: Average Loss: 0.5606599689194087


                                                           

Epoch 38: Average Loss: 0.5607742987635408


                                                          

KeyboardInterrupt: 

In [15]:
def generate_lyric_continuation(model, tokenizer, prompt, max_length=100, temperature=2.0, top_p=0.92, num_return_sequences=50):
    """
    Generate a continuation for a given lyric prompt using the specified model and tokenizer.

    Parameters:
    - model: The pre-trained language model.
    - tokenizer: The tokenizer corresponding to the model.
    - prompt: The initial string of the lyric to generate the continuation for.
    - max_length: The maximum length of the sequence to generate.
    - temperature: Controls the randomness of the generation. Lower values make the generation more deterministic.
    - top_p: Nucleus sampling parameter, controlling the diversity of the generation.
    - num_return_sequences: The number of sequences to generate.

    Returns:
    A list of generated lyric continuations.
    """
    # Encode the prompt text
    input_ids = tokenizer.encode(prompt, add_special_tokens=True, return_tensors='pt').to(model.device)
    
    # Generate sequences
    output_sequences = model.generate(
        input_ids=input_ids,
        max_length=max_length + len(input_ids[0]),
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
        num_return_sequences=num_return_sequences,
    )
    
    # Decode the generated sequences to strings
    generated_lyrics = [tokenizer.decode(seq, skip_special_tokens=True) for seq in output_sequences]
    
    return generated_lyrics

# Example usage

#CAN LOAD BEST MODEL
model.load_state_dict(torch.load('best_model_state_dict.pth'))

prompt = "I need one dance"
generated_continuations = generate_lyric_continuation(model, tokenizer, prompt, max_length=50, temperature=1.5, num_return_sequences=20)

for i, continuation in enumerate(generated_continuations, 1):
    print(f"Generated Lyric {i}: {continuation}\n")

Generated Lyric 1: I need one dance for im knowi that you so  for just the met you toi in that now on

Generated Lyric 2: I need one dance the all that it allii myss yeah go don

Generated Lyric 3: I need one dance its  this  my love to know get at shit and us up me it im girl

Generated Lyric 4: I need one dance the in im me i up im im the  now the me and for the a with shit on for

Generated Lyric 5: I need one dance is my imi just one and that is at the gets goi thatt on

Generated Lyric 6: I need one dance the likes up all down just in that im on shit this  you down we to shit they go me

Generated Lyric 7: I need one dance the it shit what a and thei the a to the for for the know right me now you they with down

Generated Lyric 8: I need one dance  and they cause up we tos and shitis up is your  and like

Generated Lyric 9: I need one dance we so like the is to  when she this just to and to in on  all on  a

Generated Lyric 10: I need one dance it that itst love with with got im m

In [None]:
import nltk
from nltk.translate.bleu_score import sentence_bleu
import re

pattern = re.compile(r'^[^a-zA-Z0-9]')

def file_to_token_lists_and_decode(file_path):
    decoded_texts = []  # This will store the decoded texts for verification
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()  # Remove any extraneous whitespace
            if line:  # Check if line is not empty
                tokens = tokenizer.encode(line)
                output_tokens = tokenizer.convert_ids_to_tokens(tokens, skip_special_tokens=True)
                decoded = [pattern.sub('', token) for token in output_tokens]
                decoded_texts.append(decoded)  # Split into words for BLEU computation
    return decoded_texts

def compute_bleu(model_output, reference_texts):
    # Assume model_output is a single string of generated text
    model_output_tokens = model_output.split()  # Split into words
    # Calculate BLEU score
    bleu_score = sentence_bleu(reference_texts, model_output_tokens)
    return bleu_score

# Example usage
file_path = 'drake-lyrics.txt'
decoded_texts = file_to_token_lists_and_decode(file_path)

# Compute BLEU score
bleu_score = compute_bleu(generated_text, decoded_texts)
print(f'BLEU Score: {bleu_score}')