In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# General Imports

In [2]:
!pip install GPUtil                   # to check GPU usage
!pip install -q bs4                   # to clean the lyrics
!pip install -q datasets
!pip install transformers             # for training
# Update Transformers from the source if you get errors:
# !pip install git + https://github.com/huggingface/transformers
!pip install tf-models-official       # for optimization
!pip install pyyaml h5py              # to save the models

"""
If you have ongoing issues with the transformers library, Also consider installing from source: 
git clone https://github.com/huggingface/transformers.git 
cd transformers (in Dropbox/C/25_Studium_Master/TUM_RCI/2021_WS/NLP/3-Exercises-Workspace/transformers)
pip install -e
Also update git
"""

Collecting GPUtil
  Downloading GPUtil-1.4.0.tar.gz (5.5 kB)
Building wheels for collected packages: GPUtil
  Building wheel for GPUtil (setup.py) ... [?25l[?25hdone
  Created wheel for GPUtil: filename=GPUtil-1.4.0-py3-none-any.whl size=7411 sha256=950588da641b98eca955157ea90c9a30900ddba2a1642dd9a1aee2576726cc33
  Stored in directory: /root/.cache/pip/wheels/6e/f8/83/534c52482d6da64622ddbf72cd93c35d2ef2881b78fd08ff0c
Successfully built GPUtil
Installing collected packages: GPUtil
Successfully installed GPUtil-1.4.0
[K     |████████████████████████████████| 312 kB 9.7 MB/s 
[K     |████████████████████████████████| 212 kB 88.0 MB/s 
[K     |████████████████████████████████| 134 kB 97.1 MB/s 
[K     |████████████████████████████████| 1.1 MB 80.7 MB/s 
[K     |████████████████████████████████| 67 kB 6.1 MB/s 
[K     |████████████████████████████████| 127 kB 93.4 MB/s 
[K     |████████████████████████████████| 144 kB 77.6 MB/s 
[K     |████████████████████████████████| 94 kB 3.7

'\nIf you have ongoing issues with the transformers library, Also consider installing from source: \ngit clone https://github.com/huggingface/transformers.git \ncd transformers (in Dropbox/C/25_Studium_Master/TUM_RCI/2021_WS/NLP/3-Exercises-Workspace/transformers)\npip install -e\nAlso update git\n'

In [3]:
import tensorflow as tf
from tensorflow import keras
from transformers import AutoConfig, AutoTokenizer, TFAutoModelForCausalLM, default_data_collator, pipeline 
# DefaultDataCollator: for the error https://github.com/huggingface/transformers/pull/5015
from official.nlp import optimization
from datasets import Dataset      # https://huggingface.co/docs/datasets/
from bs4 import BeautifulSoup     # easy webscraping
import requests                   # for sending http requests
import re                         # regular expressions, excellent for string searching/replacing, 
# extremely powerful for manual NLP, check out the documentation at https://docs.python.org/3/library/re.html
import os                         # accessing file paths
# os.environ["CUDA_VISIBLE_DEVICES"]="-1" to deactive GPU
import json
import random
from collections import Counter
import math
import GPUtil
from numba import cuda 

model_name = "gpt2"               # selected pretrained language model from https://huggingface.co/models

# "gtp3" is huge, needed several GPUs for traning, didnt fit in to Colab Pro Machine. 
# I tried distilgpt2 on my machine. But even then I ran in to problems with OOM 
# errors often before switching to Google Colab Pro (10euro/month) for efficiency
# as I only have a GeForce GTX M 860 with 4 GB memory. I didn't try distilgpt2
# here again as I direct could use gpt2.

In [4]:
from tensorflow import keras
GPUtil.showUtilization()

| ID | GPU | MEM |
------------------
|  0 |  0% |  0% |


# Generator <a class="anchor" id="Generator"></a>

In [43]:
# FROM: 10 Finetuning a Language Model with Huggingface - Lyrics Mashup
# CHANGES: Changed the directories to save genres separately 

# loading saved lyrics files

# Utilized commenting to train 6 different models for every genre instead of 
# having 6 notebooks. As I save them after training reloading and generating lyrics 
# in all genres is no issue. 

#with open("/content/drive/MyDrive/EchoCanyon/lyrics/blues_lyrics.json", "r", encoding = "utf8") as f:
#with open("/content/drive/MyDrive/EchoCanyon/lyrics/country_lyrics.json", "r", encoding = "utf8") as f:
#with open("/content/drive/MyDrive/EchoCanyon/lyrics/jazz_lyrics.json", "r", encoding = "utf8") as f:
with open("/content/drive/MyDrive/EchoCanyon/lyrics/metal_lyrics.json", "r", encoding = "utf8") as f:
#with open("/content/drive/MyDrive/EchoCanyon/lyrics/pop_lyrics.json", "r", encoding = "utf8") as f:
#with open("/content/drive/MyDrive/EchoCanyon/lyrics/rock_lyrics.json", "r", encoding = "utf8") as f:

    lyrics_dict = {}
    lyrics_dict = json.load(f)

In [44]:
# FROM: 10 Finetuning a Language Model with Huggingface - Lyrics Mashup
# CHANGES: None

# reformatting lyrics

lyrics = {"text":[], "artist":[]}
for artist, titles in lyrics_dict.items():
    
    for title, text in titles.items():
        
        lyrics["artist"].append(artist)
        lyrics["text"].append(text)

In [45]:
# FROM: 10 Finetuning a Language Model with Huggingface - Lyrics Mashup
# CHANGES: None

# AutoTokenizer will find and load the right tokenizer for all common models in the huggingface library
tokenizer = AutoTokenizer.from_pretrained(model_name)

i = random.randint(0, len(lyrics["text"]))
example_text = lyrics["text"][i][0:200]
example_text_tokenized = tokenizer(example_text)
print([tokenizer.decode(token) for token in example_text_tokenized["input_ids"]])

['[', ' 1', ']', '\n', 'Open', ' door', ',', ' so', ' I', ' walk', ' inside', '\n', 'Close', ' my', ' eyes', ',', ' find', ' my', ' place', ' to', ' hide', '\n', 'And', ' I', ' shake', ' as', ' I', ' take', ' it', ' in', '\n', 'Let', ' the', ' show', ' begin', '\n', 'Open', ' my', ' eyes', ' just', ' to', ' have', ' them', ' close', ' again', '\n', 'Well', ' on', ' my', ' way', ',', ' but', ' on', ' my', ' way', ' to', ' where', ' I']


In [46]:
# FROM: 10 Finetuning a Language Model with Huggingface - Lyrics Mashup
# CHANGES: None

# transforming lyrics into dataset format

dataset = Dataset.from_dict(lyrics)
print(dataset)

Dataset({
    features: ['text', 'artist'],
    num_rows: 725
})


In [47]:
# FROM: 10 Finetuning a Language Model with Huggingface - Lyrics Mashup
# CHANGES: None

# tokenizing before chunking, to ensure equal lengths

def pretokenization(text, tokenizer):
    tokens = tokenizer(text, truncation = False, add_special_tokens = False, return_attention_mask = False)
    return {"tokens":[tokenizer.decode(token) for token in tokens["input_ids"]]}

tokenized_dataset = dataset.map(lambda x: pretokenization(x["text"],tokenizer), remove_columns = ["text"])
print(tokenized_dataset)
i = random.randint(0, len(dataset["text"]))
print(tokenized_dataset["tokens"][i][0:20])

0ex [00:00, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1025 > 1024). Running this sequence through the model will result in indexing errors


Dataset({
    features: ['artist', 'tokens'],
    num_rows: 725
})
['Free', ' fall', ' through', ' our', ' midnight', '\n', 'This', ' ep', 'il', 'ogue', ' of', ' our', ' own', ' f', 'able', '\n', 'He', 'ed', 'less', ' in']


In [48]:
# FROM: 10 Finetuning a Language Model with Huggingface - Lyrics Mashup
# CHANGES: None

# chunking the lyrics into parts of equal length
# greater chunks allow longer-range dependencies but also increase computation 
chunk_size = 25  # increases computation quadraticly, basically
# depens on how many close words to consider during training over the current word 
#

def chunking(examples, chunk_size):
    chunks = []
    artists = []
    for tokens, artist in zip(examples["tokens"], examples["artist"]):

        for i in range(0, len(tokens)-chunk_size, chunk_size):
            
            chunks.append(tokens[i:i+chunk_size])
            artists.append(artist)
    
    return {'tokens': chunks, "artist": artists}

chunked_dataset = tokenized_dataset.map(lambda x: chunking(x, chunk_size), batched = True, 
                                        remove_columns = tokenized_dataset.column_names)

# check if all chunks are of equal length
print(max(len(ids) for ids in chunked_dataset["tokens"]))
print(min(len(ids) for ids in chunked_dataset["tokens"]))

print(chunked_dataset)

  0%|          | 0/1 [00:00<?, ?ba/s]

25
25
Dataset({
    features: ['artist', 'tokens'],
    num_rows: 8519
})


In [49]:
# FROM: 10 Finetuning a Language Model with Huggingface - Lyrics Mashup
# CHANGES: None

# stratifying by artist

n = Counter(chunked_dataset["artist"]).most_common()[-1][1]  # from library->Collections

def stratifying(examples, n):  # to have same amount of lines from both artists
    
    chunks = []
    artists = []
    
    for artist in set(examples["artist"]):
        
        artist_chunks = [chunk for a, chunk in zip(examples["artist"], examples["tokens"]) if a == artist]
        artist_chunks = random.sample(artist_chunks, n)
        
        for chunk in artist_chunks:
            
            chunks.append(chunk)
            artists.append(artist)
        
        
    return {'tokens': chunks, "artist": artists}
    
stratified_dataset = chunked_dataset.map(lambda x: stratifying(x, n), batched = True, batch_size = None,
                                        remove_columns = tokenized_dataset.column_names)

print(stratified_dataset)
print(Counter(stratified_dataset["artist"]).most_common())

  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['artist', 'tokens'],
    num_rows: 5055
})
[('megadeth', 1011), ('metallica', 1011), ('dio', 1011), ('tool', 1011), ('black_sabbath', 1011)]


In [50]:
# FROM: 10 Finetuning a Language Model with Huggingface - Lyrics Mashup
# CHANGES: None

# running tokenization again, to retrieve input_ids, labels and the attention_masks

def tokenization(tokens):
    tokenized = tokenizer(tokens)
    return  {"input_ids": tf.squeeze(tokenized["input_ids"]),
             # duplicating the inputs for our labels
             # "the model of the 🤗 Transformers library apply the shifting to the right, so we don't need to do it manually"
             "labels": tf.squeeze(tokenized["input_ids"]), 
             "attention_mask": tf.squeeze(tokenized["attention_mask"])}

final_dataset = stratified_dataset.map(lambda x:tokenization(x["tokens"]))
print(final_dataset)

0ex [00:00, ?ex/s]

Dataset({
    features: ['artist', 'tokens', 'input_ids', 'labels', 'attention_mask'],
    num_rows: 5055
})


In [51]:
# FROM: 10 Finetuning a Language Model with Huggingface - Lyrics Mashup
# CHANGES: None

# shuffling, batching, splitting  # needed when using tf and it needs to be transfored to a Tensorflow Dataset
from transformers import DefaultDataCollator

batch_size = 2
n_batches = round(final_dataset.num_rows/batch_size)
n_eval_batches = round(0.1*n_batches)
n_train_batches = n_batches - n_eval_batches


# transforming to a tensorflow dataset
data_collator = DefaultDataCollator(return_tensors= "tf")

tf_dataset = final_dataset.to_tf_dataset(columns=["attention_mask", "input_ids", "labels"],
                                                                     shuffle = True, 
                                                                     batch_size = batch_size,
                                                                     collate_fn = data_collator)

# !!!!! Important: setting the seed does not prevent different shuffling at each epoch, 
# !!!!! set reshuffle_each_iteration=False if used


tf_train_dataset = tf_dataset.take(n_train_batches)
tf_eval_dataset = tf_dataset.skip(n_train_batches).take(n_eval_batches)

print(tf_train_dataset)

<TakeDataset element_spec={'labels': TensorSpec(shape=(2, None), dtype=tf.int64, name=None), 'input_ids': TensorSpec(shape=(2, None), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(2, None), dtype=tf.int64, name=None)}>


In [52]:
# FROM: 10 Finetuning a Language Model with Huggingface - Lyrics Mashup
# CHANGES: None

# inspecting the configuration of the pretrained model
config = AutoConfig.from_pretrained(model_name)

# for more elaboration of all the configuration details check here:
# https://huggingface.co/docs/transformers/model_doc/gpt2#transformers.GPT2Config
# print(config)

# "n_ctx": 1024, sequence dim
# "n_embd": 768, embedding dim
# "n_head": 12, num of attetion heads

In [53]:
# FROM: 10 Finetuning a Language Model with Huggingface - Lyrics Mashup
# CHANGES: None

# loading the model including its causal language modeling head

model = TFAutoModelForCausalLM.from_pretrained(model_name)

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [54]:
# FROM: 10 Finetuning a Language Model with Huggingface - Lyrics Mashup
# CHANGES: None

# testing the pretrained model as is

generator = pipeline('text-generation',model=model, tokenizer=tokenizer) 
# for more detailed tuning you have to implement differently
# for that the tokenization works
# for different amount of randomness etc.


prompt = "Peace sells but who is buying"

print(generator(prompt)[0]["generated_text"])
# so you can see it wasnt trained on lyrics

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


Peace sells but who is buying it?

It's hard to define the difference between "bespoke" and "snowflake" or between anything that really tastes like Apple. That is, if you are trying to decide whether to buy


In [55]:
# FROM: 10 Finetuning a Language Model with Huggingface - Lyrics Mashup
# CHANGES: Hyperparameter Tuning see next section

# initializing the optimizer and defining our learning rate schedule

epochs = 50
num_train_steps = epochs * n_train_batches
warmup_ratio = 0.1
num_warmup_steps = int(warmup_ratio*num_train_steps)

init_lr = 1e-3

optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')

# compiling the model
model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! Please ensure your labels are passed as keys in the input dict so that they are accessible to the model during the forward pass. To disable this behaviour, please pass a loss argument, or explicitly pass loss=None if you do not want your model to compute a loss.


In [56]:
history = model.fit(
    tf_train_dataset,
    validation_data=tf_eval_dataset,
    epochs = epochs)
    

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [57]:
GPUtil.showUtilization()

| ID | GPU | MEM |
------------------
|  0 | 61% | 53% |


# Saving the genre model


In [58]:
#genre_names = ["blues", "country", "jazz", "metal", "pop", "rock"] 
#genre_names[5]

In [59]:
# All the above cells are run 6 times with the same hyperparameters that are obtained 
# using the pop_genre as context data (with most lines) to save time and have 
# comparable results of syntax and semantics

# from https://huggingface.co/docs/transformers/model_sharing

#model_path = "/content/drive/MyDrive/EchoCanyon/saved_model/blues"
#model_path = "/content/drive/MyDrive/EchoCanyon/saved_model/country"
#model_path = "/content/drive/MyDrive/EchoCanyon/saved_model/jazz"
model_path = "/content/drive/MyDrive/EchoCanyon/saved_model/metal"
#model_path = "/content/drive/MyDrive/EchoCanyon/saved_model/pop" 
#model_path = "/content/drive/MyDrive/EchoCanyon/saved_model/rock"

In [60]:
#https://huggingface.co/docs/transformers/model_sharing

model.save_pretrained(model_path)

In [61]:
DONE

NameError: ignored

## Test

In [None]:
# FROM: 10 Finetuning a Language Model with Huggingface - Lyrics Mashup
# CHANGES: None

"""prompt = "What did the chicken say afer crossing the road"
n_generations = 5 # runs 

lyrics = prompt

for i in range(n_generations):
    
    lyrics += generator(lyrics[-100:], return_full_text = False)[0]["generated_text"]
    
print(lyrics)

#eos enf of sentence is used a  pad token"""

# Hyperparameter Tuning

In [None]:
"""# FROM: 11 Hyperparameter Search Tensorboard
# CHANGES: None

## Imports

%load_ext tensorboard

import os
# to use or not to use GPU
os.environ["CUDA_VISIBLE_DEVICES"]="-1"

import json
import random
import re
from collections import Counter
import tensorflow as tf
from tensorflow.keras import layers
from tensorboard.plugins.hparams import api as hp  # for hparam vis
from official.nlp import optimization 
import shutil
import numpy as np
from tensorboard import notebook
from sklearn.utils import class_weight
from sklearn.metrics import precision_recall_fscore_support
"""

In [None]:
"""# FROM: 11 Hyperparameter Search Tensorboard
# CHANGES: Paths

# path to the data
save_path = "/content/drive/MyDrive/EchoCanyon/lyrics/"


# setting up logging directory
log_dir = '/content/drive/MyDrive/EchoCanyon/log_dir/'

try:
    shutil.rmtree(log_dir) # clearing logging directory <------------------
except:
    pass
    
if not os.path.exists(log_dir):
    os.makedirs(log_dir)
    """

In [None]:
"""# FROM: 11 Hyperparameter Search Tensorboard
# CHANGES: None

# loading your own lyrics file or the one available on moodle
with open(save_path + "pop_lyrics.json", "r", encoding = "utf8") as f:
    
    lyrics_dict = json.load(f)
    
# sampling, if there are more than 5 labels
n_artists = min(5, len(lyrics_dict.keys()))
random.seed(42)
artist_sample = random.sample(list(lyrics_dict.keys()), n_artists)
lyrics_dict = {artist:lyrics for artist, lyrics in lyrics_dict.items() if artist in artist_sample}
print([key for key in lyrics_dict.keys()])
"""

In [None]:
"""# FROM: 11 Hyperparameter Search Tensorboard
# CHANGES: None

# extracting lines and artists
lines = []
artists = []

for artist, lyrics in lyrics_dict.items():
    
    for title, lyric in lyrics.items():
        
        lyrics_lines = [re.sub("\r", "", line) for line in lyric.split("\n")]
        
        lyrics_lines = [line for line in lyrics_lines if re.search("\w", line) and line != "None"]

        for line in set(lyrics_lines):
            lines.append(line)
            artists.append(artist)

print(len(lines))
print(len(artists))
"""

In [None]:
"""# FROM: 11 Hyperparameter Search Tensorboard
# CHANGES: None

# extracting lines and artists
lines = []
artists = []

for artist, lyrics in lyrics_dict.items():
    
    for title, lyric in lyrics.items():
        
        lyrics_lines = [re.sub("\r", "", line) for line in lyric.split("\n")]
        
        lyrics_lines = [line for line in lyrics_lines if re.search("\w", line) and line != "None"]

        for line in set(lyrics_lines):
            lines.append(line)
            artists.append(artist)

print(len(lines))
print(len(artists))
"""

In [None]:
"""# FROM: 11 Hyperparameter Search Tensorboard
# CHANGES: None

# Selecting a smaller subset for faster training demonstration
n_lines = 27983

sample = random.sample(range(0,len(lines)), n_lines)
lines = [line for i,line in enumerate(lines) if i in sample]
artists = [artist for i,artist in enumerate(artists) if i in sample]
Counter(artists)
"""

In [None]:
"""# FROM: 11 Hyperparameter Search Tensorboard
# CHANGES: None

# transforming to a tensorflow dataset

# inputs
inputs = tf.data.Dataset.from_tensor_slices(lines) #.repeat() #https://stackoverflow.com/questions/49531286/cannot-batch-tensors-with-different-shapes-in-component-0-with-tf-data-dataset

input_vectorizer = layers.TextVectorization(
    standardize="lower_and_strip_punctuation",
    split = "whitespace",
    output_mode='int',
    max_tokens = 2000,
    output_sequence_length = 10)

input_vectorizer.adapt(inputs)
inputs = inputs.map(input_vectorizer)

# target dict
artist_dict = {artist: i for i, artist in enumerate(list(set(artists)))}
artist_dict_rev = {i: artist for artist, i in artist_dict.items()}

# targets
artists_num = [artist_dict[artist] for artist in artists]
targets = tf.data.Dataset.from_tensor_slices(artists_num)
target_vectorizer = layers.CategoryEncoding(num_tokens = len(artist_dict), output_mode="one_hot")
targets = targets.map(target_vectorizer)

# class weights for unbalanced datasets
balanced_class_weights = dict(enumerate(class_weight.compute_class_weight('balanced',
                                                 classes = list(artist_dict_rev.keys()),
                                                 y = artists_num)))
# num of examples from 1 artist can make the model biased for that if it dominates otherwise

# zipping
dataset = tf.data.Dataset.zip((inputs, targets))

# shuffling
n_examples = tf.data.experimental.cardinality(dataset).numpy()
dataset = dataset.shuffle(n_examples, seed = 42, reshuffle_each_iteration=False)

# !!!!! Important: setting the seed does not prevent different shuffling at each epoch, 
# !!!!! set reshuffle_each_iteration=False if used

# check dataset
for input_, target in dataset.take(3):
    print(input_)
    print(target)
    print(artist_dict_rev[tf.argmax(target).numpy()])
    print("\n")

print("Class counts and weights:")
print(Counter(artists_num))
print(balanced_class_weights)

"""

In [None]:
"""# FROM: 11 Hyperparameter Search Tensorboard
# CHANGES: None

# splitting and batching

AUTOTUNE = tf.data.AUTOTUNE
batch_size = 32
val_size = 0.1

# splitting, batching
val_size = round(val_size*n_examples)
test_dataset = dataset.take(val_size).batch(batch_size=batch_size, drop_remainder=True)
val_dataset = dataset.skip(val_size).take(val_size).batch(batch_size=batch_size, drop_remainder=True)
train_dataset = dataset.skip(val_size*2).batch(batch_size=batch_size, drop_remainder=True)

# prefetching
test_dataset = test_dataset.cache().prefetch(buffer_size=AUTOTUNE)
val_dataset = val_dataset.cache().prefetch(buffer_size=AUTOTUNE)
train_dataset = train_dataset.cache().prefetch(buffer_size=AUTOTUNE)

print(len(train_dataset))
print(len(val_dataset))
print(len(test_dataset))
"""

In [None]:
"""# FROM: 11 Hyperparameter Search Tensorboard
# CHANGES: None

# defining our hyperparameter search space -> 2 methods: RealInterval, Discrete
hp_learning_rate = hp.HParam('learning_rate', hp.Discrete([0.1, 0.01, 0.001, 0.0001, 0.00001])) 
# currently possible with only two values for grid search
hp_optimizer = hp.HParam('optimizer', hp.Discrete(['adamw', 'sgd']))
hp_class_weights = hp.HParam('class_weights', hp.Discrete(['none', 'balanced']))
#hp_hidden_units = hp.HParam('hidden_units', hp.Discrete([8,32]))  # OR hp.Discrete([8, 32])

# fixed parameters and dimensions
params = {"vocab_size":input_vectorizer.vocabulary_size() +1,  # +1: Padding
            "embedding_dim": 256,
            "hidden_units": 128,
            "n_labels": len(artist_dict),
            "n_epochs": 10,
            "n_steps": 3*len(train_dataset)}


# initializing the logger
with tf.summary.create_file_writer(log_dir).as_default():
    hp.hparams_config(
    hparams=[hp_learning_rate, hp_optimizer, hp_class_weights],#, hp_hidden_units],
    metrics=[hp.Metric("accuracy", display_name='Accuracy'),
            hp.Metric("precision", display_name= "Precision"),
            hp.Metric("recall", display_name = "Recall"),
            hp.Metric("f1", display_name = "F1")],
    )  # see notes for more info aout metrics

"""


In [None]:
"""# FROM: 11 Hyperparameter Search Tensorboard
# CHANGES: None

# functions to run a trial with specified hyperparameters and fixed parameters

def hp_search_trial(log_dir, params, hparams):
    model = tf.keras.Sequential([
    layers.Embedding(params["vocab_size"], params["embedding_dim"], mask_zero = True), # masks zero paddings but not any other value
    #layers.Masking(mask_value = 0), -> NOT needed
    layers.GlobalAveragePooling1D(),
    layers.Dense(params["hidden_units"], activation = tf.nn.leaky_relu),
    # using Leaky ReLU: similar to ReLU with a little tweak for negative input values
    layers.Dense(params["n_labels"], activation = tf.nn.softmax)])

    if hparams[hp_optimizer] == "sgd":
        
        optimizer = tf.keras.optimizers.SGD(learning_rate=hparams[hp_learning_rate])
        
    elif hparams[hp_optimizer] == "adamw":  # updated version of adam
        
        optimizer = optimization.create_optimizer(init_lr=hparams[hp_learning_rate],
                                            num_train_steps=params["n_steps"],
                                            num_warmup_steps=round(0.1* params["n_steps"]),
                                            optimizer_type='adamw')

    class_weights = balanced_class_weights if hparams[hp_class_weights] == "balanced" else None

    model.compile(
        optimizer=optimizer,
        loss=tf.keras.losses.CategoricalCrossentropy(),
        metrics=['accuracy']
    )

    model.fit(train_dataset,validation_data=val_dataset, epochs=params["n_epochs"],
            callbacks=[tf.keras.callbacks.TensorBoard(log_dir=log_dir, 
                                                        histogram_freq=1, 
                                                        update_freq='batch')],
            class_weight = class_weights) 
    
    test_loss, test_accuracy = model.evaluate(test_dataset)
    
    true_labels = np.concatenate([y for x, y in test_dataset], axis=0).argmax(axis = -1)
    preds = model.predict(test_dataset).argmax(axis = -1)
    precision, recall, f1, support = precision_recall_fscore_support(true_labels, preds, average = "macro")
    # without avg. it will give out each class individually
    #precision, recall, f1, support = precision_recall_fscore_support(true_labels, preds)
    
    return test_accuracy, precision, recall, f1


def run(log_dir, params, hparams):
    
    with tf.summary.create_file_writer(log_dir).as_default():
        hp.hparams(hparams)  # record the values used in this trial
        
        test_accuracy, precision, recall, f1 = hp_search_trial(log_dir, params, hparams)
        tf.summary.scalar("accuracy", test_accuracy, step=1)
        # you can also log batch accuracy to see a trend, and train longer if needed
        tf.summary.scalar("precision", precision, step = 1)
        tf.summary.scalar("recall", recall, step = 1)
        tf.summary.scalar("f1", f1, step = 1)

        """

## Running Grid Search

In [None]:
"""# FROM: 11 Hyperparameter Search Tensorboard
# CHANGES: None

session_num = 0

for optimizer in hp_optimizer.domain.values:
    for learning_rate in hp_learning_rate.domain.values:#(hp_learning_rate.domain.min_value, hp_learning_rate.domain.max_value):
        for class_weights in hp_class_weights.domain.values:
            #for hidden_units in hp_class_weights.domain.values:  # (hp_hidden_units.domain.min_value, hp_hidden_units.domain.max_value):
                hparams = {
                  hp_optimizer: optimizer,
                  hp_learning_rate: learning_rate,
                  hp_class_weights: class_weights
                  }
                run_name = f"run{session_num}_{optimizer}_lr{learning_rate}_weights={class_weights}"
                print(f'--- Starting trial: {run_name}')
                print({h.name: hparams[h] for h in hparams})
                run(f'{log_dir}{run_name}', params, hparams)
                session_num += 1
"""

In [None]:
"""# FROM: 11 Hyperparameter Search Tensorboard
# CHANGES: None

!tensorboard --logdir logs/hparam_tuning --port 9000
"""

In [None]:
"""# FROM: 11 Hyperparameter Search Tensorboard
# CHANGES: None

# if it is not working, try another port

# listing tensorboard instances
notebook.list()

# resetting tensorboard

# Linux
#!kill #ProcessID

# Windows
#!taskkill /f /pid #ProcessID
# you also have to delete .tensorflow.info in your %TEMP% directory
"""

# Training all at once


In [None]:
for genre_index in range(1,6): 
    #with open("/content/drive/MyDrive/EchoCanyon/lyrics/blues_lyrics.json", "r", encoding = "utf8") as f:
    #with open("/content/drive/MyDrive/EchoCanyon/lyrics/country_lyrics.json", "r", encoding = "utf8") as f:
    #with open("/content/drive/MyDrive/EchoCanyon/lyrics/jazz_lyrics.json", "r", encoding = "utf8") as f:
    #with open("/content/drive/MyDrive/EchoCanyon/lyrics/metal_lyrics.json", "r", encoding = "utf8") as f:
    #with open("/content/drive/MyDrive/EchoCanyon/lyrics/pop_lyrics.json", "r", encoding = "utf8") as f:
    #with open("/content/drive/MyDrive/EchoCanyon/lyrics/rock_lyrics.json", "r", encoding = "utf8") as f:
    
    lyrics_path = "/content/drive/MyDrive/EchoCanyon/lyrics/"
    genre_names = ["blues", "country", "jazz", "metal", "pop", "rock"] 
    lyrics_path += (genre_names[genre_index] + "_lyrics.json")
    print("#############", genre_index, genre_names[genre_index], "#############")
    with open(lyrics_path, "r", encoding = "utf8") as f:

        lyrics_dict = {}
        lyrics_dict = json.load(f)

    # FROM: 10 Finetuning a Language Model with Huggingface - Lyrics Mashup
    # CHANGES: None



    # reformatting lyrics

    lyrics = {"text":[], "artist":[]}
    for artist, titles in lyrics_dict.items():
        
        for title, text in titles.items():
            
            lyrics["artist"].append(artist)
            lyrics["text"].append(text)

    # FROM: 10 Finetuning a Language Model with Huggingface - Lyrics Mashup
    # CHANGES: None



    # AutoTokenizer will find and load the right tokenizer for all common models in the huggingface library
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    i = random.randint(0, len(lyrics["text"]))
    example_text = lyrics["text"][i][0:200]
    example_text_tokenized = tokenizer(example_text)
    print([tokenizer.decode(token) for token in example_text_tokenized["input_ids"]])



    # FROM: 10 Finetuning a Language Model with Huggingface - Lyrics Mashup
    # CHANGES: None

    # transforming lyrics into dataset format

    dataset = Dataset.from_dict(lyrics)
    print(dataset)



    # FROM: 10 Finetuning a Language Model with Huggingface - Lyrics Mashup
    # CHANGES: None

    # tokenizing before chunking, to ensure equal lengths

    def pretokenization(text, tokenizer):
        tokens = tokenizer(text, truncation = False, add_special_tokens = False, return_attention_mask = False)
        return {"tokens":[tokenizer.decode(token) for token in tokens["input_ids"]]}

    tokenized_dataset = dataset.map(lambda x: pretokenization(x["text"],tokenizer), remove_columns = ["text"])
    print(tokenized_dataset)
    i = random.randint(0, len(dataset["text"]))
    print(tokenized_dataset["tokens"][i][0:20])



    # FROM: 10 Finetuning a Language Model with Huggingface - Lyrics Mashup
    # CHANGES: None

    # chunking the lyrics into parts of equal length
    # greater chunks allow longer-range dependencies but also increase computation 
    chunk_size = 25  # increases computation quadraticly, basically
    # depens on how many close words to consider during training over the current word 
    #

    def chunking(examples, chunk_size):
        chunks = []
        artists = []
        for tokens, artist in zip(examples["tokens"], examples["artist"]):

            for i in range(0, len(tokens)-chunk_size, chunk_size):
                
                chunks.append(tokens[i:i+chunk_size])
                artists.append(artist)
        
        return {'tokens': chunks, "artist": artists}

    chunked_dataset = tokenized_dataset.map(lambda x: chunking(x, chunk_size), batched = True, 
                                            remove_columns = tokenized_dataset.column_names)

    # check if all chunks are of equal length
    print(max(len(ids) for ids in chunked_dataset["tokens"]))
    print(min(len(ids) for ids in chunked_dataset["tokens"]))

    print(chunked_dataset)



    # FROM: 10 Finetuning a Language Model with Huggingface - Lyrics Mashup
    # CHANGES: None

    # stratifying by artist

    n = Counter(chunked_dataset["artist"]).most_common()[-1][1]  # from library->Collections

    def stratifying(examples, n):  # to have same amount of lines from both artists
        
        chunks = []
        artists = []
        
        for artist in set(examples["artist"]):
            
            artist_chunks = [chunk for a, chunk in zip(examples["artist"], examples["tokens"]) if a == artist]
            artist_chunks = random.sample(artist_chunks, n)
            
            for chunk in artist_chunks:
                
                chunks.append(chunk)
                artists.append(artist)
            
            
        return {'tokens': chunks, "artist": artists}
        
    stratified_dataset = chunked_dataset.map(lambda x: stratifying(x, n), batched = True, batch_size = None,
                                            remove_columns = tokenized_dataset.column_names)

    print(stratified_dataset)
    print(Counter(stratified_dataset["artist"]).most_common())



    # FROM: 10 Finetuning a Language Model with Huggingface - Lyrics Mashup
    # CHANGES: None

    # running tokenization again, to retrieve input_ids, labels and the attention_masks

    def tokenization(tokens):
        tokenized = tokenizer(tokens)
        return  {"input_ids": tf.squeeze(tokenized["input_ids"]),
                # duplicating the inputs for our labels
                # "the model of the 🤗 Transformers library apply the shifting to the right, so we don't need to do it manually"
                "labels": tf.squeeze(tokenized["input_ids"]), 
                "attention_mask": tf.squeeze(tokenized["attention_mask"])}

    final_dataset = stratified_dataset.map(lambda x:tokenization(x["tokens"]))
    print(final_dataset)



    # FROM: 10 Finetuning a Language Model with Huggingface - Lyrics Mashup
    # CHANGES: None

    # shuffling, batching, splitting  # needed when using tf and it needs to be transfored to a Tensorflow Dataset
    from transformers import DefaultDataCollator

    batch_size = 2
    n_batches = round(final_dataset.num_rows/batch_size)
    n_eval_batches = round(0.1*n_batches)
    n_train_batches = n_batches - n_eval_batches


    # transforming to a tensorflow dataset
    data_collator = DefaultDataCollator(return_tensors= "tf")

    tf_dataset = final_dataset.to_tf_dataset(columns=["attention_mask", "input_ids", "labels"],
                                                                        shuffle = True, 
                                                                        batch_size = batch_size,
                                                                        collate_fn = data_collator)

    # !!!!! Important: setting the seed does not prevent different shuffling at each epoch, 
    # !!!!! set reshuffle_each_iteration=False if used


    tf_train_dataset = tf_dataset.take(n_train_batches)
    tf_eval_dataset = tf_dataset.skip(n_train_batches).take(n_eval_batches)

    print(tf_train_dataset)



    # FROM: 10 Finetuning a Language Model with Huggingface - Lyrics Mashup
    # CHANGES: None

    # inspecting the configuration of the pretrained model
    config = AutoConfig.from_pretrained(model_name)

    # for more elaboration of all the configuration details check here:
    # https://huggingface.co/docs/transformers/model_doc/gpt2#transformers.GPT2Config
    # print(config)

    # "n_ctx": 1024, sequence dim
    # "n_embd": 768, embedding dim
    # "n_head": 12, num of attetion heads



    # FROM: 10 Finetuning a Language Model with Huggingface - Lyrics Mashup
    # CHANGES: None

    # loading the model including its causal language modeling head

    model = TFAutoModelForCausalLM.from_pretrained(model_name)



    # FROM: 10 Finetuning a Language Model with Huggingface - Lyrics Mashup
    # CHANGES: Hyperparameter Tuning see next section

    # initializing the optimizer and defining our learning rate schedule

    epochs = 50
    num_train_steps = epochs * n_train_batches
    warmup_ratio = 0.1
    num_warmup_steps = int(warmup_ratio*num_train_steps)

    init_lr = 1e-3

    optimizer = optimization.create_optimizer(init_lr=init_lr,
                                              num_train_steps=num_train_steps,
                                              num_warmup_steps=num_warmup_steps,
                                              optimizer_type='adamw')

    # compiling the model
    model.compile(optimizer=optimizer)



    history = model.fit(
        tf_train_dataset,
        validation_data=tf_eval_dataset,
        epochs = epochs)
        


    # All the above cells are run 6 times with the same hyperparameters that are obtained 
    # using the pop_genre as context data (with most lines) to save time and have 
    # comparable results of syntax and semantics

    # from https://huggingface.co/docs/transformers/model_sharing

    model_path = "/content/drive/MyDrive/EchoCanyon/saved_model/"
    genre_names = ["blues", "country", "jazz", "metal", "pop", "rock"]
    #print("#############", genre_index, genre_names[genre_index], "#############")
    model_path += genre_names[genre_index]

    #model_path = "/content/drive/MyDrive/EchoCanyon/saved_model/blues"
    #model_path = "/content/drive/MyDrive/EchoCanyon/saved_model/country"
    #model_path = "/content/drive/MyDrive/EchoCanyon/saved_model/jazz"
    #model_path = "/content/drive/MyDrive/EchoCanyon/saved_model/metal"
    #model_path = "/content/drive/MyDrive/EchoCanyon/saved_model/pop" 
    #model_path = "/content/drive/MyDrive/EchoCanyon/saved_model/rock"

    model.save_pretrained(model_path)

# Genre Specific Generators for Usage


In [None]:
# Choose Genre
genre = 4
model_path = "/content/drive/MyDrive/EchoCanyon/saved_model/"
genre_names = ["blues", "country", "jazz", "metal", "pop", "rock"]  # count from 0
model_path += genre_names[genre]
model_path

In [None]:
model = TFAutoModelForCausalLM.from_pretrained(model_path)



In [None]:
# FROM: 10 Finetuning a Language Model with Huggingface - Lyrics Mashup
# CHANGES: Redefining tokenizer and generator to match the genre model

tokenizer = AutoTokenizer.from_pretrained(model_name)
generator = pipeline('text-generation',model=model, tokenizer=tokenizer) 

#prompt = "Peace sells, but who is buying"  
prompt = "Como esta" 

# Too random with 1-2 words, doesnt get the context
# Performs better with one sentence
# Too restrictive and causes repetititions with a paragraph

n_generations = 5 # runs 

lyrics = prompt

for i in range(n_generations):
    
    lyrics += generator(lyrics[-100:], return_full_text = False)[0]["generated_text"]
    
print(lyrics)

#eos enf of sentence is used a  pad token


In [None]:
lines = lyrics.split("\n")
#words = ["" for i in range(len(lines))]
#for count, line in enumerate(lines):
#    words[count] = line.split(" ")

In [None]:
lines = lines[:-1]

In [None]:
lyrics = '\n'.join(lines)

In [None]:
lyrics

In [None]:
from os.path import exists
import shutil

i = 0
path = '/content/drive/MyDrive/EchoCanyon/gen_lyrics/lyrics_current.txt'

while exists(path):
    i += 1
    path = '/content/drive/MyDrive/EchoCanyon/gen_lyrics/lyrics_' + str(i) + '.txt'  
    if not exists(path):
        original = '/content/drive/MyDrive/EchoCanyon/gen_lyrics/lyrics_current.txt'
        target = path
        shutil.copyfile(original, target)
        break

with open('/content/drive/MyDrive/EchoCanyon/gen_lyrics/lyrics_current.txt', 'w') as f:
    f.write(lyrics)   