<a href="https://colab.research.google.com/github/xjseabrum/lyrics-nlp-acoustic-predictions/blob/main/Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Statements and Setup


In [None]:
# Print the Python version of the runtime this notebook is executing on.
!python --version

Python 3.7.13


In [None]:
# Mount Google Drive inside the Colab filesystem at /content/gdrive.
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [None]:
# Make the project folder on the mounted Drive the working directory so the
# relative paths used in later cells resolve against it.
import os
os.chdir("/content/gdrive/MyDrive/lyrics-nlp")

In [None]:
# Install the project dependencies listed in requirements.txt.
!pip3 install -r requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fasttext==0.9.2
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[K     |████████████████████████████████| 68 kB 3.2 MB/s 
[?25hCollecting gensim==4.1.2
  Downloading gensim-4.1.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
[K     |████████████████████████████████| 24.1 MB 17 kB/s 
[?25hCollecting lyricsgenius==3.0.1
  Downloading lyricsgenius-3.0.1-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 6.7 MB/s 
[?25hCollecting matplotlib==3.5.2
  Downloading matplotlib-3.5.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (11.2 MB)
[K     |████████████████████████████████| 11.2 MB 27.4 MB/s 
[?25hCollecting nltk==3.5
  Downloading nltk-3.5.zip (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 50.2 MB/s 
Collecting seaborn==0.11.1
  Downloading seaborn-0.11.1-py3-none-any.whl (285 kB)
[K     |████████████████████████

In [None]:
from utils import count_words, proportion_unique_words
from utils import strip_punctuation
import pandas as pd
import numpy as np
import pickle5 as pickle
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
import torch
from sentence_transformers import SentenceTransformer
from transformers import LongformerConfig, LongformerModel, LongformerTokenizer
from transformers import Trainer, TrainingArguments

# Helper Function Definitions

In [None]:
# Print floats in fixed-point form with five decimals rather than
# scientific notation whenever an array is shown.
def _fixed_point(value):
  return '{:.5f}'.format(value)

np.set_printoptions(formatter = {'float_kind': _fixed_point},
                    suppress = True)

In [None]:
def get_sentence_transformer_embedding(lyrics:list):
  """Encode each lyric with the module-level ``sent_transformer``.

  Args:
    lyrics: iterable of lyric strings, one per song. Pass the strings
      themselves (e.g. ``list(frame["lyrics"])``); iterating a whole
      DataFrame yields its column labels, not its rows.

  Returns:
    np.ndarray built from one embedding per lyric, in input order.
  """
  # Sentence transformers are limited.  From this documentation:
  # https://www.sbert.net/examples/applications/computing-embeddings/README.html
  # `` By default, the provided methods use a limit of 128 word pieces,
  # longer inputs will be truncated. [...] Note: You cannot increase the
  # length higher than what is maximally supported by the respective
  # transformer model. Also note that if a model was trained on short texts,
  # the representations for long texts might not be that good.``
  # With this in mind, the max sequence length will be set to 512,
  # as that is the most that BERT and other transformers have done.
  # The setting does not depend on the lyric, so it is applied once up front
  # instead of on every loop iteration.
  sent_transformer.max_seq_length = 512
  return np.array([sent_transformer.encode(lyric) for lyric in lyrics])

In [None]:
def get_longformer_embedding(lyrics:list):
  """Embed each lyric with the module-level ``tokenizer`` and ``longformer``.

  Every lyric is tokenized on its own (truncated/padded to 1024 tokens) and
  run through the model; the last layer's vector at position 0 is kept as
  the lyric's embedding.

  Args:
    lyrics: iterable of lyric strings, one per song.

  Returns:
    np.ndarray stacking the per-lyric slices in input order. Each slice keeps
    the batch axis of its single-lyric forward pass, so the result is nested
    one level deeper than a plain (n_lyrics, hidden) matrix.
  """
  encoded_inputs = []
  # Inference only: without no_grad() every forward pass would record an
  # autograd graph that is thrown away immediately afterwards.
  with torch.no_grad():
    for lyric in lyrics:
      encoded_input = tokenizer(lyric, return_tensors = "pt", max_length = 1024,
                                truncation = True, padding = "max_length")
      # Only the last layer's output is read below, so the per-layer hidden
      # states are not requested.
      output = longformer(**encoded_input)
      # Take the last layer's output of the longformer [0], then the
      # [:, 0, :] slice to get the ``sentence embedding`` which
      # ``represents`` the CLS token.
      encoded_lyric = output[0][:, 0, :].detach().numpy()
      # Note that the embeddings might not be meaningful (ie cosine
      # similarity between sentences.) This limitation is noted in the
      # original BERT paper and is also a limitation for the longformer.
      encoded_inputs.append(encoded_lyric)
  return np.array(encoded_inputs)

In [None]:
def get_response_vectors(resp:pd.DataFrame) -> np.ndarray:
  """Return the rows of ``resp`` as a NumPy array, one row per observation.

  Args:
    resp: frame whose rows are the response vectors.

  Returns:
    np.ndarray of shape (len(resp), n_columns). The array is a copy, so
    writing to it does not modify ``resp``. An empty frame yields an array
    of shape (0, n_columns).
  """
  # to_numpy does the row stacking in one call; copy=True keeps the result
  # independent of the frame's underlying data.
  return resp.to_numpy(copy = True)

In [None]:
# Pickle an object to <name>.pkl, never overwriting an existing file.
def save_vects(vect:np.ndarray, name:str) -> None:
  """Write ``vect`` to ``name + ".pkl"`` unless that file already exists.

  Args:
    vect: object to pickle (the notebook passes NumPy arrays).
    name: path of the target file without the ".pkl" extension.

  If the file already exists nothing is written and a message asking for
  the file to be deleted first is printed instead.
  """
  filename = name + ".pkl"
  try:
    # "xb" creates the file exclusively: the existence check and the open
    # are one operation, so an existing file can never be truncated.
    with open(filename, "xb") as out:
      pickle.dump(vect, out, pickle.HIGHEST_PROTOCOL)
  except FileExistsError:
    print(f"File {filename} already exists in the directory!"
    f" Delete {filename} before attempting to save again.")

In [None]:
# Load a pickle written by save_vects.
def load_vects(name:str):
  """Return the object stored in ``name + ".pkl"``.

  Args:
    name: path of the pickle file without the ".pkl" extension.

  Returns:
    Whatever object was pickled into the file.
  """
  filename = name + ".pkl"
  # NOTE: unpickling can execute arbitrary code; only load files that this
  # project wrote itself.
  with open(filename, "rb") as source:
    # The with-block closes the handle even when unpickling fails.
    return pickle.load(source)

In [None]:
# For implementing checkpointing
def set_up_checkpoint(checkpoint_name, monitor = "val_mae", mode = "min"):
  """Build a ModelCheckpoint that keeps only the best weights seen so far.

  Args:
    checkpoint_name: path the weights are written to.
    monitor: logged quantity that decides which epoch is "best". The model
      in this notebook is compiled with the "mae" metric, so the validation
      key that is actually logged is "val_mae"; a key that is never logged
      (such as "val_acc") would never trigger a save.
    mode: "min" when a lower monitored value is better, "max" otherwise.

  Returns:
    The configured ModelCheckpoint callback.

  NOTE: a later cell in this notebook defines a function with the same
  name; whichever definition ran last is the one in effect.
  """
  return ModelCheckpoint(checkpoint_name,
                         monitor = monitor,
                         mode = mode,
                         save_best_only = True,
                         save_weights_only = True,
                         verbose = 1)

# Load in the Data

In [None]:
# Predictor frames for the train / validation / test splits.
x_train = pd.read_csv("data/05_x_train.csv")
x_valid = pd.read_csv("data/05_x_valid.csv")
x_test = pd.read_csv("data/05_x_test.csv")

# Response frames for the same three splits.
y_train = pd.read_csv("data/05_y_train.csv")
y_valid = pd.read_csv("data/05_y_valid.csv")
y_test = pd.read_csv("data/05_y_test.csv")

In [None]:
# Apply the same lyric preparation to the training and validation frames:
# record the proportion of unique words from the untouched text first, then
# strip punctuation and lower-case the lyrics in place.
for lyrics_frame in (x_train, x_valid):
  raw_lyrics = lyrics_frame["lyrics"]
  lyrics_frame["prop_unique"] = raw_lyrics.apply(proportion_unique_words)
  lyrics_frame["lyrics"] = strip_punctuation(lyrics_frame["lyrics"], for_transformers = True)
  lyrics_frame["lyrics"] = lyrics_frame["lyrics"].str.lower()

In [None]:
# Sentence Transformer
# all-mpnet-base-v2 is the best overall performing sent-transformer
# model according to the documentation:
# sbert.net/docs/pretrained_models.html#sentence-embedding-models/
# The loaded model is bound to the module-level name `sent_transformer`,
# which get_sentence_transformer_embedding reads.
sent_transformer = SentenceTransformer("all-mpnet-base-v2")

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
# Longformer
# https://huggingface.co/docs/transformers/model_doc/longformer
# Load the pretrained Longformer encoder and its tokenizer from the same
# checkpoint; get_longformer_embedding reads both as module-level names.
longformer = LongformerModel.from_pretrained("allenai/longformer-base-4096")
# NOTE(review): get_longformer_embedding passes max_length = 1024 on every
# tokenizer call as well — confirm whether the max_length given here has any
# effect of its own.
tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096", 
                                                max_length = 1024)

Downloading config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/570M [00:00<?, ?B/s]

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading vocab.json:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

# Get the Embeddings

In [None]:
# LF takes forever (~18min for 610 lyrics). This is to be expected.
# TODO: change the loop above to take care of the double nesting of the
# LF emb output.

# Pull the lyric strings out once per split, then embed each split with
# both encoders.
train_lyrics = x_train["lyrics"].tolist()
valid_lyrics = x_valid["lyrics"].tolist()

x_train_st_emb = get_sentence_transformer_embedding(train_lyrics)
x_train_lf_emb = get_longformer_embedding(train_lyrics)

x_valid_st_emb = get_sentence_transformer_embedding(valid_lyrics)
x_valid_lf_emb = get_longformer_embedding(valid_lyrics)

In [None]:
# Convert the training and validation response frames to NumPy arrays.
y_train_np = get_response_vectors(y_train)
y_valid_np = get_response_vectors(y_valid)

In [None]:
# Persist every embedding / response array under a file named after its
# variable; save_vects leaves files that already exist untouched.
vectors_to_save = {
  "x_train_st_emb": x_train_st_emb,
  "x_train_lf_emb": x_train_lf_emb,
  "x_valid_st_emb": x_valid_st_emb,
  "x_valid_lf_emb": x_valid_lf_emb,
  "y_train_np": y_train_np,
  "y_valid_np": y_valid_np,
}
for vect_name, vect in vectors_to_save.items():
  save_vects(vect, vect_name)

File x_train_st_emb.pkl already exists in the directory. Delete x_train_st_emb.pkl before attempting to save again.
File x_train_lf_emb.pkl already exists in the directory. Delete x_train_lf_emb.pkl before attempting to save again.
File x_valid_st_emb.pkl already exists in the directory. Delete x_valid_st_emb.pkl before attempting to save again.
File x_valid_lf_emb.pkl already exists in the directory. Delete x_valid_lf_emb.pkl before attempting to save again.
File y_train_np.pkl already exists in the directory. Delete y_train_np.pkl before attempting to save again.
File y_valid_np.pkl already exists in the directory. Delete y_valid_np.pkl before attempting to save again.


# Make the model

In [None]:
# Load in the embeddings if they aren't already in the environment.
# Arrays computed earlier in this session are kept as they are: save_vects
# never overwrites an existing pickle, so reloading unconditionally could
# replace freshly computed values with an older file's contents.
for _cached_name in ("x_train_st_emb", "x_train_lf_emb",
                     "x_valid_st_emb", "x_valid_lf_emb",
                     "y_train_np", "y_valid_np"):
  if _cached_name not in globals():
    globals()[_cached_name] = load_vects(_cached_name)

In [None]:
# Embedding width is the size of the LAST axis. The longformer array carries
# an extra axis of length 1 per lyric, so len(array[0]) would report 1 for it
# instead of the hidden size.
ST_EMB_SIZE = x_train_st_emb.shape[-1]
LF_EMB_SIZE = x_train_lf_emb.shape[-1]

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.layers import Dense, BatchNormalization
from tensorflow.keras.initializers import Constant
from keras import layers, Input, Model, Sequential

In [None]:
# Training configuration shared by the model-building and fitting cells.
N_EPOCHS = 64
N_DIM = len(x_train_st_emb[0])   # width of one sentence-transformer embedding
OBJ_FUNC = "mse"
OPTIMIZER = "adam"
METRICS = ["mae"]
BATCH_SIZE = 1
HIDDEN_ACTIVATION = 'relu'
# Stop once the first validation metric has not improved for 4 epochs.
# restore_best_weights puts the model back at its best epoch when training is
# stopped; without it the model keeps the weights of the last (non-improving)
# epoch and later predictions are made with those.
EARLY_STOP = EarlyStopping(monitor = f"val_{METRICS[0]}", patience = 4, verbose = 1,
                           restore_best_weights = True)

In [None]:
# Checkpoint callback factory
def set_up_checkpoint(checkpoint_name):
  """Return a ModelCheckpoint writing best-so-far weights to ``checkpoint_name``.

  The monitored quantity is the validation version of the first entry of
  the module-level ``METRICS`` list.
  """
  monitored_metric = f"val_{METRICS[0]}"
  checkpoint_options = dict(monitor = monitored_metric,
                            save_best_only = True,
                            save_weights_only = True,
                            verbose = 1)
  return ModelCheckpoint(checkpoint_name, **checkpoint_options)

In [None]:
# Checkpoint callback for this run; the argument is the path the weights are saved to.
CKPT = set_up_checkpoint("first_run_dense_dense_2022_08_19")

In [None]:
# Training inputs (sentence-transformer embeddings) and targets as tensors.
X = tf.convert_to_tensor(x_train_st_emb)
Y = tf.convert_to_tensor(y_train_np)

In [None]:
# Small feed-forward regressor: embedding -> 20 -> 10 -> one output per target.
# `shape` describes ONE example and Keras adds the batch axis itself, so the
# input is just (N_DIM,). Putting the batch size (and an extra None) into the
# shape declares a 4-D input that the 2-D (n_examples, N_DIM) training data
# does not match.
int_seq_input = Input(shape = (N_DIM,), dtype = "float32")
layer1 = Dense(20, activation = HIDDEN_ACTIVATION)(int_seq_input)
layer2 = Dense(10, activation = HIDDEN_ACTIVATION)(layer1)
# One linear output unit per response column.
out = Dense(Y.shape[-1])(layer2)
model = Model(int_seq_input, out)
model.compile(loss = OBJ_FUNC, optimizer = OPTIMIZER, metrics = METRICS)
model.summary()

Model: "model_52"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_59 (InputLayer)       [(None, None, 1, 768)]    0         
                                                                 
 dense_163 (Dense)           (None, None, 1, 20)       15380     
                                                                 
 dense_164 (Dense)           (None, None, 1, 10)       210       
                                                                 
 dense_165 (Dense)           (None, None, 1, 7)        77        
                                                                 
Total params: 15,667
Trainable params: 15,667
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Fit on the training tensors, scoring each epoch on the validation split;
# the early-stopping and checkpoint callbacks watch the validation metric.
training_callbacks = [EARLY_STOP, CKPT]
validation_pair = (x_valid_st_emb, y_valid_np)
model.fit(x = X,
          y = Y,
          epochs = N_EPOCHS,
          batch_size = BATCH_SIZE,
          validation_data = validation_pair,
          callbacks = training_callbacks)

Epoch 1/64









Epoch 1: val_mae improved from inf to 0.71229, saving model to first_run_dense_dense_2022_08_19
Epoch 2/64
Epoch 2: val_mae did not improve from 0.71229
Epoch 3/64
Epoch 3: val_mae did not improve from 0.71229
Epoch 4/64
Epoch 4: val_mae did not improve from 0.71229
Epoch 5/64
Epoch 5: val_mae improved from 0.71229 to 0.70895, saving model to first_run_dense_dense_2022_08_19
Epoch 6/64
Epoch 6: val_mae improved from 0.70895 to 0.70131, saving model to first_run_dense_dense_2022_08_19
Epoch 7/64
Epoch 7: val_mae improved from 0.70131 to 0.69741, saving model to first_run_dense_dense_2022_08_19
Epoch 8/64
Epoch 8: val_mae improved from 0.69741 to 0.69096, saving model to first_run_dense_dense_2022_08_19
Epoch 9/64
Epoch 9: val_mae did not improve from 0.69096
Epoch 10/64
Epoch 10: val_mae did not improve from 0.69096
Epoch 11/64
Epoch 11: val_mae did not improve from 0.69096
Epoch 12/64
Epoch 12: val_mae did not improve from 0.69096
Epoch 12: early stopping


<keras.callbacks.History at 0x7f1bab4af790>

In [None]:
y_test_np = np.array(y_test)
# Embed the test LYRICS, not the frame itself: iterating a DataFrame yields
# its column labels, so passing x_test directly would embed the column names.
# The text gets the same cleaning as the training/validation lyrics
# (punctuation stripped, lower-cased) without modifying x_test.
# Assumes x_test has a "lyrics" column like x_train / x_valid.
x_test_lyrics = pd.Series(
    strip_punctuation(x_test["lyrics"], for_transformers = True)).str.lower()
x_test_st_emb = get_sentence_transformer_embedding(list(x_test_lyrics))

In [None]:
# Predict the responses for the test embeddings (progress output suppressed).
y_test_hat = model.predict(x_test_st_emb, verbose = 0)

In [None]:
# Show the predicted response vector at index 1.
y_test_hat[1]

array([0.26675, 0.56818, 0.20460, -5.52965, -0.97212, -1.31918, 0.23921],
      dtype=float32)

In [None]:
# Show the observed response vector at index 1 for comparison.
y_test_np[1]

array([0.25800, 0.85300, 0.69800, -4.36607, -1.64506, -3.10776, 0.53100])