In [14]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# project data stored there can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
%%capture
!pip install sentence-transformers langdetect lyricsgenius

In [16]:
# grab basic libraries
import shutil
from pathlib import Path
import pickle
import json
import csv
import time
import lyricsgenius as lg
import re
import numpy as np
import pandas as pd

#import credential
import requests

In [17]:
# list the contents of the project folder on the mounted drive
!ls ./drive/MyDrive/team-spot-a-mood/
# NOTE: the path above is relative, so it depends on the working directory (check with Path.cwd())

analysis	   garage_model_train_finetune	  preprocessing.ipynb
app.py		   get_embeddings.ipynb		  README.md
assets		   get_lyrics.ipynb		  requirements.txt
charts.csv	   get_ranked_songs_lyrics.ipynb  songsdata
garage_embeddings  pickle_objects


In [18]:
# Build the location of the pickled lyrics/annotation pairs inside the mounted
# project folder. Everything is resolved relative to the current working directory.
project_dir = Path.cwd() / 'drive' / 'MyDrive' / 'team-spot-a-mood'
pickle_dir = project_dir / 'pickle_objects'
lyrics_annotation_pickle = pickle_dir / 'lyrics_annotation_pairs.obj'
lyrics_annotation_pickle

PosixPath('/content/drive/MyDrive/team-spot-a-mood/pickle_objects/lyrics_annotation_pairs.obj')

In [19]:
# Load the pickled dataset from Drive; it is a list object.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.
lyr_ann_dataset = pd.read_pickle(lyrics_annotation_pickle)


In [20]:
# The first element of the list holds a sublist of (lyrics, annotation) tuples
# used for fine-tuning; display the first pair as a quick sanity check.
lyr_ann_dataset[0][0]

('With auburn hair and tawny eyes',
 'Both ‘auburn’ and ‘tawny’ mean reddish-orange hair or eyes respectively. From the dancers in the music video, you can gain a clearer image of the woman.')

### Grab Sentence Transformers Model

In [21]:
from sentence_transformers import SentenceTransformer, models

# The sentence encoder is assembled from two modules that are chained together.

# Module 1: an existing pretrained language model that yields token embeddings.
word_embedding_model = models.Transformer('sentence-transformers/all-distilroberta-v1')

# Module 2: a pooling layer over the token embeddings, sized to the
# embedding dimension reported by module 1.
embedding_dim = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(embedding_dim)

# Chain module 1 -> module 2 into a single SentenceTransformer.
encoder_modules = [word_embedding_model, pooling_model]
model = SentenceTransformer(modules=encoder_modules)

### Prepare Dataset for Training a Sentence Transformers Model

In [22]:
from sentence_transformers import InputExample

# The first element of the loaded dataset holds the (lyrics, annotation) pairs.
train_data = lyr_ann_dataset[0]
n_examples = len(train_data)

# Wrap every pair in an InputExample, dropping pairs where either the lyrics
# or the annotation is empty or whitespace-only.
train_examples = [
    InputExample(texts=[lyrics, annotation])
    for lyrics, annotation in train_data
    if lyrics.strip() and annotation.strip()
]

In [23]:
# Sanity check: report the container type, how many training examples were
# kept, and the type of the individual examples.
print(f"We have a {type(train_examples)} of length {len(train_examples)} containing {type(train_examples[0])}'s.")

We have a <class 'list'> of length 38684 containing <class 'sentence_transformers.readers.InputExample.InputExample'>'s.


### Wrap Training Dataset into a PyTorch `DataLoader`

In [24]:
from torch.utils.data import DataLoader

# Serve the training examples in shuffled mini-batches of 16.
train_dataloader = DataLoader(
    dataset=train_examples,
    batch_size=16,
    shuffle=True,
)

### Get Loss Function for Training a Sentence Transformers Model

In [25]:
from sentence_transformers import losses

# Loss object used for fine-tuning, bound to the model being trained.
# NOTE(review): MultipleNegativesRankingLoss presumably uses the other pairs in
# a batch as negatives, so the batch size would affect training — confirm
# against the sentence-transformers documentation.
train_loss = losses.MultipleNegativesRankingLoss(model=model)

### Train the Model

In [26]:
# Training schedule: number of passes over the data, and a warm-up period that
# spans the first 10% of all training steps (batches per epoch x epochs).
num_epochs = 10
total_training_steps = len(train_dataloader) * num_epochs
warmup_steps = int(total_training_steps * 0.1)

In [27]:
# Fine-tune the model: one (dataloader, loss) objective, trained for
# `num_epochs` epochs with the warm-up length computed above.
train_objectives = [(train_dataloader, train_loss)]
model.fit(
    train_objectives=train_objectives,
    epochs=num_epochs,
    warmup_steps=warmup_steps,
)

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2418 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2418 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2418 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2418 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2418 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2418 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2418 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2418 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2418 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2418 [00:00<?, ?it/s]

In [28]:
# Persist the fine-tuned model to Google Drive, in an 'ft_model' folder under
# the pickle_objects directory.
ft_model_dir = pickle_dir / 'ft_model'
model_save_path = str(ft_model_dir)
model.save(model_save_path)

In [29]:
# display the path the fine-tuned model is saved to
model_save_path

'/content/drive/MyDrive/team-spot-a-mood/pickle_objects/ft_model'