In [None]:
!pip install transformers

import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m52.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.0


In [None]:
from google.colab import drive

# Mount Google Drive so the dataset, checkpoint and output file under
# drive/MyDrive are reachable from this runtime.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the song dataset from the mounted Drive folder and preview it.
songs_csv_path = "drive/MyDrive/ColabFiles/spotify_millsongdata.csv"
data = pd.read_csv(songs_csv_path)
data.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [None]:
# Pull out the lyrics column and show how many songs there are.
lyrics = data["text"]
len(lyrics)

57650

In [None]:
# Collapse every run of whitespace (including the "\r\n" line breaks present in
# the raw lyrics) into a single space. Merely deleting "\r" and "\n" would glue
# the last word of one line onto the first word of the next whenever a line has
# no trailing space. str() keeps non-string cells from raising.
lyrics_data = [" ".join(str(song).split()) for song in lyrics]
lyrics_data[0]

"Look at her face, it's a wonderful face  And it means something special to me  Look at the way that she smiles when she sees me  How lucky can one fellow be?    She's just my kind of girl, she makes me feel fine  Who could ever believe that she could be mine?  She's just my kind of girl, without her I'm blue  And if she ever leaves me what could I do, what could I do?    And when we go for a walk in the park  And she holds me and squeezes my hand  We'll go on walking for hours and talking  About all the things that we plan    She's just my kind of girl, she makes me feel fine  Who could ever believe that she could be mine?  She's just my kind of girl, without her I'm blue  And if she ever leaves me what could I do, what could I do?"

In [None]:
# Tokeniser for the uncased BERT base vocabulary; input is lower-cased.
tokeniser = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
def encode(corpus, max_length=128):
    """Tokenise a batch of texts with the notebook-level ``tokeniser``.

    Special tokens are added, and every text is truncated or padded to exactly
    ``max_length`` token ids so the whole batch has one common length.

    Args:
        corpus: Sequence of strings to encode.
        max_length: Length each encoded sequence is truncated/padded to.
            Defaults to 128, the value that used to be hard-coded.

    Returns:
        Tuple ``(input_ids, attention_mask)`` taken from the tokeniser output,
        which is requested as PyTorch tensors (``return_tensors='pt'``).
    """
    encoded = tokeniser.batch_encode_plus(corpus, max_length=max_length,
                                         add_special_tokens=True,
                                         return_attention_mask=True,
                                         truncation=True,
                                         return_tensors='pt',
                                         padding='max_length')

    return encoded['input_ids'], encoded['attention_mask']

# Encode every cleaned lyric in one pass; yields the token ids and the
# matching attention masks.
song_ids, song_masks = encode(corpus=lyrics_data)

In [None]:
# Build a BERT sequence classifier with a 6-way output head; the fine-tuned
# weights are loaded on top of it below.
classifier = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                          num_labels=6,
                                                          output_attentions=False,
                                                          output_hidden_states=False)

# map_location='cpu' lets a checkpoint that was saved from a GPU load on a
# CPU-only runtime; the model itself is never moved off the CPU in this notebook.
# NOTE(review): torch.load unpickles the file, so only load checkpoints you trust.
classifier.load_state_dict(
    torch.load('drive/MyDrive/ColabFiles/finalsmalldict.zip', map_location='cpu')
)

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

<All keys matched successfully>

In [None]:
# Pair each song's token ids with its attention mask and iterate over them in
# their original order, 16 songs per batch, so predictions line up with rows.
tensored = TensorDataset(song_ids, song_masks)
sampler = SequentialSampler(tensored)
dataloader = DataLoader(dataset=tensored, batch_size=16, sampler=sampler)

In [None]:
song_pred = []
emotion_labels = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']

# Make inference mode explicit so dropout stays disabled even if the model was
# switched to training mode in an earlier session.
classifier.eval()

with torch.no_grad():
    # The loop variable is `batch`, not `data`: reusing `data` would overwrite
    # the lyrics DataFrame and break earlier cells when they are re-run.
    for batch in tqdm(dataloader):
        ids, mask = [x.to("cpu") for x in batch]
        output = classifier(input_ids=ids, attention_mask=mask)

        # One row of raw logits per song, one column per emotion label.
        logits = output.logits.cpu().numpy()

        # NOTE(review): a label is kept whenever its raw logit is > 0, so a song
        # can end up with several labels or none — confirm this threshold
        # matches how the checkpoint was trained.
        for song_logits in logits:
            song_pred.append(
                [label for label, score in zip(emotion_labels, song_logits) if score > 0]
            )

# Preview only the first few predictions instead of dumping the whole list.
song_pred[:10]

  0%|          | 0/3604 [00:00<?, ?it/s]

[['joy'],
 ['love'],
 ['sadness'],
 ['love'],
 ['love'],
 ['anger'],
 ['sadness'],
 ['sadness', 'joy', 'love', 'fear'],
 ['surprise'],
 ['anger'],
 ['sadness', 'love', 'fear'],
 ['love', 'anger'],
 ['sadness'],
 ['surprise'],
 ['joy'],
 ['sadness'],
 ['surprise'],
 ['love'],
 ['joy', 'love', 'anger'],
 ['joy', 'love', 'anger', 'surprise'],
 ['sadness'],
 ['surprise'],
 ['sadness'],
 ['sadness', 'love'],
 ['love'],
 ['anger'],
 ['joy'],
 ['sadness'],
 ['sadness'],
 ['joy', 'anger', 'fear'],
 ['love', 'fear'],
 ['sadness'],
 ['sadness'],
 ['fear', 'surprise'],
 ['surprise'],
 ['surprise'],
 ['sadness'],
 ['fear', 'surprise'],
 ['surprise'],
 ['sadness', 'love'],
 ['sadness'],
 ['sadness'],
 ['fear'],
 ['love', 'anger'],
 ['surprise'],
 ['joy', 'surprise'],
 ['love'],
 ['surprise'],
 ['love'],
 ['joy'],
 ['love', 'anger'],
 ['joy'],
 ['anger'],
 ['love'],
 ['sadness'],
 ['love'],
 ['sadness', 'love', 'anger'],
 ['sadness', 'anger'],
 ['anger'],
 ['surprise'],
 ['sadness'],
 ['love'],
 ['s

In [None]:
# Write one predicted label list per line. The context manager closes the file
# even if a write fails part-way through.
with open("drive/MyDrive/ColabFiles/song_labels.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{line}\n" for line in song_pred)