In [1]:
! pip install transformers



In [2]:
import pandas as pd
import pyarrow
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer

In [3]:
# Setting up the device for GPU usage

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
df = pd.read_feather('/content/drive/My Drive/MSBA/Unstructured/unstructured-notes/text-classification/all_lyrics_11_24.feather')
df = df.dropna(subset=['genre'])

In [8]:
df['genre'] = pd.factorize(df['genre'])[0]

df.head()

Unnamed: 0,artist,song,lyrics,searched_song,returned_song,searched_artist,returned_artist,lyric_link,original_link,week,genre
0,The Black Keys,Tighten Up,"[Whistling][Verse 1]I wanted love, I needed lo...",Tighten Up,Tighten Up,The Black Keys,The Black Keys,https://genius.com/The-black-keys-tighten-up-l...,https://www.billboard.com/charts/rock-songs/20...,2011-01-01,0
1,Linkin Park,Waiting For The End,"[Intro: Mike Shinoda]Yeah, yoThis is not the e...",Waiting For The End,Waiting for the End,Linkin Park,Linkin Park,https://genius.com/Linkin-park-waiting-for-the...,https://www.billboard.com/charts/rock-songs/20...,2011-01-01,0
2,Three Days Grace,World So Cold,[Verse 1]I never thought I'd feel thisGuilty a...,World So Cold,World So Cold,Three Days Grace,Three Days Grace,https://genius.com/Three-days-grace-world-so-c...,https://www.billboard.com/charts/rock-songs/20...,2011-01-01,0
3,Stone Sour,Say You'll Haunt Me,[Verse 1]Little supernovas in my headLittle so...,Say You'll Haunt Me,Say You’ll Haunt Me,Stone Sour,Stone Sour,https://genius.com/Stone-sour-say-youll-haunt-...,https://www.billboard.com/charts/rock-songs/20...,2011-01-01,0
4,Neon Trees,Animal,[Verse 1]Here we go againI kinda wanna be more...,Animal,Animal,Neon Trees,Neon Trees,https://genius.com/Neon-trees-animal-lyrics,https://www.billboard.com/charts/rock-songs/20...,2011-01-01,0


In [9]:
df['genre'].unique()

array([0, 1, 2, 3, 4, 5, 6])

In [10]:
df['genre'].value_counts()

Unnamed: 0_level_0,count
genre,Unnamed: 1_level_1
0,4616
5,4223
6,3774
3,3260
4,3099
1,2519
2,2486


In [11]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 512
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 10
LEARNING_RATE = 1e-05
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

In [None]:
class Triage(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        lyrics = str(self.data.lyrics[index])
        lyrics = " ".join(lyrics.split())
        inputs = self.tokenizer.encode_plus(
            lyrics,
            None,
            add_special_tokens = True,
            max_length = self.max_len,
            padding = 'max_length',
            return_token_type_ids = True,
            truncation = True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype = torch.long),
            'mask': torch.tensor(mask, dtype = torch.long),
            'targets': torch.tensor(self.data.genre[index], dtype = torch.long)
        }

    def __len__(self):
        return self.len

In [None]:
train_size = 0.8
train_dataset = df.sample(frac = train_size, random_state = 200)
test_dataset = df.drop(train_dataset.index).reset_index(drop = True)
train_dataset = train_dataset.reset_index(drop = True)


print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = Triage(train_dataset, tokenizer, MAX_LEN)
testing_set = Triage(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (23977, 11)
TRAIN Dataset: (19182, 11)
TEST Dataset: (4795, 11)


In [None]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
# Creating the customized model, by adding a drop out and a dense layer on
# top of distillBert to get the final output for the model.

class DistillBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistillBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.2)
        self.classifier = torch.nn.Linear(768, 7)

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids = input_ids, attention_mask = attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [None]:
model = DistillBERTClass()
model.to(device)

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

DistillBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(

In [None]:
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [None]:
# Function to calcuate the accuracy of the model

def calcuate_accu(big_idx, targets):
    n_correct = (big_idx==targets).sum().item()
    return n_correct

In [None]:
def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accu(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)

        if _%5000==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return

In [None]:
for epoch in range(EPOCHS):
    train(epoch)

Training Loss per 5000 steps: 1.934432029724121
Training Accuracy per 5000 steps: 25.0
The Total Accuracy for Epoch 0: 49.14503180064644
Training Loss Epoch: 1.28545648650615
Training Accuracy Epoch: 49.14503180064644
Training Loss per 5000 steps: 0.9686891436576843
Training Accuracy per 5000 steps: 62.5
The Total Accuracy for Epoch 1: 70.79032426232926
Training Loss Epoch: 0.7024176742437236
Training Accuracy Epoch: 70.79032426232926
Training Loss per 5000 steps: 0.7963147163391113
Training Accuracy per 5000 steps: 62.5
The Total Accuracy for Epoch 2: 76.30069857157753
Training Loss Epoch: 0.5498104779596585
Training Accuracy Epoch: 76.30069857157753
Training Loss per 5000 steps: 0.5005574822425842
Training Accuracy per 5000 steps: 75.0
The Total Accuracy for Epoch 3: 79.0011469085601
Training Loss Epoch: 0.4670353876760818
Training Accuracy Epoch: 79.0011469085601
Training Loss per 5000 steps: 0.4539134204387665
Training Accuracy per 5000 steps: 75.0
The Total Accuracy for Epoch 4: 8

In [None]:
def valid(model, testing_loader):
    model.eval()
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    n_wrong = 0
    total = 0
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            outputs = model(ids, mask).squeeze()
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += calcuate_accu(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples+=targets.size(0)

            if _%5000==0:
                loss_step = tr_loss/nb_tr_steps
                accu_step = (n_correct*100)/nb_tr_examples
                print(f"Validation Loss per 100 steps: {loss_step}")
                print(f"Validation Accuracy per 100 steps: {accu_step}")
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")

    return epoch_accu

In [None]:
acc = valid(model, testing_loader)
print("Accuracy on test data = %0.2f%%" % acc)

Validation Loss per 100 steps: 0.24444274604320526
Validation Accuracy per 100 steps: 100.0
Validation Loss Epoch: 0.4439601403557862
Validation Accuracy Epoch: 79.66631908237747
Accuracy on test data = 79.67%


In [None]:
torch.save(model, '/content/drive/MyDrive/genre_model.pth')

In [None]:
model = torch.load('/content/drive/MyDrive/genre_model.pth')
model.eval()

  model = torch.load('/content/drive/MyDrive/genre_model.pth')


DistillBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(

In [None]:
genre_label = {0: "adult-pop",
                 1: "christian",
                 2: "country",
                 3: "hot-alternative",
                 4: "hot-hard-rock",
                 5: "pop",
                 6: "rock"}

def predict(model, testing_loader):
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            outputs = model(ids, mask)

    return outputs

In [None]:
ex_text_str = '''
Five, six
Five, six, seven, eight
I could be the one, or your new addiction
It's all in my head but I want non-fiction
I don't want the world, but I'll take this city
Who can blame a girl? Call me hot, not pretty
Baby, do you like this beat? (Na-na-na-na, na)
I made it so you'd dance with me (na-na-na-na, na)
It's like a hundred 99 degrees (na-na-na-na, na)
When you're doing it with me, doing it with me
H-O-T-T-O-G-O
Snap and clap and touch your toes
Raise your hands, now body roll
Dance it out, you're hot to go
H-O-T-T-O-G-O
Snap and clap and touch your toes
Raise your hands, now body roll
H-O-T-T-O-G-O
H-O-T-T-O-G-O
You can take me hot to go
H-O-T-T-O-G-O
You can take me hot to go
Well, I woke up alone staring at my ceiling
I try not to care but it hurts my feelings
You don't have to stare, come here, get with it
No one's touched me there in a damn hot minute
And baby, don't you like this beat? (Na-na, na-na-na)
I made it so you'd sleep with me (na-na, na-na-na)
It's like a hundred 99 degrees (na-na, na-na-na)
When you're doing it with me, doing it with me
H-O-T-T-O-G-O
Snap and clap and touch your toes
Raise your hands, now body roll
Dance it out, you're hot to go
H-O-T-T-O-G-O
Snap and clap and touch your toes
Raise your hands, now body roll
H-O-T-T-O-G-O
H-O-T-T-O-G-O
You can take me hot to go
H-O-T-T-O-G-O
You can take me hot to go
What's it take to get your number? What's it take to bring you home?
Hurry up, it's time for supper, order up, I'm hot to go
What's it take to get your number? Hurry up, it's getting cold
Hurry up, it's time for supper, order up, I'm hot to go
H-O-T-T-O-G-O
You can take me hot to go (oh, yeah)
H-O-T-T-O-G-O
You can take me hot to go (hot to go)
H-O-T-T-O-G-O
You can take me hot to go (oh, yeah)
H-O-T-T-O-G-O
You can take me hot to go
Whew, it's hot here, is anyone else hot?
Whoo, you coming home with me?
Okay, it's hot, I'll call the cab
'''

new_data = pd.DataFrame(data = {'lyrics': [ex_text_str], 'genre': 2})

new_set = Triage(new_data, tokenizer, MAX_LEN)

In [None]:
new_loader = DataLoader(new_set, **test_params)

model = model.to(device)

In [None]:
new_loader.dataset[0]

{'ids': tensor([  101,  4222,   117,  1565,  4222,   117,  1565,   117,  1978,   117,
          2022,   146,  1180,  1129,  1103,  1141,   117,  1137,  1240,  1207,
         15658,  1135,   112,   188,  1155,  1107,  1139,  1246,  1133,   146,
          1328,  1664,   118,  4211,   146,  1274,   112,   189,  1328,  1103,
          1362,   117,  1133,   146,   112,  1325,  1321,  1142,  1331,  2627,
          1169,  7338,   170,  1873,   136,  7268,  1143,  2633,   117,  1136,
          2785,  6008,   117,  1202,  1128,  1176,  1142,  3222,   136,   113,
         11896,   118,  9468,   118,  9468,   118,  9468,   117,  9468,   114,
           146,  1189,  1122,  1177,  1128,   112,   173,  2842,  1114,  1143,
           113,  9468,   118,  9468,   118,  9468,   118,  9468,   117,  9468,
           114,  1135,   112,   188,  1176,   170,  2937,  4850,  4842,   113,
          9468,   118,  9468,   118,  9468,   118,  9468,   117,  9468,   114,
          1332,  1128,   112,  1231,  1833,  

In [None]:
prediction = predict(model, new_loader)

In [None]:
prediction.argmax().item()

3

In [None]:
print("This is a %s song" %genre_label[prediction.argmax().item()])

This is a hot-alternative song
