## The NLP part

In [1]:
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import re
import itertools
from collections import Counter

from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

from tqdm.auto import tqdm, trange

# from torch.utils.tensorboard import SummaryWriter
# import torch.nn.functional as F

# for evaluation
from sklearn.metrics import mean_squared_error, mean_absolute_error


In [2]:
# set random state
rs = 42

In [3]:
# import nltk
# nltk.download('stopwords')

In [21]:
class Corpus:
    def __init__(self):
        self.tokenizer = RegexpTokenizer(r'\w+')

        self.word_to_index = {} # word to unique-id
        self.index_to_word = {} # unique-id to word

        self.word_counts = Counter()

        self.labels = []
        self.output = []

    def tokenize(self, text):
        return self.tokenizer.tokenize(text)
        
    def load_data(self, file_name, min_token_freq):

        # Step 1: Read the file
        print('Reading data and tokenizing')

        raw_data = pd.read_csv(file_name)
#         raw_data = raw_data[raw_data['lyrics'].notna()]
        raw_x = raw_data['lyrics'].to_list()
        raw_labels = raw_data['data.album.tracks.items.track.playcount'].to_list()
        self.labels = np.log(raw_labels)
        
        all_tokens = []
        cachedStopWords = stopwords.words('english')
        
        for track in raw_x:
            
            track_re = re.sub("[\(\[].*?[\)\]]", "", track)
            token = self.tokenize(track_re)
            token_without_sw = [word for word in token if not word in cachedStopWords]
            token_without_sw_l = [word.lower() for word in token_without_sw]
            all_tokens.append(token_without_sw_l)
            
    
        # Step 2: Count how many tokens we have of each type
        print('Counting token frequencies')
        freqs = Counter(list(itertools.chain.from_iterable(all_tokens)))
        
        # Step 3: Remove low occurrence
#         Replace all tokens below the specified frequency with an <UNK> and then remove them

        print("Performing minimum thresholding")

        for track in all_tokens:
            for index in range(len(track)):
                if freqs[track[index]] < min_token_freq:
                    track[index] = '<UNK>'

                
        for track in all_tokens:
            t = []
            for word in track:
                if word != '<UNK>':
                    t.append(word)
                
            self.output.append(t)
    
        # Step 4: update self.word_counts to be the number of times each word

        self.word_counts = Counter(list(itertools.chain.from_iterable(self.output)))
        
        # Step 5: Create the mappings from word to unique integer ID and the
        # reverse mapping. 

        n = 0
        for i in self.word_counts.keys():
            self.word_to_index[i] = n
            self.index_to_word[n] = i

            n += 1
        
        # Helpful print statement to verify what you've loaded
        print('Loaded all data from %s; saw %d usable tracks (%d unique words)' \
              % (file_name, len(corpus.output),
                 len(self.word_to_index)))
        
    def get_prediction_tensor(self, lyric):
        cachedStopWords = stopwords.words('english')
        # cachedStopWords = []
        lyric_re = re.sub("[\(\[].*?[\)\]]", "", lyric)
        t = self.tokenize(lyric_re)
        t_without_sw = [word for word in t if not word in cachedStopWords]
        t_without_sw_l = [word.lower() for word in t_without_sw]
        
        tra_dict = Counter(t_without_sw_l)
        
        r = []
        c = []
        d = []
        for word in tra_dict.keys():
            try:
                c.append(self.word_to_index[word])
                d.append(float(tra_dict[word]))
                r.append(0)
            except:
                continue
        i = [r, c]
        s = torch.sparse_coo_tensor(i, d, (1, len(corpus.word_counts)))
        return s.to_dense()
        

In [22]:
csv_name = '20220413_all.csv'
corpus = Corpus()
# corpus.load_data('20220311_training.csv', 5)
# corpus.load_data('train_0412_oldest.csv', 5)
corpus.load_data(csv_name, 5)

Reading data and tokenizing
Counting token frequencies
Performing minimum thresholding
Loaded all data from 20220413_all.csv; saw 3343 usable tracks (7212 unique words)


In [44]:
# build input data
tracks = corpus.output
row = []
col = []
data = []

row_num = 0
for track in tracks:
    track_dict = Counter(track)
    for word in track_dict.keys():
        col.append(corpus.word_to_index[word])
        # frequency
        # data.append(float(track_dict[word]))
        # TD-IDF
        data.append(float(track_dict[word])/sum(track_dict.values()))
        row.append(row_num)
    row_num += 1
    
i = [row, col]

s_input = torch.sparse_coo_tensor(i, data, (len(tracks), len(corpus.word_counts))).to_dense()
labels = torch.FloatTensor(corpus.labels)

In [45]:
sum(Counter(tracks[0]).values())

184

In [46]:
# combine numerical
raw_data = pd.read_csv(csv_name)

numeric = raw_data[['danceability', 'energy', 'key', 'loudness', 'speechiness',
          'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']].to_numpy()

t = torch.from_numpy(numeric)
num_only_input = t.float()
s_input_new = torch.cat((s_input, t), 1).float()

In [47]:
s_input.size(dim=1)

7212

In [48]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(s_input, # s_input_new for num+text, s_input for text only, num_only_input for numeric only
                                                    labels,
                                                    test_size=0.2,
                                                    random_state=rs)

In [49]:
X_train = X_train.numpy()
y_train = y_train.numpy()
X_test = X_test.numpy()
y_test = y_test.numpy()

### Baseline 1: Linear Regression (sklearn)

In [50]:
y_train

array([18.16285 , 16.409067, 17.97158 , ..., 19.799917, 18.86811 ,
       18.08887 ], dtype=float32)

In [51]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV

In [52]:
# model_lr =  LinearRegression(positive=True)
model_lr =  LinearRegression(positive=False) # for numeric only
                                    # criterion='squared_error')

# para_lr = {
#             'n_estimators': [200, 250, 300, 350, 400],
#             'max_depth': [10, 20, 30, 40, 50]
#             }
# g_lr = GridSearchCV(model_lr, para_lr, cv=5, n_jobs=-1, verbose=1)
# g_lr.fit(X_train, y_train)
# model_lr_f = g_lr.best_estimator_
model_lr.fit(X_train, y_train)
model_lr

LinearRegression()

In [53]:
# evaluation

def evaluation_sk(X_test, y_test, model):
#     test_data = pd.read_csv(test_file)
#     labels = np.log(test_data['data.album.tracks.items.track.playcount'].to_list())
#     lyrics = test_data['lyrics'].to_list()
    
#     pred_ys = []
#     for lyric in lyrics:
#         pred_y = model.predict(corpus.get_prediction_tensor(lyric).numpy())[0].tolist()
#         pred_ys.append(pred_y)
    pred_y = model.predict(X_test)
    
    return [mean_absolute_error(y_test, pred_y), mean_squared_error(y_test, pred_y)]

In [54]:
evaluation_sk(X_test, y_test, model_lr)

[1288.4697, 3855371.5]

In [55]:
# model_lr.predict(X_test)

### Baseline 2: random forest

In [56]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [57]:
# model_rf =  RandomForestRegressor(
#                                     random_state=rs, 
# #                                     criterion='squared_error',
#                         #             max_depth=None, 
#                                     min_samples_split=2)

# para_rf = {
#             'n_estimators': [200, 250, 300, 350, 400],
#             'max_depth': [10, 20, 30, 40, 50]
#             }
# g_rf = GridSearchCV(model_rf, para_rf, cv=5, n_jobs=-1, verbose=1)
# g_rf.fit(X_train, y_train)
# model_rf_f = g_rf.best_estimator_

In [58]:
# model_rf_f

In [59]:
model_rf_f = RandomForestRegressor(
                                    random_state=rs, 
                                    min_samples_split=2,
                                    n_estimators = 350,
                                    max_depth = 30
)
model_rf_f.fit(X_train, y_train)

RandomForestRegressor(max_depth=30, n_estimators=350, random_state=42)

In [60]:
evaluation_sk(X_test, y_test, model_rf_f)

[1.2516953080814168, 2.647404537225213]

### Baseline 3: random dummy

In [61]:
from sklearn.dummy import DummyRegressor

In [62]:
model_dummy = DummyRegressor(strategy="mean")
model_dummy.fit(X_train, y_train)

DummyRegressor()

In [63]:
evaluation_sk(X_test, y_test, model_dummy)

[1.5806782, 3.8248508]

### Model 1: SVR

In [64]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

In [120]:
# model_svm = SVR(kernel='rbf')

# para_svm = {
#             'C' : [0.1, 0.5, 1, 1.5, 2],
#             'epsilon': [0.001, 0.005, 0.01, 0.05, 0.1]
#             }
# g_svm = GridSearchCV(model_svm, para_svm, cv=5, n_jobs=-1, verbose=1)
# g_svm.fit(X_train, y_train)
# model_svm_f = g_svm.best_estimator_

Fitting 5 folds for each of 25 candidates, totalling 125 fits


In [65]:
model_svm_f = SVR(kernel='rbf', C=1, epsilon=0.01)
model_svm_f.fit(X_train, y_train)

SVR(C=1, epsilon=0.01)

In [66]:
evaluation_sk(X_test, y_test, model_svm_f)

[1.318409096253149, 2.9159290886651443]

### Model 2: BERT miniLM

In [3]:
import transformers
import pandas as pd
import numpy as np
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict, ClassLabel
from sklearn.metrics import mean_absolute_error, mean_squared_error
import random

In [4]:
from torch import nn
import torch
from transformers import Trainer

In [5]:
torch.cuda.is_available()

True

In [6]:
device = torch.device("cuda:0")

In [7]:
# train_data = pd.read_csv('train_0412_oldest.csv')
# eval_data = pd.read_csv('20220310_testing.csv')

In [8]:
raw_data = pd.read_csv('20220413_all.csv')

In [9]:
def load_data_to_dataset(df, load_lyrics = True, load_numeric = False, is_pred = False):
       
    if load_numeric == True:
        def n2t(danceability, energy, key, loudness, speechiness,
                acousticness, instrumentalness, liveness, valence, tempo):
            out = f"The danceability is {str(danceability)}. [SEP] The energy is {str(energy)}. [SEP] The key is {str(key)}. [SEP] The loudness is {str(loudness)}. [SEP] The speechiness is {str(speechiness)}. [SEP] The acousticness is {str(acousticness)}. [SEP] The instrumentalness is {str(instrumentalness)}. [SEP] The liveness is {str(liveness)}. [SEP] The valence is {str(valence)} [SEP] The tempo is {str(tempo)}."
            return out

        df['num2text'] = df.apply(lambda x: n2t(x.danceability, 
                                                    x.energy, 
                                                    x.key, 
                                                    x.loudness, 
                                                    x.speechiness, 
                                                    x.acousticness, 
                                                    x.instrumentalness, 
                                                    x.liveness, 
                                                    x.valence, 
                                                    x.tempo), axis=1)
        
        if load_lyrics == True:
            df['text'] = df[['num2text', 'lyrics']].agg(' [SEP] '.join, axis=1)
        else:
            df['text'] = df['num2text']
    else:
        if load_lyrics == True:
            df['text'] = df['lyrics']
        else:
            print("At least one of load_lyrics or load_numeric must be true.")
            exit()
    
    if is_pred == True:
        data = df['text']
        data['text'] = df['text'].str.replace(r'[\(\[].*?[\)\]]', '')
#         data = data.rename(columns={'lyrics': 'text'})
        output = data
        
    else:
        data = df[['text','data.album.tracks.items.track.playcount']]
        data['text'] = df['text'].str.replace(r'[\(\[].*?[\)\]]', '')
#         data = data.rename(columns={'lyrics': 'text'})
        data['labels'] = np.log(data['data.album.tracks.items.track.playcount'])
    
        output = Dataset.from_pandas(data, preserve_index=False)
    

    
    return output

In [10]:
model_ckpt = 'microsoft/MiniLM-L12-H384-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

def tokenize_text(examples):
    return tokenizer(examples['text'], truncation=True, max_length=512)

In [11]:
# train_ds = load_data_to_dataset(train_data, is_pred = False)
# train_ds = train_ds.map(tokenize_text, batched=True)

# eval_ds = load_data_to_dataset(eval_data, is_pred = False)
# eval_ds = eval_ds.map(tokenize_text, batched=True)

In [24]:
dataset = load_data_to_dataset(raw_data, load_lyrics = True, load_numeric = True, is_pred = False)
dataset = dataset.map(tokenize_text, batched=True)

dataset_dict = dataset.train_test_split(test_size=0.1)
train_ds = dataset_dict['train']
eval_ds = dataset_dict['test']

  data['text'] = df['text'].str.replace(r'[\(\[].*?[\)\]]', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['text'] = df['text'].str.replace(r'[\(\[].*?[\)\]]', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['labels'] = np.log(data['data.album.tracks.items.track.playcount'])


  0%|          | 0/4 [00:00<?, ?ba/s]

In [25]:
dataset_dict['test']['text'][1]

"The danceability is 0.921.  The energy is 0.472.  The key is 10.  The loudness is -7.88.  The speechiness is 0.43.  The acousticness is 0.164.  The instrumentalness is 0.432.  The liveness is 0.156.  The valence is 0.198  The tempo is 139.997.   Lot of shots  Lot of shots  30s on Glocks  30 shots  Lot of shots  Lot of shots  30s on Glocks  Lot of shots  30s on Glocks  Put you on Fox  Nigga we ain't goin'  Lot of shots  30s on Glocks  30s on Glocks If Young Metro don't trust you, I'm gon' shoot you  Lot of shots, hold up 30s on Glocks, hold up Put you on Fox, hold up Put 'em in a casket, yeah Pull up on a Banshee, yeah Nigga, fuck your handshake, yeah Feel like I'm the last real rapper 'cause these niggas weird Nah, these niggas queers Sippin' Act, Cheers In my own lane, nigga, watch where you steer Think I got 'em scared, shot 'em in the beard That's a chin check, I'm certified everywhere Man, I'm certified for real, nigga Nah, for real, nigga Used to record right on deal, nigga Nah, 

In [26]:
from transformers import AutoModelForSequenceClassification as AMFSC
model = AMFSC.from_pretrained(model_ckpt, num_labels=1)

loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/conanwu/.cache/huggingface/transformers/ceb753d3f27a8c0d09184f35884666cda91b8ae610cd2a54d89793ac7663f1f9.13815020fd994b27db9974c0ce0ec4c47dfac6c8f11bf1a35a0a06d5b165665a
Model config BertConfig {
  "_name_or_path": "microsoft/MiniLM-L12-H384-uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://

In [27]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions
    mae = mean_absolute_error(labels, preds)
    mse = mean_squared_error(labels, preds)
    return {'mae': mae, 'mse':mse}

In [28]:
from transformers import TrainingArguments
batch_size = 32

logging_steps = len(train_ds) // batch_size
output_dir = 'minln-finetuned-regression'
training_args = TrainingArguments(output_dir=output_dir,
                                  num_train_epochs=5,
                                  learning_rate=5e-4,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  evaluation_strategy='epoch',
                                  logging_steps=logging_steps,
                                  push_to_hub=False,
                                  fp16=True
                                 )

trainer = Trainer(model = model, 
                  args = training_args, 
                  train_dataset = train_ds, 
                  eval_dataset = eval_ds,
                  compute_metrics = compute_metrics,
                  tokenizer = tokenizer)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using amp half precision backend


In [29]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, data.album.tracks.items.track.playcount. If text, data.album.tracks.items.track.playcount are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 3008
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 470


Epoch,Training Loss,Validation Loss,Mae,Mse
1,68.9555,3.635123,1.536411,3.635123
2,4.002,3.557034,1.516624,3.557034
3,4.0294,3.579984,1.522502,3.579984
4,4.0933,3.59084,1.525097,3.590839
5,4.0335,3.567364,1.519533,3.567364


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, data.album.tracks.items.track.playcount. If text, data.album.tracks.items.track.playcount are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 335
  Batch size = 32
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, data.album.tracks.items.track.playcount. If text, data.album.tracks.items.track.playcount are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 335
  Batch size = 32
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, data.album.tracks.items.t

TrainOutput(global_step=470, training_loss=17.02275910073138, metrics={'train_runtime': 131.5059, 'train_samples_per_second': 114.367, 'train_steps_per_second': 3.574, 'total_flos': 990708072775680.0, 'train_loss': 17.02275910073138, 'epoch': 5.0})