## *Neural-A&R*: Hit Song Prediction using Social Network + High Level Acoustic Data

This Jupyter Notebook was written by Trevor Hubbard and Phillip Sifferlin. It performs Hit Song Prediction using two data sources: 

- [High-level acoustic data](https://developer.spotify.com/documentation/web-api/reference/tracks/get-audio-features/) from Spotify
- Vector-embeddings of a music industry social network 

More information about the project can be found [in this GitHub repo.](https://github.com/tmhubbard/CS-682-Hit-Song-Science-Project) 

In [1]:
# Some import statements
import torch, json, random, time
import numpy as np
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data.sampler import SubsetRandomSampler
from sklearn.preprocessing import StandardScaler

# Setting up Pytorch's use of CUDA: run on the GPU when one is available, and
# fall back to the CPU otherwise. To force a particular device, replace the
# expression below with "cuda" or "cpu".
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Creating the SongDataset Class
In the following cell, we're defining SongDataset; this class extends Pytorch's Dataset class, and will be used to more easily load the data from the respective .tsv's / .json's for the neural networks. 

The SongDataset class will ensure that the hits / non-hits are balanced; we've got around 200,000 songs in the dataset (~5,000 of which are hits), and the SongDataset constructor will randomly select an equal number of hits and non-hits. 

In [2]:
# This class loads the song data (network embeddings + Spotify audio features)
# and exposes it as a Pytorch Dataset with a balanced number of hits / non-hits.
class SongDataset(torch.utils.data.Dataset):
    """Balanced hit / non-hit song dataset.

    Each item is a tuple ``(songID, embedding, audio_features, hit)`` where
    ``embedding`` and ``audio_features`` are float32 tensors and ``hit`` is a
    0-dim float32 tensor holding the song's "hit" value.

    Only songs that have BOTH an embedding (from the .tsv) and audio features
    (from the .json) are kept. Of those, every hit is kept, and non-hits are
    randomly sampled (via ``np.random.shuffle``) until there are as many
    non-hits as hits.
    """

    # Names of the Spotify audio features, in the order they are packed into
    # each song's "audio features" tensor.
    AUDIO_FEATURES = ('duration_ms', 'key', 'mode', 'time_signature', 'acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'valence', 'tempo')

    # The init method defines how we'll input the data;
    def __init__(self, embeddingTsvPath, audioJsonPath, device=None):
        """Read both data files and build the balanced song list.

        Args:
            embeddingTsvPath: path to a tab-separated file whose header row
                contains a "songID" and a "hit" column; every column after
                the "hit" column is read as one dimension of the embedding.
            audioJsonPath: path to a JSON file with a top-level "songs" list;
                each entry needs "title" (whose element [1] is the song ID),
                "hit" and "audio_features".
            device: device the tensors are created on. Defaults to "cuda"
                when CUDA is available and "cpu" otherwise.
        """
        super().__init__()

        if (device is None):
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)

        # This dictionary will hold all of the data
        self.songDict = {}

        curTime = time.time()
        print("\nReading the network embeddings from the .tsv...")
        self._readEmbeddings(embeddingTsvPath)
        print("Finished reading in the embeddings! It took %.3f seconds" % (time.time()-curTime))

        curTime = time.time()
        print("\nReading in the Spotify data from the .json...")
        self._readAudioFeatures(audioJsonPath)
        print("Finished reading in the .json! It took %.3f seconds" % (time.time()-curTime))

        # Remove any songs that don't have both an embedding and audio features
        curTime = time.time()
        print("\nRemoving songs without both an embedding and audio features...")
        hitCount = self._removeIncompleteSongs()
        print("Finished removing the songs! It took %.3f seconds." % (time.time()-curTime))

        # Creating the songList (a balanced, shuffled list version of the songDict)
        self.songList = self._buildBalancedList(hitCount)

    # Creates an empty songDict entry for a song we haven't seen yet
    def _addSong(self, songID, hit):
        if (songID not in self.songDict):
            self.songDict[songID] = {"hit": hit, "embedding": None, "audio features": None}

    # Reads the embedding .tsv and stores each song's embedding in songDict
    def _readEmbeddings(self, embeddingTsvPath):
        hitCol = 0
        idCol = 0
        with open(embeddingTsvPath, "r", encoding="utf-8") as tsvFile:
            for lineNum, line in enumerate(tsvFile):
                splitLine = line.strip().split("\t")

                # The first line is the header; figure out where the
                # "songID" and "hit" columns are
                if (lineNum == 0):
                    for idx, header in enumerate(splitLine):
                        if (header == "songID"):
                            idCol = idx
                        if (header == "hit"):
                            hitCol = idx
                    continue

                # Blank lines carry no song, so skip them instead of
                # crashing on int('')
                if (splitLine == [""]):
                    continue

                songID = int(splitLine[idCol])
                self._addSong(songID, int(splitLine[hitCol]))

                # Everything after the "hit" column is the embedding
                self.songDict[songID]["embedding"] = torch.tensor([float(x) for x in splitLine[hitCol+1:]], dtype=torch.float32, device=self.device)

    # Reads the audio-feature .json and stores each song's features in songDict
    def _readAudioFeatures(self, audioJsonPath):
        with open(audioJsonPath, "r", encoding="utf-8") as jsonFile:
            songData = json.load(jsonFile)["songs"]

        for song in songData:
            songID = int(song["title"][1])
            curHit = int(song["hit"])

            # Songs without any audio features are ignored
            if (song["audio_features"] in [{}, None]):
                continue

            self._addSong(songID, curHit)
            self.songDict[songID]["audio features"] = torch.tensor([song['audio_features'][feature] for feature in self.AUDIO_FEATURES], dtype=torch.float32, device=self.device)

    # Drops songs missing either data source; returns the number of hits left
    def _removeIncompleteSongs(self):
        hitCount = 0
        idsToRemove = []
        for songID, song in self.songDict.items():
            if ((song["embedding"] is None) or (song["audio features"] is None)):
                idsToRemove.append(songID)
            elif (song["hit"] == 1):
                hitCount += 1

        # Deleting is done afterwards, since a dict can't change size while
        # it's being iterated over
        for songID in idsToRemove:
            del self.songDict[songID]
        return hitCount

    # Shuffles the songs and keeps at most hitCount non-hits
    def _buildBalancedList(self, hitCount):
        shuffledSongIDs = list(self.songDict.keys())
        np.random.shuffle(shuffledSongIDs)

        songList = []
        nonHitCount = 0
        for songID in shuffledSongIDs:
            song = self.songDict[songID]
            isNonHit = (song["hit"] == 0)

            # Skip if this is a nonHit and we've already added all of those
            if (isNonHit and nonHitCount == hitCount):
                continue

            song["id"] = songID
            songList.append(song)
            if (isNonHit):
                nonHitCount += 1
        return songList

    # The len method returns the number of songs in the balanced songList
    def __len__(self):
        return len(self.songList)

    # The getitem method will specify how to return a particular index
    def __getitem__(self, idx):
        """Return ``(songID, embedding, audio_features, hit)`` for index idx."""
        if (torch.is_tensor(idx)):
            idx = idx.tolist()
        song = self.songList[idx]
        songID = song["id"]
        emb = song["embedding"]
        audio_features = song["audio features"]
        hit = torch.tensor(song["hit"], dtype=torch.float32, device=self.device)
        return (songID, emb, audio_features, hit)

### Creating Training / Validation Splits

After defining the SongDataset class, we can import the data files (a .json for the acoustic data, and .tsv for the social network embeddings) and create a new SongDataset object from them. 

In [3]:
# Paths to the two data files: the .tsv holding the social network embeddings
# and the .json holding the Spotify audio features.
# If you're running this on your own computer, replace these paths w/ 
# a path to wherever you put the data. 
songTsvPath = "C:\\Data\\College\\CS 682 - Neural Networks\\Project\\Task 3 - Network Development\\Data\\Song Embeddings - 128 dim (half).tsv"
songJsonPath = "C:\\Data\\College\\CS 682 - Neural Networks\\Project\\Task 1 - Data Collection\\Data\\Genius Info + Spotify Features, 1990-2010.json"
# Build the dataset; the constructor reads both files and prints its progress
songs = SongDataset(songTsvPath, songJsonPath)


Reading the network embeddings from the .tsv...
Finished reading in the embeddings! It took 22.341 seconds

Reading in the Spotify data from the .json...
Finished reading in the .json! It took 12.948 seconds

Removing songs without both an embedding and audio features...
Finished removing the songs! It took 0.138 seconds.


In [4]:
# Hold out a fixed fraction of the songs for validation: shuffle every
# dataset index once, then give the first `splitIdx` shuffled indices to the
# validation set and the remainder to the training set.
validationSplit = .2
songAmt = len(songs)
splitIdx = int(np.floor(validationSplit * songAmt))

indices = list(range(songAmt))
np.random.shuffle(indices)
valIndices, trainIndices = indices[:splitIdx], indices[splitIdx:]

In [5]:
# Training data: batches of 64 songs, drawn in random order from trainIndices
sampler_train = SubsetRandomSampler(trainIndices)
loader_train = torch.utils.data.DataLoader(
    songs,
    batch_size=64,
    sampler=sampler_train,
)

# Validation data: batches of 64 songs, drawn in random order from valIndices
sampler_val = SubsetRandomSampler(valIndices)
loader_val = torch.utils.data.DataLoader(
    songs,
    batch_size=64,
    sampler=sampler_val,
)

### Creating checkAccuracy( ) 

The following method, checkAccuracy(), will be used to check the accuracy of a given model. Since we've defined a number of different models throughout the notebook, there are a couple of different ways to handle checking the accuracy. Don't worry about each different model right now - they'll make more sense as you progress through the notebook! 

In [6]:
# This method will check the accuracy of the model using data from the loader
def checkAccuracy(loader, model, modelType):
    """Return the fraction of songs in `loader` that `model` classifies correctly.

    A song counts as correct when the rounded sigmoid of its score equals its
    "hit" label. How the score is produced depends on `modelType`:

    - "emb":    model(emb)
    - "audio":  model(audio_features)
    - "concat": model(cat(emb, audio_features))
    - "split":  model(cat(embModel_cutoff(emb), audioModel_cutoff(audio_features)))
    - "weight": `model` produces per-song weights; its first two outputs
                weight the scores of embModel and audioModel, respectively

    The "split" and "weight" modes use the notebook-level helper networks
    (embModel_cutoff / audioModel_cutoff, and embModel / audioModel).

    Args:
        loader: iterable of (songID, emb, audio_features, hit) batches.
        model: the network being evaluated; it's left in eval mode.
        modelType: one of "emb", "audio", "split", "concat", "weight".

    Returns:
        The accuracy (num_correct / num_samples) as a 0-dim float tensor.

    Raises:
        ValueError: if modelType isn't one of the supported values.
        ZeroDivisionError: if the loader yields no batches.
    """
    if (modelType not in ("emb", "audio", "split", "concat", "weight")):
        raise ValueError("Unknown modelType: %r" % (modelType,))

    # Everything that takes part in the prediction must be in eval mode;
    # otherwise, Dropout stays active and BatchNorm uses batch statistics,
    # which makes the reported accuracy noisy.
    model.eval()
    if (modelType == "split"):
        audioModel_cutoff.eval()
        embModel_cutoff.eval()
    elif (modelType == "weight"):
        embModel.eval()
        audioModel.eval()

    num_correct = 0
    num_samples = 0
    with torch.no_grad():
        for (songID, emb, audio_features, hit) in loader:
            if (modelType == "emb"):
                scores = model(emb)
            elif (modelType == "audio"):
                scores = model(audio_features)
            elif (modelType == "split"):
                audioScores = audioModel_cutoff(audio_features)
                embScores = embModel_cutoff(emb)
                scores = model(torch.cat((embScores, audioScores), dim=1))
            elif (modelType == "concat"):
                scores = model(torch.cat((emb, audio_features), dim=1))
            else:
                # "weight": use the model that was passed in (rather than a
                # global) to produce the weights
                embScores = embModel(emb).squeeze(1)
                audioScores = audioModel(audio_features).squeeze(1)
                weights = model(torch.cat((emb, audio_features), dim=1))
                scores = (embScores * weights[:, 0]) + (audioScores * weights[:, 1])

            binaryScores = torch.round(torch.sigmoid(scores)).reshape(hit.shape)
            num_correct += (hit == binaryScores).sum().float()
            num_samples += len(hit)
    return (num_correct/num_samples)


### Acoustic Data Model
We figured it'd be interesting to train separate models on either set of data before combining them together; that way, we'd be able to see which data source (the social network embeddings, or the high-level acoustic data) contained more useful predictive information. 

The following cells are associated with setting up the model that's only trained on the acoustic data. This is a 4-layer, fully connected network with a couple of Batch-Normalization layers slipped throughout. We've found that it'll generally achieve ~74% accuracy on the validation set. 

In [7]:
# The acoustic model: a fully connected classifier over a song's audio
# features. The input is batch-normalized, and every hidden Linear layer is
# followed by a LeakyReLU and a BatchNorm; the last layer outputs one logit.
audioDimCount = len(songs[0][2])   # number of audio features per song
audioHidden1, audioHidden2, audioHidden3 = 256, 128, 32
audioModel = nn.Sequential(
    nn.BatchNorm1d(audioDimCount),
    # Hidden layer 1
    nn.Linear(audioDimCount, audioHidden1),
    nn.LeakyReLU(),
    nn.BatchNorm1d(audioHidden1),
    # Hidden layer 2
    nn.Linear(audioHidden1, audioHidden2),
    nn.LeakyReLU(),
    nn.BatchNorm1d(audioHidden2),
    # Hidden layer 3
    nn.Linear(audioHidden2, audioHidden3),
    nn.LeakyReLU(),
    nn.BatchNorm1d(audioHidden3),
    # Output layer (a single hit / non-hit logit)
    nn.Linear(audioHidden3, 1),
)

In [8]:
# Train the acoustic model
audioEpochs = 10
audioLR = 0.0001
# Use the notebook-wide `device` rather than a hard-coded "cuda"
audioModel = audioModel.to(device)
audioOptimizer = optim.RMSprop(audioModel.parameters(), lr=audioLR)

# The loss function is the same for every batch, so it's only created once
loss_fn = nn.BCEWithLogitsLoss()

for e in range(audioEpochs):

    # checkAccuracy() leaves the model in eval mode, so switch back to
    # training mode at the start of every epoch
    audioModel.train()

    for (songID, emb, audio_features, hit) in loader_train:

        # Perform a training step for the audio model; the scores are
        # reshaped to match the shape of the labels before computing the loss
        audioScores = audioModel(audio_features).reshape(hit.shape)
        audioLoss = loss_fn(audioScores, hit)
        audioOptimizer.zero_grad()
        audioLoss.backward()
        audioOptimizer.step()

    # Print the accuracy of the audio model
    audioValAcc = checkAccuracy(loader_val, audioModel, "audio")
    audioTrainAcc = checkAccuracy(loader_train, audioModel, "audio")
    print("\nAUDIO MODEL:")
    print("Epoch %d: %.4f val accuracy" % (e, audioValAcc))
    print("Epoch %d: %.4f train accuracy" % (e, audioTrainAcc))


AUDIO MODEL:
Epoch 0: 0.7100 val accuracy
Epoch 0: 0.7215 train accuracy

AUDIO MODEL:
Epoch 1: 0.7105 val accuracy
Epoch 1: 0.7431 train accuracy

AUDIO MODEL:
Epoch 2: 0.7199 val accuracy
Epoch 2: 0.7421 train accuracy

AUDIO MODEL:
Epoch 3: 0.7124 val accuracy
Epoch 3: 0.7513 train accuracy

AUDIO MODEL:
Epoch 4: 0.7194 val accuracy
Epoch 4: 0.7549 train accuracy

AUDIO MODEL:
Epoch 5: 0.7199 val accuracy
Epoch 5: 0.7544 train accuracy

AUDIO MODEL:
Epoch 6: 0.7179 val accuracy
Epoch 6: 0.7601 train accuracy

AUDIO MODEL:
Epoch 7: 0.7194 val accuracy
Epoch 7: 0.7560 train accuracy

AUDIO MODEL:
Epoch 8: 0.7219 val accuracy
Epoch 8: 0.7633 train accuracy

AUDIO MODEL:
Epoch 9: 0.7139 val accuracy
Epoch 9: 0.7566 train accuracy


### Network Embedding Model
Below, you'll find a declaration of the Network Embedding model. It's a 5-layer, fully connected network with some Batch-Normalization layers spread throughout. We've found that it'll generally achieve ~85% accuracy on the validation set! 

In [14]:
# The Network Embedding model: a fully connected classifier over a song's
# network embedding. Every hidden Linear layer is followed by a LeakyReLU and
# a BatchNorm; the last layer outputs one logit.
embDimCount = len(songs[0][1])   # number of embedding dimensions per song
embHidden1, embHidden2, embHidden3, embHidden4 = 1024, 512, 256, 128
embModel = nn.Sequential(
    # Hidden layer 1
    nn.Linear(embDimCount, embHidden1),
    nn.LeakyReLU(),
    nn.BatchNorm1d(embHidden1),
    # Hidden layer 2
    nn.Linear(embHidden1, embHidden2),
    nn.LeakyReLU(),
    nn.BatchNorm1d(embHidden2),
    # Hidden layer 3
    nn.Linear(embHidden2, embHidden3),
    nn.LeakyReLU(),
    nn.BatchNorm1d(embHidden3),
    # Hidden layer 4
    nn.Linear(embHidden3, embHidden4),
    nn.LeakyReLU(),
    nn.BatchNorm1d(embHidden4),
    # Output layer (a single hit / non-hit logit)
    nn.Linear(embHidden4, 1),
)

In [15]:
# Train the Network Embedding model
embEpochs = 100
embLR = 0.000025
# Use the notebook-wide `device` rather than a hard-coded "cuda"
embModel = embModel.to(device)
embOptimizer = optim.Adam(embModel.parameters(), lr=embLR)

# Per-epoch accuracies (one entry is appended after every epoch)
embValAccList = []
embTrainAccList = []

# The loss function is the same for every batch, so it's only created once
loss_fn = nn.BCEWithLogitsLoss()

for e in range(embEpochs):

    # checkAccuracy() leaves the model in eval mode, so switch back to
    # training mode at the start of every epoch
    embModel.train()

    for (songID, emb, audio_features, hit) in loader_train:

        # Perform a training step for the embedding model; the scores are
        # reshaped to match the shape of the labels before computing the loss
        embScores = embModel(emb).reshape(hit.shape)
        embLoss = loss_fn(embScores, hit)
        embOptimizer.zero_grad()
        embLoss.backward()
        embOptimizer.step()

    # Record and print the accuracy of the embedding model
    embValAcc = checkAccuracy(loader_val, embModel, "emb")
    embTrainAcc = checkAccuracy(loader_train, embModel, "emb")
    embValAccList.append(embValAcc)
    embTrainAccList.append(embTrainAcc)
    print("\nEMBEDDING MODEL:")
    print("Epoch %d: %.4f val accuracy" % (e, embValAcc))
    print("Epoch %d: %.4f train accuracy\n" % (e, embTrainAcc))


EMBEDDING MODEL:
Epoch 0: 0.8270 val accuracy
Epoch 0: 0.8352 train accuracy


EMBEDDING MODEL:
Epoch 1: 0.8324 val accuracy
Epoch 1: 0.8557 train accuracy


EMBEDDING MODEL:
Epoch 2: 0.8369 val accuracy
Epoch 2: 0.8693 train accuracy


EMBEDDING MODEL:
Epoch 3: 0.8409 val accuracy
Epoch 3: 0.8845 train accuracy


EMBEDDING MODEL:
Epoch 4: 0.8354 val accuracy
Epoch 4: 0.8896 train accuracy


EMBEDDING MODEL:
Epoch 5: 0.8409 val accuracy
Epoch 5: 0.8968 train accuracy


EMBEDDING MODEL:
Epoch 6: 0.8468 val accuracy
Epoch 6: 0.9021 train accuracy


EMBEDDING MODEL:
Epoch 7: 0.8463 val accuracy
Epoch 7: 0.9042 train accuracy


EMBEDDING MODEL:
Epoch 8: 0.8428 val accuracy
Epoch 8: 0.9129 train accuracy


EMBEDDING MODEL:
Epoch 9: 0.8443 val accuracy
Epoch 9: 0.9216 train accuracy


EMBEDDING MODEL:
Epoch 10: 0.8443 val accuracy
Epoch 10: 0.9273 train accuracy


EMBEDDING MODEL:
Epoch 11: 0.8463 val accuracy
Epoch 11: 0.9305 train accuracy


EMBEDDING MODEL:
Epoch 12: 0.8413 val accuracy


### Model Mixture #1: Bagging

Throughout the rest of this Notebook, we attempted different strategies for combining the acoustic + network data. Our first attempt was a Bagging strategy - we can simply take the average of the scores of the acoustic model and the embedding model, and use this as our new score. (So, if embModel guessed 0.85, and audioModel guessed 0.35, then we'd get an average score of 0.6 - we'd then round this up to 1, indicating a hit!) 

This meta-model seemed to perform about as well as the embedding model; it didn't seem to improve accuracy in any significant way. 

In [None]:
# The bagged "meta-model" needs no training of its own: its score for a song
# is the mean of the sigmoid outputs of the two pre-trained models (embModel
# and audioModel), rounded to 0 or 1.
# bothRightGuesses collects the songID of every validation song that the
# bagged model classifies correctly.
bothRightGuesses = []
embModel.eval()
audioModel.eval()
with torch.no_grad():
    for (songID, emb, audio_features, hit) in loader_val:
        embScores = torch.sigmoid(embModel(emb)).reshape(hit.shape)
        audioScores = torch.sigmoid(audioModel(audio_features)).reshape(hit.shape)

        # Average the two probabilities, then round to get the prediction
        bothScores = (embScores + audioScores) / 2
        binaryScores = torch.round(bothScores)

        # Keep the IDs of the songs whose prediction matches the label
        mask = (binaryScores == hit)
        bothRightGuesses.extend(int(x) for x in songID[mask])

In order to analyse the results of the bagging model, run the cells below; they'll collect the correct guesses of both the embedding and acoustic models, and then compare the sets of correct guesses with the correct guesses of the bagged model. 

In [None]:
# Figure out which validation songs the embedding model classifies correctly;
# their songIDs are collected in embRightGuesses
embRightGuesses = []
embModel.eval()
with torch.no_grad():
    for (songID, emb, audio_features, hit) in loader_val:
        embProbs = torch.sigmoid(embModel(emb))
        binaryScores = torch.round(embProbs).reshape(hit.shape)
        mask = (binaryScores == hit)
        embRightGuesses.extend(int(x) for x in songID[mask])

# Do the same for the audio model; its correct songIDs are collected in
# audioRightGuesses
audioRightGuesses = []
audioModel.eval()
with torch.no_grad():
    for (songID, emb, audio_features, hit) in loader_val:
        audioProbs = torch.sigmoid(audioModel(audio_features))
        binaryScores = torch.round(audioProbs).reshape(hit.shape)
        mask = (binaryScores == hit)
        audioRightGuesses.extend(int(x) for x in songID[mask])

In [None]:
# De-duplicate the correctly-guessed songIDs of each model, and report every
# model's accuracy from its set (so all three numbers are computed the same way)
embRightGuessesSet = set(embRightGuesses)
audioRightGuessesSet = set(audioRightGuesses)
bothRightGuessesSet = set(bothRightGuesses)
print("There were %d songs in the validation set..." % len(valIndices))
print("The embedding model got %d songs correct (%.2f%% accuracy)" % (len(embRightGuessesSet), 100 * (len(embRightGuessesSet)/len(valIndices))))
print("The audio model got %d songs correct (%.2f%% accuracy)" % (len(audioRightGuessesSet), 100 * (len(audioRightGuessesSet)/len(valIndices))))
print("The bagged model got %d songs correct (%.2f%% accuracy)" % (len(bothRightGuessesSet), 100 * (len(bothRightGuessesSet)/len(valIndices))))


### Model Mixture #2: "Split" Network

For our next model mixture, we tried to create a "split" network. Essentially, we train two separate feature-extractor models (embModel_cutoff and audioModel_cutoff), which are modeled on the regular embModel and audioModel but are cut off at the last hidden layer, so each outputs a feature vector instead of a single score. We then concatenate the outputs of these two models and use the result as the input to a third model: splitModel. Finally, we backpropagate the loss through all three networks. 

<img src="https://i.imgur.com/ERZ0H3M.png" width=600 height=650/>

This mixture achieves around 85% accuracy on the validation set, occasionally reaching 86%.

In [None]:
# Dropout probability shared by all three networks of the "split" model
dropoutStrength = 0.7

# Embedding branch: maps a song's network embedding to an embHidden4-dim
# feature vector (there's no final 1-unit layer), followed by dropout
embDimCount = len(songs[0][1])
embHidden1, embHidden2, embHidden3, embHidden4 = 1024, 512, 256, 128
embModel_cutoff = nn.Sequential(
    nn.Linear(embDimCount, embHidden1),
    nn.LeakyReLU(),
    nn.BatchNorm1d(embHidden1),
    nn.Linear(embHidden1, embHidden2),
    nn.LeakyReLU(),
    nn.Linear(embHidden2, embHidden3),
    nn.LeakyReLU(),
    nn.BatchNorm1d(embHidden3),
    nn.Linear(embHidden3, embHidden4),
    nn.Dropout(dropoutStrength),
)


# Audio branch: maps a song's audio features to an audioHidden3-dim feature
# vector, followed by dropout
audioDimCount = len(songs[0][2])
audioHidden1, audioHidden2, audioHidden3 = 256, 128, 64
audioModel_cutoff = nn.Sequential(
    nn.BatchNorm1d(audioDimCount),
    nn.Linear(audioDimCount, audioHidden1),
    nn.LeakyReLU(),
    nn.Linear(audioHidden1, audioHidden2),
    nn.LeakyReLU(),
    nn.BatchNorm1d(audioHidden2),
    nn.Linear(audioHidden2, audioHidden3),
    nn.Dropout(dropoutStrength),
)


# Head: takes the concatenated outputs of the two branches and produces a
# single hit / non-hit logit
splitInput = embHidden4 + audioHidden3
splitHidden1, splitHidden2, splitHidden3, splitHidden4 = 1024, 512, 256, 128
splitModel = nn.Sequential(
    nn.Linear(splitInput, splitHidden1),
    nn.LeakyReLU(),
    nn.Dropout(dropoutStrength),
    nn.Linear(splitHidden1, splitHidden2),
    nn.LeakyReLU(),
    nn.Linear(splitHidden2, splitHidden3),
    nn.LeakyReLU(),
    nn.Dropout(dropoutStrength),
    nn.Linear(splitHidden3, splitHidden4),
    nn.LeakyReLU(),
    nn.Linear(splitHidden4, 1),
)

In [None]:
# Train the "split" network: the two cutoff branches and the splitModel head
# are trained jointly on a single loss.
# Use the notebook-wide `device` rather than a hard-coded "cuda"
splitModel = splitModel.to(device)
audioModel_cutoff = audioModel_cutoff.to(device)
embModel_cutoff = embModel_cutoff.to(device)

splitEpochs = 20
splitLR = 0.000005
audioLR = 0.00007
embLR = 0.00007
# Each network gets its own optimizer, built over its OWN parameters and
# using its own learning rate
splitOptimizer = optim.Adam(splitModel.parameters(), lr=splitLR)
embCutoffOptimizer = optim.Adam(embModel_cutoff.parameters(), lr=embLR)
audioCutoffOptimizer = optim.Adam(audioModel_cutoff.parameters(), lr=audioLR)

# The loss function is the same for every batch, so it's only created once
loss_fn = nn.BCEWithLogitsLoss()

for e in range(splitEpochs):

    # checkAccuracy() leaves splitModel in eval mode, so switch all three
    # networks (back) to training mode at the start of every epoch
    embModel_cutoff.train()
    audioModel_cutoff.train()
    splitModel.train()

    for (songID, emb, audio_features, hit) in loader_train:

        # Run both branches, and concatenate their outputs to form the
        # input of the split model
        embScores = embModel_cutoff(emb)
        audioScores = audioModel_cutoff(audio_features)
        inputScores = torch.cat((embScores, audioScores), dim=1)

        # Compute the loss of the split model's prediction
        splitScores = splitModel(inputScores).reshape(hit.shape)
        splitLoss = loss_fn(splitScores, hit)

        # Backpropagate the loss through all three networks, and update them
        splitOptimizer.zero_grad()
        audioCutoffOptimizer.zero_grad()
        embCutoffOptimizer.zero_grad()
        splitLoss.backward()
        splitOptimizer.step()
        audioCutoffOptimizer.step()
        embCutoffOptimizer.step()

    # Print the accuracy of the split model
    splitValAcc = checkAccuracy(loader_val, splitModel, "split")
    splitTrainAcc = checkAccuracy(loader_train, splitModel, "split")
    print("\nSPLIT MODEL:")
    print("Epoch %d: %.4f val accuracy" % (e, splitValAcc))
    print("Epoch %d: %.4f train accuracy\n" % (e, splitTrainAcc))
    

### Model Mixture #3: Data Concatenation

Next, we tried to train a model on a mixture of the data by concatenating the acoustic and embedding data together, and then training the model on that. Typically, this model underperformed compared to the previous one; it achieved around 83-84% accuracy on the validation set.

<img src="https://i.imgur.com/ZQznqxT.png"  width=800 height=500/>

In [None]:
# The concatenation model: a fully connected classifier whose input is a
# song's embedding and audio features joined into a single vector. Only the
# input is batch-normalized; the last layer outputs one logit.
concatInput = audioDimCount + embDimCount
concatHidden1, concatHidden2, concatHidden3, concatHidden4 = 1024, 512, 256, 128
concatModel = nn.Sequential(
    nn.BatchNorm1d(concatInput),
    # Hidden layer 1
    nn.Linear(concatInput, concatHidden1),
    nn.LeakyReLU(),
    # Hidden layer 2
    nn.Linear(concatHidden1, concatHidden2),
    nn.LeakyReLU(),
    # Hidden layer 3
    nn.Linear(concatHidden2, concatHidden3),
    nn.LeakyReLU(),
    # Hidden layer 4
    nn.Linear(concatHidden3, concatHidden4),
    nn.LeakyReLU(),
    # Output layer (a single hit / non-hit logit)
    nn.Linear(concatHidden4, 1),
)

In [None]:
# Train the concatenation model
concatEpochs = 10
concatLR =0.0001
# Use the notebook-wide `device` rather than a hard-coded "cuda"
concatModel = concatModel.to(device)
concatOptimizer = optim.Adam(concatModel.parameters(), lr=concatLR)

# The loss function is the same for every batch, so it's only created once
loss_fn = nn.BCEWithLogitsLoss()

for e in range(concatEpochs):

    # checkAccuracy() leaves the model in eval mode, so switch back to
    # training mode at the start of every epoch
    concatModel.train()

    for (songID, emb, audio_features, hit) in loader_train:

        # Perform a training step for the concat model; its input is each
        # song's embedding followed by its audio features
        inputData = torch.cat((emb, audio_features), dim=1)
        concatScores = concatModel(inputData).reshape(hit.shape)
        concatLoss = loss_fn(concatScores, hit)
        concatOptimizer.zero_grad()
        concatLoss.backward()
        concatOptimizer.step()

    # Print the accuracy of the concat model
    concatValAcc = checkAccuracy(loader_val, concatModel, "concat")
    concatTrainAcc = checkAccuracy(loader_train, concatModel, "concat")
    print("\nCONCAT MODEL:")
    print("Epoch %d: %.4f val accuracy" % (e, concatValAcc))
    print("Epoch %d: %.4f train accuracy\n" % (e, concatTrainAcc))

### Model Mixture #4: "WeightNet"
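For our last mixture, we train a small network (weightModel) that looks at a song's concatenated embedding + acoustic data and outputs a set of weights. The first two outputs are used to weight the scores of the already-trained embModel and audioModel, respectively, and the weighted sum of those two scores is used as the final prediction. Only weightModel's parameters are updated during training. 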

In [None]:
# "WeightNet": a small network that looks at a song's concatenated
# embedding + audio features, and outputs the weights used to mix the scores
# of the (already trained) embModel and audioModel.
weightInput   = audioDimCount + embDimCount
weightHidden1 = 64
weightHidden2 = 32


# NOTE(review): the network outputs weightHidden2 values per song, but only
# the first two (weights[:, 0] and weights[:, 1]) are used below - confirm
# that this is intended.
weightModel = nn.Sequential(nn.BatchNorm1d(weightInput),
                            nn.Linear(weightInput, weightHidden1),
                            nn.ReLU(),
                            nn.BatchNorm1d(weightHidden1),
                            nn.Linear(weightHidden1, weightHidden2),
                            nn.ReLU(),                      
                            nn.BatchNorm1d(weightHidden2))

# Use the notebook-wide `device` rather than a hard-coded "cuda"
weightModel = weightModel.to(device)
audioModel = audioModel.to(device)
embModel = embModel.to(device)

weightEpochs = 40
weightLR = 0.001
# Only weightModel's parameters are optimized
weightOptimizer = optim.Adam(weightModel.parameters(), lr=weightLR)

# The loss function is the same for every batch, so it's only created once
loss_fn = nn.BCEWithLogitsLoss()

# embModel and audioModel are only used to produce scores here (they aren't
# being trained), so keep them in eval mode for the whole run
embModel.eval()
audioModel.eval()

for e in range(weightEpochs):

    # checkAccuracy() leaves weightModel in eval mode, so switch back to
    # training mode at the start of every epoch
    weightModel.train()

    for (songID, emb, audio_features, hit) in loader_train:

        # The two base models aren't updated, so there's no need to track
        # gradients through them
        with torch.no_grad():
            embScores = embModel(emb).squeeze(1)
            audioScores = audioModel(audio_features).squeeze(1)

        # Mix the two scores using the weights predicted for each song
        weights = weightModel(torch.cat((emb, audio_features), dim=1))
        weightedScores = (embScores * weights[:, 0]) + (audioScores * weights[:, 1])

        weightLoss = loss_fn(weightedScores, hit)
        weightOptimizer.zero_grad()
        weightLoss.backward()
        weightOptimizer.step()

    # Print the accuracy of the weight model
    weightValAcc = checkAccuracy(loader_val, weightModel, "weight")
    weightTrainAcc = checkAccuracy(loader_train, weightModel, "weight")
    print("\nWEIGHT MODEL:")
    print("Epoch %d: %.4f val accuracy" % (e, weightValAcc))
    print("Epoch %d: %.4f train accuracy\n" % (e, weightTrainAcc))