This neural network was written by Trevor Hubbard and Phil Sifferlin. It performs Hit Song Prediction using vector embeddings of a music-industry co-collaboration network

In [1]:
# Some import statements
import torch, json, random, time
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data.sampler import SubsetRandomSampler
from sklearn.preprocessing import StandardScaler

# Setting up Pytorch's use of CUDA 
device = torch.device("cuda")

**CLASSES**

*The next couple cells define the classes we'll use throughout this notebook.*

In [2]:
# This class will be used to help in the song data! 
class SongDataset(torch.utils.data.Dataset):

    # The init method defines how we'll input the data; 
    def __init__(self, embeddingTsvPath, audioJsonPath):

        # This dictionary will hold all of the data
        self.songDict = {}

        curTime = time.time()
        print("\nReading the network embeddings from the .tsv...")

        # Open the tsv and process the data in it
        with open(embeddingTsvPath, "r", encoding="utf-8") as tsvFile:

            # Iterate through each line of the .tsv and store the info
            hitCol = 0
            idCol = 0
            for lineNum, line in enumerate(tsvFile):

                line = line.strip()
                splitLine = line.split("\t")

                # If we're on the first line, figure out where the "hit" column is
                if (lineNum == 0): 
                    for idx, header in enumerate(splitLine):
                        if (header == "songID"):
                            idCol = idx
                        if (header == "hit"):
                            hitCol = idx
                    continue

                # Add the song to the songDict
                songID = int(splitLine[idCol])
                curHit = int(splitLine[hitCol])
                if (not songID in self.songDict):
                    self.songDict[songID] = {"hit": curHit, "embedding": None, "audio features": None}

                # Update the song's embedding 
                self.songDict[songID]["embedding"] = torch.tensor([float(x) for x in splitLine[hitCol+1:]], dtype=torch.float32, device="cuda")

        print("Finished reading in the embeddings! It took %.3f seconds" % (time.time()-curTime))
        curTime = time.time()
        print("\nReading in the Spotify data from the .json...")

        # Open the audio features JSON and process the data in it
        with open(audioJsonPath, "r", encoding="utf-8") as jsonFile:
            songData = json.load(jsonFile)["songs"]
            features = ['duration_ms', 'key', 'mode', 'time_signature', 'acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'valence', 'tempo']
            for song in songData:
                songID = int(song["title"][1])
                curHit = int(song["hit"])
                if (song["audio_features"] not in [{}, None]):
                    if (songID not in self.songDict):
                        self.songDict[songID] = {"hit": curHit, "embedding": None, "audio features": None}
                    self.songDict[songID]["audio features"] = torch.tensor([song['audio_features'][feature] for feature in features], dtype=torch.float32, device="cuda")

        print("Finished reading in the .json! It took %.3f seconds" % (time.time()-curTime))

        # Remove any songs that don't have both an embedding and audio features
        curTime = time.time()
        print("\nRemoving songs without both an embedding and audio features...")
        hitCount = 0
        idsToRemove = []
        for songNum, songID in enumerate(self.songDict.keys()):
            song = self.songDict[songID]
            if ((song["embedding"] is None) or (song["audio features"] is None)):
                idsToRemove.append(songID)
                continue
            else:
                if (song["hit"] == 1): hitCount += 1
        for songID in idsToRemove:
            del self.songDict[songID]
        print("Finished removing the songs! It took %.3f seconds." % (time.time()-curTime))

        # Creating the songList (a list version of the songDict)
        shuffledSongDict = list(self.songDict.keys())
        np.random.shuffle(shuffledSongDict)
        self.songList = []
        nonHitCount = 0
        for songID in shuffledSongDict:
            song = self.songDict[songID]

            # Skip if this is a nonHit and we've already added all of those
            if (nonHitCount == hitCount and song["hit"] == 0):
                continue

            self.songList.append(song)
            self.songList[-1]["id"] = songID

            if (song["hit"] == 0):
                nonHitCount += 1

    # The len method returns the length of x_data
    def __len__(self):
        return len(self.songList)

    # The getitem method will specify how to return a particular index
    def __getitem__(self, idx):
        if (torch.is_tensor(idx)):
            idx = idx.tolist()
        song = self.songList[idx]
        songID = song["id"]
        emb = song["embedding"]
        audio_features = song["audio features"]
        hit = torch.tensor(song["hit"], dtype=torch.float32, device="cuda")
        return (songID, emb, audio_features, hit)

In [3]:
# This class is the Two Layer Net that we use for the audio features model**CLASSES**
class TwoLayerFC(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        nn.init.kaiming_normal_(self.fc1.weight)
        self.fc2 = nn.Linear(hidden_size, num_classes)
        nn.init.kaiming_normal_(self.fc2.weight)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        scores = self.fc2(x)
        return scores

**METHODS**

*The next couple cells define the methods we'll use throughout this notebook.*

Creating the Dataset from the song embedding .tsv

In [5]:
songTsvPath = "C:\\Data\\College\\CS 682 - Neural Networks\\Project\\Task 3 - Network Development\\Data\\Song Embeddings - 128 dim.tsv"
songJsonPath = "C:\\Data\\College\\CS 682 - Neural Networks\\Project\\Task 1 - Data Collection\\Data\\Genius Info + Spotify Features, 1990-2010.json"
songs = SongDataset(songTsvPath, songJsonPath)


Reading the network embeddings from the .tsv...
Finished reading in the embeddings! It took 20.689 seconds

Reading in the Spotify data from the .json...
Finished reading in the .json! It took 12.087 seconds

Removing songs without both an embedding and audio features...
Finished removing the songs! It took 0.129 seconds.


Iterating through each of the songs to check if they're constructed correctly

In [6]:
_, emb, audio_features, _ = songs[0]
audioDimCount = len(audio_features)
embDimCount = len(emb)
print("The length of the songs dataset is %d" % len(songs))

The length of the songs dataset is 10086


Creating the training / validation split 

In [7]:
validationSplit = .2
songAmt = len(songs)
splitIdx = int(np.floor(songAmt * validationSplit))
indices = list(range(songAmt))
np.random.shuffle(indices)
trainIndices = indices[splitIdx:]
valIndices = indices[:splitIdx]

Creating the Samplers and DataLoaders for the train & validation data

In [8]:
sampler_train = SubsetRandomSampler(trainIndices)
sampler_val = SubsetRandomSampler(valIndices)
loader_train = torch.utils.data.DataLoader(songs, batch_size=64, sampler=sampler_train)
loader_val = torch.utils.data.DataLoader(songs, batch_size=64, sampler=sampler_val)

Define the embedding model using nn.Sequential

In [9]:
embHidden1 = 1024
embHidden2 = 512
embHidden3 = 256
embHidden4 = 32
embModel = nn.Sequential(nn.Linear(embDimCount, embHidden1),
                      nn.ReLU(),
                      nn.BatchNorm1d(embHidden1),
                      nn.Linear(embHidden1, embHidden2),
                      nn.ReLU(),
                      nn.BatchNorm1d(embHidden2),
                      nn.Linear(embHidden2, embHidden3),
                      nn.ReLU(),
                      nn.BatchNorm1d(embHidden3),
                      nn.Linear(embHidden3, embHidden4),
                      nn.ReLU(),
                      nn.BatchNorm1d(embHidden4),
                      nn.Linear(embHidden4, 1))

Define the audio model 

In [10]:
# Define the audio model using nn.Sequential
audioHidden1 = 32
audioHidden2 = 16
audioHidden3 = 8
audioModel = nn.Sequential(nn.BatchNorm1d(audioDimCount),
                           nn.Linear(audioDimCount, audioHidden1),
                           nn.ReLU(),
                           nn.BatchNorm1d(audioHidden1),
                           nn.Linear(audioHidden1, audioHidden2),
                           nn.ReLU(),
                           nn.BatchNorm1d(audioHidden2),
                           nn.Linear(audioHidden2, audioHidden3),
                           nn.ReLU(),
                           nn.BatchNorm1d(audioHidden3),
                           nn.Linear(audioHidden3, 1))

# audioModel = TwoLayerFC(audioDimCount, audioHidden, 1)

In [11]:
# Checking the makeup of the training and validation data
hitCount = 0
for (songID, emb, audio_features, hit) in loader_train:
    hitCount += len([x for x in list(hit) if x==1])
print("There are %d hits in the training data. (out of %d songs)" % (hitCount, len(loader_train)*64))

hitCount = 0
for (songID, emb, audio_features, hit) in loader_val:
    hitCount += len([x for x in list(hit) if x==1])
print("There are %d hits in the training data. (out of %d songs)" % (hitCount, len(loader_val)*64))

There are 3998 hits in the training data. (out of 8128 songs)
There are 1045 hits in the training data. (out of 2048 songs)


Train the embedding model

In [None]:
embEpochs = 1000
embLR = 0.00001
embModel = embModel.to("cuda")
embOptimizer = optim.Adam(embModel.parameters(), lr=embLR)

for e in range(embEpochs):
    for idx, (songID, emb, audio_features, hit) in enumerate(loader_train):

        # Indicate that we're in training mode
        embModel.train()
        audioModel.train()

        # Declaring a loss function
        loss_fn = nn.BCEWithLogitsLoss()

        # Perform a training step for the embedding model 
        embScores = embModel(emb)
        embScores = embScores.reshape(hit.shape)
        embLoss = loss_fn(embScores, hit)
        embOptimizer.zero_grad()
        embLoss.backward()
        embOptimizer.step()

    # Print the accuracy of the embedding model
    embValAcc = checkAccuracy(loader_val, embModel, "emb")
    embTrainAcc = checkAccuracy(loader_train, embModel, "emb")
    print("\nEMBEDDING MODEL:")
    print("Epoch %d: %.4f val accuracy" % (e, embValAcc))
    print("Epoch %d: %.4f train accuracy\n" % (e, embTrainAcc))

Train the audio model

In [None]:
audioEpochs = 1000
audioLR = 0.0005
audioModel = audioModel.to("cuda")
audioOptimizer = optim.RMSprop(audioModel.parameters(), lr=audioLR)

for e in range(audioEpochs):
    for idx, (songID, emb, audio_features, hit) in enumerate(loader_train):

        # Indicate that we're in training mode
        audioModel.train()

        # Declaring a loss function
        loss_fn = nn.BCEWithLogitsLoss()

        # Perform a training step for the audio model
        audioScores = audioModel(audio_features)
        audioScores = audioScores.reshape(hit.shape)
        audioLoss = loss_fn(audioScores, hit)
        audioOptimizer.zero_grad()
        audioLoss.backward()
        audioOptimizer.step()

    # Print the accuracy of the audio model
    audioValAcc = checkAccuracy(loader_val, audioModel, "audio")
    audioTrainAcc = checkAccuracy(loader_train, audioModel, "audio")
    print("\nAUDIO MODEL:")
    print("Epoch %d: %.4f val accuracy" % (e, audioValAcc))
    print("Epoch %d: %.4f train accuracy" % (e, audioTrainAcc))

Create a new, "split" network

In [148]:
embHidden1 = 2048
embHidden2 = 1024
embHidden3 = 512
embHidden4 = 256
embHidden5 = 128
embModel_cutoff = nn.Sequential(nn.Linear(embDimCount, embHidden1),
                      nn.ReLU(),
                      # nn.BatchNorm1d(embHidden1),
                      nn.Linear(embHidden1, embHidden2),
                      nn.ReLU(),
                      nn.BatchNorm1d(embHidden2),     
                      nn.Linear(embHidden2, embHidden3),
                      nn.ReLU(),
                      # nn.BatchNorm1d(embHidden3),
                      nn.Linear(embHidden3, embHidden4),
                      nn.ReLU(),
                      # nn.BatchNorm1d(embHidden4),
                      nn.Linear(embHidden4, embHidden5))


audioHidden1 = 1024
audioHidden2 = 512
audioHidden3 = 256
audioHidden4 = 128
audioModel_cutoff = nn.Sequential(nn.BatchNorm1d(audioDimCount),
                           nn.Linear(audioDimCount, audioHidden1),
                           nn.ReLU(),
                           # nn.BatchNorm1d(audioHidden1),
                           nn.Linear(audioHidden1, audioHidden2),
                           nn.ReLU(),
                           # nn.BatchNorm1d(audioHidden2),
                           nn.Linear(audioHidden2, audioHidden3),
                           nn.ReLU(),
                           # nn.BatchNorm1d(audioHidden3),
                           nn.Linear(audioHidden3, audioHidden4))


splitInput = embHidden5+audioHidden4
splitHidden1 = 1024
splitHidden2 = 512
splitHidden3 = 256
splitHidden4 = 128
splitHidden5 = 64
splitModel = nn.Sequential(nn.BatchNorm1d(splitInput),
                           nn.Linear(splitInput, splitHidden1),
                           nn.ReLU(),
                           nn.BatchNorm1d(splitHidden1),
                           nn.Dropout(0.1),
                           nn.Linear(splitHidden1, splitHidden2),
                           nn.ReLU(),
                           nn.BatchNorm1d(splitHidden2),
                           nn.Linear(splitHidden2, splitHidden3),
                           nn.ReLU(),
                           nn.BatchNorm1d(splitHidden3),
                           nn.Linear(splitHidden3, splitHidden4),
                           nn.ReLU(),
                           nn.BatchNorm1d(splitHidden4),
                           nn.Linear(splitHidden4, 1))

In [149]:
splitModel = splitModel.to("cuda")
audioModel_cutoff = audioModel_cutoff.to("cuda")
embModel_cutoff = embModel_cutoff.to("cuda")

In [150]:
# This method will check the accuracy of the model using data from the loader
def checkAccuracy(loader, model, modelType):

    num_correct = 0
    num_samples = 0
    model.eval()
    with torch.no_grad():
        for (songID, emb, audio_features, hit) in loader:
            binaryScores = []
            if (modelType == "emb"):
                binaryScores = torch.round(torch.sigmoid(model(emb))).reshape(hit.shape)
            elif (modelType == "audio"):
                binaryScores = torch.round(torch.sigmoid(model(audio_features))).reshape(hit.shape)
            elif (modelType == "split"):
                audioScores = audioModel_cutoff(audio_features)
                embScores = embModel_cutoff(emb)
                inputFeatures = torch.cat((embScores, audioScores), dim=1)
                binaryScores = torch.round(torch.sigmoid(model(inputFeatures))).reshape(hit.shape)
            num_correct += (hit == binaryScores).sum().float()
            num_samples += len(hit)
        return (num_correct/num_samples)


In [153]:
splitEpochs = 1000
splitLR = 0.00001
audioLR = 0.00001
embLR = 0.00001
splitOptimizer = optim.Adam(splitModel.parameters(), lr=splitLR)
embCutoffOptimizer = optim.Adam(audioModel_cutoff.parameters(), lr=embLR)
audioCutoffOptimizer = optim.Adam(embModel_cutoff.parameters(), lr=audioLR)

for e in range(splitEpochs):
    for idx, (songID, emb, audio_features, hit) in enumerate(loader_train):

        # Indicate that we're in training mode
        embModel_cutoff.train()
        audioModel_cutoff.train()
        splitModel.train()

        # Declaring a loss function
        loss_fn = nn.BCEWithLogitsLoss()

        # Perform a training step for the embedding model 
        embScores = embModel_cutoff(emb)
        
        # Perform a training step for the audio model
        audioScores = audioModel_cutoff(audio_features)
        
        # Now, use these scores to perform a training step for the split model 
        inputScores = torch.cat((embScores, audioScores), dim=1)

        splitScores = splitModel(inputScores)
        splitScores = splitScores.reshape(hit.shape)
        splitLoss = loss_fn(splitScores, hit)
        splitOptimizer.zero_grad()
        splitLoss.backward()
        splitOptimizer.step()
        audioCutoffOptimizer.step()
        embCutoffOptimizer.step()
        

    # Print the accuracy of the embedding model
    splitValAcc = checkAccuracy(loader_val, splitModel, "split")
    splitTrainAcc = checkAccuracy(loader_train, splitModel, "split")
    print("\nSPLIT MODEL:")
    print("Epoch %d: %.4f val accuracy" % (e, splitValAcc))
    print("Epoch %d: %.4f train accuracy\n" % (e, splitTrainAcc))
    


SPLIT MODEL:
Epoch 0: 0.8483 val accuracy
Epoch 0: 0.9372 train accuracy


SPLIT MODEL:
Epoch 1: 0.8185 val accuracy
Epoch 1: 0.9199 train accuracy


SPLIT MODEL:
Epoch 2: 0.8349 val accuracy
Epoch 2: 0.9289 train accuracy


SPLIT MODEL:
Epoch 3: 0.8413 val accuracy
Epoch 3: 0.9331 train accuracy


SPLIT MODEL:
Epoch 4: 0.8433 val accuracy
Epoch 4: 0.9395 train accuracy


SPLIT MODEL:
Epoch 5: 0.8339 val accuracy
Epoch 5: 0.9409 train accuracy


SPLIT MODEL:
Epoch 6: 0.8433 val accuracy
Epoch 6: 0.9482 train accuracy


SPLIT MODEL:
Epoch 7: 0.8354 val accuracy
Epoch 7: 0.9554 train accuracy


SPLIT MODEL:
Epoch 8: 0.8409 val accuracy
Epoch 8: 0.9586 train accuracy


SPLIT MODEL:
Epoch 9: 0.8409 val accuracy
Epoch 9: 0.9533 train accuracy


SPLIT MODEL:
Epoch 10: 0.8369 val accuracy
Epoch 10: 0.9559 train accuracy


SPLIT MODEL:
Epoch 11: 0.8354 val accuracy
Epoch 11: 0.9606 train accuracy


SPLIT MODEL:
Epoch 12: 0.8354 val accuracy
Epoch 12: 0.9581 train accuracy


SPLIT MODEL:
Epoch

KeyboardInterrupt: 