# Using my model to analyse the Beatles discography

In this notebook I decided to mess around with my Beatles discography to see what I can uncover from this model. 

I sliced each audio file into spectrograms of 128x1024 pixels each, with an average of 23.78s of music per slice. I ran it through my pretrained convnet with the finetuned ResNet50 architecture, and found the mean for the classifiers for each softmax classifier to get the classification for each track.

In [268]:
from PIL import Image
import librosa as lib
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler
from keras.preprocessing.image import ImageDataGenerator
from keras.models import load_model
import pandas as pd
import matplotlib.pyplot as plt
import pickle
from sklearn.manifold import TSNE
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
%matplotlib notebook
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
import pprint

In [152]:
plotly.__version__

'2.2.1'

In [6]:
def cut_mel(mel):
    '''Cuts the mel spectrogram up into 128*1024 size chunks to pass into the convolutional model'''
    dict_of_mels = dict()
    cut = int(mel.shape[1]/1024)
    for i in range(1, cut+1):
        dict_of_mels[i] = mel[:,(i-1)*1024:i*1024]
    
    return dict_of_mels


For this cell, the files have to be in a directory "Beatles_Discography" with internal subdirectories being the various albums of the discography. I looped through every track in the discography, converted them into their respective melspectrograms, and saved it in the directory "Beatlesimages" for the convnet's predict generator to call from.

In [9]:
scaler = MinMaxScaler()
directory = 'Beatles_Discography'

for album in os.listdir(directory):
    if not album.startswith('.'):
        for file in os.listdir(directory + '/' + album):
            if not file.startswith('.'):
                f, sr = lib.core.load(directory + '/' + album + '/' + file)    #loads file
                mel = lib.feature.melspectrogram(f, n_mels = 128)    # creates mel spectrogram
                mel = lib.power_to_db(mel, ref=np.max)    # power scaled to db
                melscaled = scaler.fit_transform(mel)     # fit transform between 0 and 1
                x = np.array(melscaled)
                dict_of_mels = cut_mel(x)

                albumrenamed = album.replace(" ", "_")
                filerenamed = file[3:-4].replace(" ","_")

                newdirectory = 'Beatlesimages' + '/' + albumrenamed + '/' + filerenamed
                if not os.path.exists(newdirectory):
                    os.makedirs(newdirectory)   # makes directory path

                for index,melvalues in dict_of_mels.items():
                    melstacked = np.stack((melvalues,)*3, axis=2)    # making it 3 channel
                    rescaled = (255.0 /melstacked.max() * (melstacked - melstacked.min())).astype(np.uint8)
                    im = Image.fromarray(rescaled)    # forming an image
                    im.save('Beatlesimages/{}/{}/{}.jpeg'.format(albumrenamed,filerenamed, str(index)))


Now I'm going to call the flow generator object to feed into the model.

In [63]:
test_datagen = ImageDataGenerator(rescale=1./255)
test_generator = test_datagen.flow_from_directory('Beatlesimages/',
                                                target_size = (200,1024), batch_size = 28,
                                                class_mode = None, shuffle=False)
for data_batch in test_generator:
    print('data batch shape:', data_batch.shape)
    break

Found 1372 images belonging to 15 classes.
data batch shape: (28, 200, 1024, 3)


In [57]:
print(len(test_generator.filenames))

1372


Time to load the convolutional model.

In [61]:
convmodel = load_model('resnet50model.h5')
convmodel.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
resnet50 (Model)             (None, 1, 4, 2048)        23587712  
_________________________________________________________________
dropout_1 (Dropout)          (None, 1, 4, 2048)        0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 8192)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1024)              8389632   
_________________________________________________________________
batch_normalization_1 (Batch (None, 1024)              4096      
_________________________________________________________________
activation_50 (Activation)   (None, 1024)              0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 1024)              0         
__________

In [64]:
# step size of 49 to fit 1372 images
predictions = convmodel.model.predict_generator(test_generator,steps=49, max_queue_size=1, workers=1, use_multiprocessing=False)

In [204]:
"/".join(test_generator.filenames[0].split('/')[:-1])

"A_Hard_Day_s_Night/A_Hard_Day's_Night"

In [205]:
df = pd.DataFrame(predictions)
df['filename'] = test_generator.filenames
df['filename'] = df.filename.map(lambda x: "/".join(x.split('/')[:-1]))
df['album_name'] = df['filename'].map(lambda x: x.split('/')[0])
df['song_name'] = df['filename'].map(lambda x: x.split('/')[1])
print(df.shape)

(1372, 11)


In [206]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,filename,album_name,song_name
0,0.003837,0.049923,0.013941,0.015356,0.001712192,0.137662,0.052743,0.724825,A_Hard_Day_s_Night/A_Hard_Day's_Night,A_Hard_Day_s_Night,A_Hard_Day's_Night
1,0.000349,0.003498,0.013912,0.007742,4.618992e-05,0.428965,0.02455,0.520938,A_Hard_Day_s_Night/A_Hard_Day's_Night,A_Hard_Day_s_Night,A_Hard_Day's_Night
2,8.9e-05,0.003549,0.001159,0.869733,5.425999e-07,0.093083,0.015772,0.016614,A_Hard_Day_s_Night/A_Hard_Day's_Night,A_Hard_Day_s_Night,A_Hard_Day's_Night
3,0.000186,0.00244,0.003561,0.018293,4.140206e-05,0.885389,0.060334,0.029757,A_Hard_Day_s_Night/A_Hard_Day's_Night,A_Hard_Day_s_Night,A_Hard_Day's_Night
4,0.089528,0.045254,0.13229,0.144059,0.01725151,0.22724,0.146206,0.198172,A_Hard_Day_s_Night/A_Hard_Day's_Night,A_Hard_Day_s_Night,A_Hard_Day's_Night


In [265]:
with open('beatles.pickle', 'wb') as handle:
    pickle.dump(df, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [242]:
recdf = df.groupby('filename').mean()
print(recdf.shape)
recdf.head()

(212, 8)


Unnamed: 0_level_0,0,1,2,3,4,5,6,7
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A_Hard_Day_s_Night/A_Hard_Day's_Night,0.015675,0.038802,0.029701,0.176021,0.00318,0.344165,0.070785,0.32167
A_Hard_Day_s_Night/And_I_Love_Her,0.001432,0.023466,0.031076,0.001793,0.000527,0.501843,0.062176,0.377686
A_Hard_Day_s_Night/Any_Time_At_All,0.00186,0.096625,0.207092,0.019844,0.002416,0.499649,0.140668,0.031846
A_Hard_Day_s_Night/Can't_Buy_Me_Love,0.000625,0.037014,0.045759,0.00013,0.000252,0.683018,0.173206,0.059996
A_Hard_Day_s_Night/I'll_Be_Back,0.000447,0.03128,0.175842,0.025778,0.001522,0.497182,0.175836,0.092115


Very nice. Now I have the dataframe that tells me my score for every beatles song. Let me make a copy of that to find the closest songs to each other

In [240]:
recdfeuclid = recdf.copy()
recdfeuclid.index = [x.split('/')[1] for x in recdfeuclid.index.values]
recdfeuclid.head()

Unnamed: 0,0,1,2,3,4,5,6,7
A_Hard_Day's_Night,0.015675,0.038802,0.029701,0.176021,0.00318,0.344165,0.070785,0.32167
And_I_Love_Her,0.001432,0.023466,0.031076,0.001793,0.000527,0.501843,0.062176,0.377686
Any_Time_At_All,0.00186,0.096625,0.207092,0.019844,0.002416,0.499649,0.140668,0.031846
Can't_Buy_Me_Love,0.000625,0.037014,0.045759,0.00013,0.000252,0.683018,0.173206,0.059996
I'll_Be_Back,0.000447,0.03128,0.175842,0.025778,0.001522,0.497182,0.175836,0.092115


With this function below, I found the closest neighbours to any song that is specified, and 

In [256]:
def similarityscore(song1,song2):
    ''' similarity score between songs'''
    np1 = np.array(song1)
    np2 = np.array(song2)
    difference = (np1 - np2)**2
    
    return sum(difference)



def dynamicscore(song1):
    ''' goes through dataframe to find closest songs'''
    song1 = recdfeuclid.loc[song1]
    errorlist = []
    for i in range(recdfeuclid.shape[0]):
        error = similarityscore(song1, recdfeuclid.iloc[i])
        errorlist.append((recdfeuclid.index[i], error))
    return errorlist
    
def closestsongs(name, length):
    ''' making the list. length is the number of closes songs you want to display'''
    elist = dynamicscore(name)
    return sorted(elist, key=lambda tup: tup[1])[1:length]

recommendedsongs = closestsongs("A_Hard_Day's_Night",20)
print("The 20 closest tracks to A Hard Day's Night is:")
print("")
pprint.pprint(recommendedsongs)


The 20 closest tracks to A Hard Day's Night is:

[('Nowhere_Man', 0.003638011804468988),
 ("I'll_Follow_The_Sun", 0.028758584413481003),
 ("Baby_It's_You", 0.029567901755058301),
 ('Get_Back', 0.032045870306774304),
 ('If_I_Needed_Someone', 0.032071151341369841),
 ('Strawberry_Fields_Forever', 0.03237347614049213),
 ('She_Came_In_Through_The_Bathroom_Window', 0.041557100916179479),
 ('Slow_Down', 0.043261941686978389),
 ('You_Never_Give_Me_Your_Money', 0.045129542779250187),
 ("Don't_Let_Me_Down", 0.04657285482971929),
 ('P.S._I_Love_You', 0.047582699834947562),
 ('Your_Mother_Should_Know', 0.047827463795329095),
 ('Blue_Jay_Way', 0.048820561780303251),
 ('Love_You_To', 0.048986601788328699),
 ("I'm_Only_Sleeping", 0.051808107856494701),
 ("Honey_Don't", 0.053818070183012878),
 ('I_Call_Your_Name', 0.054180807021566579),
 ('I_Need_You', 0.054755451878691019),
 ("She's_Leaving_Home", 0.055124233166594649)]


I decided to use the [T-distributed Stochastic Neighbour Embedding](http://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html) on sklearn to reduce the dimensionality of the data to 2D for visualisation for the distances between songs. I then used [plotly](https://plot.ly) to plot a scatter plot of the T-SNE results with hover text.

In [214]:
tsne = TSNE(n_components=2)
recdf_embedded = tsne.fit_transform(recdf.values)
recdf_embedded = pd.DataFrame(recdf_embedded)
recdf_embedded['filename'] = recdf.index
recdf_embedded['album_name'] = recdf_embedded['filename'].map(lambda x: x.split('/')[0])
recdf_embedded['song_name'] = recdf_embedded['filename'].map(lambda x: x.split('/')[1])
recdf_embedded = recdf_embedded.rename(columns={0:'0',1:'1'})

In [215]:
recdf_embedded.head()

Unnamed: 0,0,1,filename,album_name,song_name
0,2.794863,5.215832,A_Hard_Day_s_Night/A_Hard_Day's_Night,A_Hard_Day_s_Night,A_Hard_Day's_Night
1,3.753482,7.24225,A_Hard_Day_s_Night/And_I_Love_Her,A_Hard_Day_s_Night,And_I_Love_Her
2,9.012121,3.651296,A_Hard_Day_s_Night/Any_Time_At_All,A_Hard_Day_s_Night,Any_Time_At_All
3,8.331081,5.949312,A_Hard_Day_s_Night/Can't_Buy_Me_Love,A_Hard_Day_s_Night,Can't_Buy_Me_Love
4,8.558709,4.099196,A_Hard_Day_s_Night/I'll_Be_Back,A_Hard_Day_s_Night,I'll_Be_Back


In [216]:
recdf_embedded['album_name'].unique()[8]

'Please_Please_Me'

In [221]:
# Sorted the albums in chronological order
chronological_albums = [recdf_embedded['album_name'].unique()[8],
                                  recdf_embedded['album_name'].unique()[0],
                                  recdf_embedded['album_name'].unique()[2],
                                  recdf_embedded['album_name'].unique()[3],
                                  recdf_embedded['album_name'].unique()[10],
                                  recdf_embedded['album_name'].unique()[9],
                                  recdf_embedded['album_name'].unique()[11],
                                  recdf_embedded['album_name'].unique()[5],
                                  recdf_embedded['album_name'].unique()[12],
                                  recdf_embedded['album_name'].unique()[13],
                                  recdf_embedded['album_name'].unique()[14],
                                  recdf_embedded['album_name'].unique()[1],
                                  recdf_embedded['album_name'].unique()[4],
                                  recdf_embedded['album_name'].unique()[6],
                                  recdf_embedded['album_name'].unique()[7]]

In [222]:
print(chronological_albums)

['Please_Please_Me', 'A_Hard_Day_s_Night', 'Beatles_For_Sale', 'Help!', 'Rubber_Soul', 'Revolver', 'Sgt_Peppers_Lonely_Hearts_Club_Band', 'Magical_Mystery_Tour', 'The_Beatles_[White_Album]_CD1', 'The_Beatles_[White_Album]_CD2', 'Yellow_Submarine', 'Abbey_Road', 'Let_It_Be', 'Past_Masters_I', 'Past_Masters_II']


In [264]:
N = recdf_embedded.nunique()[3]
c= ['hsl(0,100%,{}%)'.format(h) for h in np.linspace(10, 100, N)] #colour coding for the plot
print(c)

['hsl(0,100%,10.0%)', 'hsl(0,100%,16.42857142857143%)', 'hsl(0,100%,22.857142857142858%)', 'hsl(0,100%,29.285714285714285%)', 'hsl(0,100%,35.714285714285715%)', 'hsl(0,100%,42.142857142857146%)', 'hsl(0,100%,48.57142857142857%)', 'hsl(0,100%,55.0%)', 'hsl(0,100%,61.42857142857143%)', 'hsl(0,100%,67.85714285714286%)', 'hsl(0,100%,74.28571428571429%)', 'hsl(0,100%,80.71428571428572%)', 'hsl(0,100%,87.14285714285714%)', 'hsl(0,100%,93.57142857142857%)', 'hsl(0,100%,100.0%)']


In [269]:
init_notebook_mode(connected=True)

In [270]:
def tracer(colour,albumname):
    ''' Uses plotly to plot the interactive chart'''
    tracer = go.Scattergl(
    x = recdf_embedded.loc[recdf_embedded['album_name']==albumname,'0'],
    y = recdf_embedded.loc[recdf_embedded['album_name']==albumname,'1'],
    name = albumname,
    mode = 'markers',
    marker = dict(
        size = 10,
        color = colour,
        line = dict(
            width = 2,
            color = 'rgb(0, 0, 0)'
        )
    ), text=recdf_embedded.loc[recdf_embedded['album_name']==albumname,'song_name']
    )
    return tracer

data = [tracer(c[ind],albumname) for ind,albumname in enumerate(chronological_albums)]



layout = dict(title = 'A Visualisation of Songs by The Beatles',
              yaxis = dict(zeroline = False),
              xaxis = dict(zeroline = False)
             )

fig = dict(data=data, layout=layout)
py.iplot(fig, filename='beatles_viz')

The results are pretty similar to the euclidian recommendation.

[link to visualisation](https://plot.ly/~melvinperera/2)