In [1]:
import os
import glob
import pickle
import numpy as np
import pandas as pd
from yellowbrick.cluster import SilhouetteVisualizer

from sentence_transformers import SentenceTransformer, util

import warnings
warnings.filterwarnings("ignore")



In [2]:
# load data
path = '../songsdata'
all_files = glob.glob(os.path.join(path, "*.csv"))
df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)

In [3]:
df.shape

(100000, 18)

In [4]:
df.head(3)

Unnamed: 0,artist,title,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration,time_signature,lyrics,genius_id,annotation
0,A Few Good Men,Tonite,0.615,0.673,5.0,-8.501,0.0,0.115,0.0429,0.0,0.261,0.831,178.124,254360.0,4.0,Tonite LyricsYou know I really want to love yo...,1382268.0,[]
1,A Few Good Men,Have I Never,0.717,0.377,4.0,-8.68,1.0,0.0288,0.282,0.0,0.138,0.161,120.036,336533.0,4.0,Have I Never LyricsHave I never told you I lov...,1187594.0,[]
2,A Few Good Men,Don't Cry (Behind My Back),0.774,0.691,10.0,-6.815,1.0,0.0403,0.0827,0.0,0.0736,0.85,134.05,308267.0,4.0,"Don’t Cry (Behind My Back) LyricsDon't cry, do...",897142.0,[]


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   artist            98984 non-null  object 
 1   title             98895 non-null  object 
 2   danceability      98982 non-null  float64
 3   energy            98982 non-null  float64
 4   key               98982 non-null  float64
 5   loudness          98982 non-null  float64
 6   mode              98982 non-null  float64
 7   speechiness       98982 non-null  float64
 8   acousticness      98982 non-null  float64
 9   instrumentalness  98982 non-null  float64
 10  liveness          98982 non-null  float64
 11  valence           98982 non-null  float64
 12  tempo             98982 non-null  float64
 13  duration          98982 non-null  float64
 14  time_signature    98982 non-null  float64
 15  lyrics            94703 non-null  object 
 16  genius_id         94653 non-null  float

#### Data Cleaning
based on the lyrics we got, we found there are some text which are not lyrics contained in the Genius API database. Hence, we need to filter out those text. We found that we might be able to filter through line length, since songs line would not be too long to fit with the song tempo. The songs length are normally within the similar length as well. So we filtered out with average line length and songs' length that is over 1 standard deviation.

In [6]:
# data cleaning
df.lyrics = df.lyrics.astype(str)
df['lyrics_list'] = df.lyrics.apply(lambda x: x.splitlines())

In [7]:
pre_df = df.copy()

In [8]:
# function to find average line length in song
def avg_length(lyrics_list):
    avg_len = np.mean([len(line.split()) for line in lyrics_list])
    return avg_len
    # for line in lyrics_list:
    #     l = line.split()
    #     length.append(len(l))

In [9]:
pre_df['line_len'] = pre_df.lyrics_list.apply(lambda x : avg_length(x))
pre_df['song_len'] = pre_df.lyrics_list.apply(lambda x : len(x))

In [10]:
pre_df = pre_df[((pre_df.line_len - pre_df.line_len.mean())/pre_df.line_len.std()).abs() < 1]
pre_df = pre_df[((pre_df.song_len - pre_df.song_len.mean())/pre_df.song_len.std()).abs() < 1]

In [11]:
pre_df.shape

(96155, 21)

In [12]:
pre_df.head()

Unnamed: 0,artist,title,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,...,valence,tempo,duration,time_signature,lyrics,genius_id,annotation,lyrics_list,line_len,song_len
0,A Few Good Men,Tonite,0.615,0.673,5.0,-8.501,0.0,0.115,0.0429,0.0,...,0.831,178.124,254360.0,4.0,Tonite LyricsYou know I really want to love yo...,1382268.0,[],[Tonite LyricsYou know I really want to love y...,6.666667,66
1,A Few Good Men,Have I Never,0.717,0.377,4.0,-8.68,1.0,0.0288,0.282,0.0,...,0.161,120.036,336533.0,4.0,Have I Never LyricsHave I never told you I lov...,1187594.0,[],[Have I Never LyricsHave I never told you I lo...,8.196078,51
2,A Few Good Men,Don't Cry (Behind My Back),0.774,0.691,10.0,-6.815,1.0,0.0403,0.0827,0.0,...,0.85,134.05,308267.0,4.0,"Don’t Cry (Behind My Back) LyricsDon't cry, do...",897142.0,[],"[Don’t Cry (Behind My Back) LyricsDon't cry, d...",5.371429,70
3,A Flock Of Seagulls,Modern Love Is Automatic,0.323,0.821,7.0,-6.245,1.0,0.109,0.0444,0.979,...,0.192,162.7,230200.0,4.0,Modern Love Is Automatic LyricsShe's an automa...,969269.0,[],[Modern Love Is Automatic LyricsShe's an autom...,3.147059,34
4,A Flock Of Seagulls,D.N.A.,0.477,0.746,0.0,-5.487,1.0,0.0322,0.000136,0.925,...,0.818,151.741,150227.0,4.0,,,,[nan],1.0,1


In [13]:
pre_df.isnull().sum()/len(pre_df)

artist              0.010566
title               0.011492
danceability        0.010587
energy              0.010587
key                 0.010587
loudness            0.010587
mode                0.010587
speechiness         0.010587
acousticness        0.010587
instrumentalness    0.010587
liveness            0.010587
valence             0.010587
tempo               0.010587
duration            0.010587
time_signature      0.010587
lyrics              0.000000
genius_id           0.055577
annotation          0.055577
lyrics_list         0.000000
line_len            0.000000
song_len            0.000000
dtype: float64

In [14]:
# percent dropna
len(pre_df.dropna())/len(pre_df)*100

94.34974780302636

In [15]:
cleaned_df = pre_df.dropna()

In [16]:
cleaned_df.reset_index(inplace=True, drop=True)

In [17]:
cleaned_df.shape

(90722, 21)

In [18]:
cleaned_df.to_csv('../cleaned_songs.csv', index=False)

In [19]:
len(cleaned_df)/len(df)*100

90.72200000000001

#### Split the data for fine-tuning pair and test embeddings

In [20]:
# shuffle then split the data in half so we can get the distribution of songs evenly amongst different artists and styles.
from sklearn.model_selection import train_test_split
X = cleaned_df
train, test = train_test_split(X, test_size=0.5, random_state=42)

In [21]:
# sampling songs from train and test sets
sample_train = train.sample(n=1000, random_state=42)
sample_test = test.sample(n=1000, random_state=42)

_pickle = [sample_train, sample_test]

In [22]:
with open('../App/pickle_objects/train_test.pickle', 'wb') as f:
    pickle.dump(_pickle, f)

In [23]:
sample_train.shape

(1000, 21)

In [24]:
sample_test.shape

(1000, 21)