In [1]:
import glob
import os
import pickle
import warnings

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from sklearn.model_selection import train_test_split
from yellowbrick.cluster import SilhouetteVisualizer

warnings.filterwarnings("ignore")

In [None]:
# load data
# Read every CSV under `path` and stack them into a single frame.
# glob returns matches in arbitrary, filesystem-dependent order, so the list is
# sorted: the row order of `df` feeds the seeded split/sampling further down,
# which is only reproducible when the input order is stable.
path = '../songsdata'
all_files = sorted(glob.glob(os.path.join(path, "*.csv")))
if not all_files:
    # pd.concat would otherwise fail with an opaque "No objects to concatenate".
    raise FileNotFoundError(f"No CSV files found in {path!r}")
df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)

In [None]:
# Sanity check: (rows, columns) of the combined frame.
df.shape

In [None]:
# Peek at the first three rows to eyeball the available columns.
df.head(3)

In [None]:
# Column dtypes and non-null counts (info() prints its report directly).
df.info()

#### Data Cleaning
Based on the lyrics we retrieved, we found that the Genius API database contains some text that is not actually lyrics, so we need to filter it out. Line length is a useful signal: lines in a song cannot be too long if they are to fit the song's tempo. Songs also tend to have a similar number of lines. We therefore keep only the songs whose average line length (in words) and song length (in lines) are within 1 standard deviation of the mean.

In [None]:
# data cleaning
# Fill missing lyrics with '' *before* casting: astype(str) on its own can turn
# NaN into the literal text "nan", which would then look like a one-line song.
# An empty string splits into [], whose average line length is NaN, so the
# row is rejected by the z-score comparison instead of slipping through.
df['lyrics'] = df['lyrics'].fillna('').astype(str)
# One list entry per lyric line.
df['lyrics_list'] = df['lyrics'].apply(str.splitlines)

In [None]:
# Independent copy: the feature columns and filters are applied to `pre_df`,
# while `df` keeps every loaded row as the baseline for the retention percentage.
pre_df = df.copy()

In [None]:
# function to find average line length in song
def avg_length(lyrics_list):
    """Return the mean number of whitespace-separated words per line.

    Parameters
    ----------
    lyrics_list : iterable of str
        The lines of one song. Blank lines count as zero words.

    Returns
    -------
    float
        Mean words per line, or NaN when there are no lines at all.
    """
    word_counts = [len(line.split()) for line in lyrics_list]
    if not word_counts:
        # np.mean([]) also yields NaN, but only after emitting a RuntimeWarning;
        # return the same value explicitly instead.
        return np.nan
    return np.mean(word_counts)

In [None]:
# Per-song features used by the outlier filter:
#   line_len - mean words per line (computed by avg_length)
#   song_len - number of lines in the song
pre_df['line_len'] = pre_df['lyrics_list'].apply(avg_length)
pre_df['song_len'] = pre_df['lyrics_list'].apply(len)

In [None]:
# Keep only songs within one standard deviation of the mean, first on average
# line length and then on song length. The filters run in sequence, so the
# song_len mean/std are computed on the rows that survived the line_len filter.
for feature in ('line_len', 'song_len'):
    z_scores = (pre_df[feature] - pre_df[feature].mean()) / pre_df[feature].std()
    pre_df = pre_df[z_scores.abs() < 1]

In [None]:
# (rows, columns) remaining after the outlier filter.
pre_df.shape

In [None]:
# First rows of the filtered frame, including the new line_len / song_len columns.
pre_df.head()

In [None]:
# Fraction of missing values in each column (mean of the boolean null mask).
pre_df.isna().mean()

In [None]:
# Percentage of rows that contain no null in any column, i.e. the share of
# `pre_df` that would survive dropna().
len(pre_df.dropna())/len(pre_df)*100

In [None]:
# Drop every row that has a null in any column (dropna defaults).
# NOTE(review): this treats all columns as required — confirm that is intended.
cleaned_df = pre_df.dropna()

In [None]:
# Renumber the rows 0..n-1 after filtering and discard the old index.
# Rebinding the name (rather than inplace=True) avoids mutating the frame in place.
cleaned_df = cleaned_df.reset_index(drop=True)

In [None]:
# (rows, columns) of the final cleaned frame.
cleaned_df.shape

In [None]:
# Persist the cleaned songs without writing the index column.
# NOTE(review): the list-valued `lyrics_list` column is written as its string
# representation, so it comes back as plain text when the CSV is re-read —
# confirm that consumers re-split `lyrics` rather than rely on it.
cleaned_df.to_csv('../cleaned_songs.csv', index=False)

In [None]:
# Percentage of the originally loaded rows that remain after cleaning.
len(cleaned_df)/len(df)*100

#### Split the data for fine-tuning pairs and test embeddings

In [None]:
# Shuffle, then split the cleaned songs in half so that both halves draw from
# the full range of artists and styles; the fixed seed makes the split
# reproducible.
# NOTE(review): a plain random split does not guarantee an even artist/style
# mix in each half — pass `stratify=` if that balance is required.
X = cleaned_df
train, test = train_test_split(X, test_size=0.5, random_state=42)

In [None]:
# Draw fixed-size, seeded random subsets from each half:
#   1000 songs from `train` for fine-tuning the model,
#   3000 songs from `test` for testing the model.
sample_train, sample_test = (
    split.sample(n=size, random_state=42)
    for split, size in ((train, 1000), (test, 3000))
)

# Bundle both samples so they can be stored together in one pickle file.
_pickle = [sample_train, sample_test]

In [None]:
# Serialize the [sample_train, sample_test] pair for the app to load later.
# The target directory must already exist.
pickle_path = '../App_stcloud/pickle_objects/train_test.pickle'
with open(pickle_path, 'wb') as pickle_file:
    pickle.dump(_pickle, pickle_file)

In [None]:
# Confirm the fine-tuning sample size: (rows, columns).
sample_train.shape

In [None]:
# Confirm the test sample size: (rows, columns).
sample_test.shape