In [2]:
# similarity search
import pandas as pd

In [3]:
import pickle

import sklearn.metrics.pairwise as pw
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [4]:
# Load the track catalogue from the working directory. The file is read
# without `index_col`, so the index column stored in the CSV comes back as a
# regular column named 'Unnamed: 0' (visible in the displayed frame).
df = pd.read_csv('all_tracks.csv')
# Bare expression on the last line: shows the frame as the cell output.
df

Unnamed: 0,trackName,artistName,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,id,duration_ms,time_signature
0,"""In The Hall Of The Mountain King"" from Peer G...",London Symphony Orchestra,Classical,0.475,0.130,7,-17.719,1,0.0510,0.9160,0.956000,0.1010,0.122,112.241,14Qcrx6Dfjvcj0H8oV8oUW,150827,4
1,#BrooklynBloodPop!,SyKo,Electronic/Dance Music,0.691,0.814,1,-3.788,0,0.1170,0.0164,0.000000,0.3660,0.509,132.012,7K9Z3yFNNLv5kwTjQYGjnu,145611,4
2,$10,Good Morning,Pop,0.624,0.596,4,-9.804,1,0.0314,0.4750,0.203000,0.1190,0.896,120.969,3koAwrM1RO0TGMeQJ3qt9J,89509,4
3,(I Just) Died In Your Arms,Cutting Crew,Rock,0.625,0.726,11,-11.402,0,0.0444,0.0158,0.000169,0.0625,0.507,124.945,4ByEFOBuLXpCqvO1kw8Wdm,280400,4
4,"...And to Those I Love, Thanks for Sticking Ar...",$uicideboy$,Rap/Hip Hop,0.792,0.511,2,-6.876,1,0.0409,0.1240,0.000090,0.1400,0.111,113.983,30QR0ndUdiiMQMA9g1PGCm,168490,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,Cool Down,Kolohe Kai,Reggae,0.911,0.722,10,-3.967,1,0.0389,0.1250,0.000000,0.0922,0.861,140.032,6uJaTP7EbaHXJ5PM09s0uV,211147,4
9996,Fire,"Soul Brown, Kabin Bread Boyz",Reggae,0.718,0.789,7,-5.877,1,0.0346,0.1280,0.000000,0.0882,0.693,104.998,2DyRPkBu8gHPdhxnUjQ86D,244492,4
9997,Coming Home,PAUA,Reggae,0.894,0.413,7,-7.931,1,0.0480,0.0182,0.000008,0.2610,0.706,120.942,0b3MO0oAXgxpUUh1idkBcW,331573,4
9998,Party,House Of Shem,Reggae,0.845,0.537,0,-6.897,1,0.1480,0.2140,0.000000,0.1020,0.649,143.997,0AHbwnfwocPdIN9CvQCYKr,277037,4


In [9]:
# Build the feature matrix and the track-by-track similarity table.
#
# Columns are routed by dtype: every int64/float64 column goes through the
# numeric pipeline and every object column through the categorical pipeline.
# NOTE(review): this includes identifier-like columns shown in the frame
# display (the 'Unnamed: 0' row counter, 'id', 'trackName'), so they also
# contribute to the similarity scores -- confirm this is intended.
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
cat_cols = df.select_dtypes(include=['object']).columns

# Numeric features: fill gaps with the column median, then standardise.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown='ignore' makes the fitted encoder emit an all-zero
# group for categories it never saw instead of raising; without it the
# preprocessor (which is pickled for reuse further down) cannot transform a
# row containing any unseen value. It does not change the output of
# fit_transform on the data the encoder is fitted on.
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

# Fit the pipeline to the data
X = preprocessor.fit_transform(df)

# Pairwise cosine similarity between every pair of rows of X. The result is
# a dense len(df) x len(df) matrix.
sim_matrix = cosine_similarity(X, X)

# Label rows and columns with df's index so a row label maps back to a track.
sim_df = pd.DataFrame(sim_matrix, index=df.index, columns=df.index)

track_names = df['trackName'].values.tolist()


# Functions to get the most similar tracks.
#
# Every public get_similar_* function below is the same lookup applied to a
# different column of all_tracks.csv, so they all delegate to one helper.
def _similar_to_first_match(column, value, sim_df, top_n=5):
    """Return the tracks most similar to the first row where ``column == value``.

    The catalogue is re-read from 'all_tracks.csv' on every call. The first
    row whose ``column`` entry equals ``value`` is used as the seed; later
    matching rows are ignored.

    Args:
        column: Name of the CSV column to search.
        value: Value to look for; compared with ``==`` against the column's
            values converted to plain Python objects (exact match, also for
            floats).
        sim_df: Square similarity table whose row/column labels are the row
            positions of the CSV (0..n-1), as produced from a default index.
        top_n: Maximum number of results to return (default 5).

    Returns:
        A list of dicts with keys 'trackName', 'artistName' and 'similarity',
        ordered from most to least similar, seed row excluded.

    Raises:
        KeyError: If ``column`` is not a column of the CSV.
        ValueError: If no row has ``value`` in ``column``.
    """
    df = pd.read_csv('all_tracks.csv')
    if column not in df.columns:
        raise KeyError(
            'column {!r} not found in all_tracks.csv; available columns: {}'
            .format(column, list(df.columns))
        )

    candidates = df[column].values.tolist()
    try:
        seed_pos = candidates.index(value)
    except ValueError:
        # Same exception type as list.index, but with a message that says
        # which lookup failed.
        raise ValueError(
            '{!r} not found in column {!r} of all_tracks.csv'.format(value, column)
        ) from None

    ranked = sim_df.iloc[seed_pos].sort_values(ascending=False)
    # Remove the seed itself by label; labels equal positions for the
    # default index (see the sim_df note in the docstring).
    ranked = ranked.drop(seed_pos)
    ranked = ranked.head(top_n)

    return [
        {
            'trackName': df.iloc[idx]['trackName'],
            'artistName': df.iloc[idx]['artistName'],
            'similarity': score,
        }
        for idx, score in ranked.items()
    ]


def get_similar_tracks(track_name, sim_df, top_n=5):
    """Return the ``top_n`` tracks most similar to the first track titled ``track_name``.

    Raises ValueError if no track has that title.
    """
    return _similar_to_first_match('trackName', track_name, sim_df, top_n)


def get_similar_artists(artist_name, sim_df, top_n=5):
    """Return the ``top_n`` tracks most similar to the first track by ``artist_name``.

    The result is a list of tracks, not artists: only the first row whose
    'artistName' equals ``artist_name`` is used as the seed.
    Raises ValueError if the artist is not present.
    """
    return _similar_to_first_match('artistName', artist_name, sim_df, top_n)


def get_similar_genres(genre, sim_df, top_n=5):
    """Return the ``top_n`` tracks most similar to the first track tagged ``genre``.

    Raises ValueError if the genre is not present.
    """
    return _similar_to_first_match('genre', genre, sim_df, top_n)


def get_similar_albums(album, sim_df, top_n=5):
    """Return the ``top_n`` tracks most similar to the first track on ``album``.

    Requires an 'albumName' column in all_tracks.csv and raises KeyError when
    it is missing. NOTE(review): the frame displayed earlier in this notebook
    has no such column -- confirm the intended data source.
    """
    return _similar_to_first_match('albumName', album, sim_df, top_n)


def get_similar_playlists(playlist, sim_df, top_n=5):
    """Return the ``top_n`` tracks most similar to the first track in ``playlist``.

    Requires a 'playlistName' column in all_tracks.csv and raises KeyError
    when it is missing. NOTE(review): the frame displayed earlier in this
    notebook has no such column -- confirm the intended data source.
    """
    return _similar_to_first_match('playlistName', playlist, sim_df, top_n)


def get_similar_years(year, sim_df, top_n=5):
    """Return the ``top_n`` tracks most similar to the first track from ``year``.

    Requires a 'year' column in all_tracks.csv and raises KeyError when it is
    missing. NOTE(review): the frame displayed earlier in this notebook has
    no such column -- confirm the intended data source.
    """
    return _similar_to_first_match('year', year, sim_df, top_n)


def get_similar_durations(duration, sim_df, top_n=5):
    """Return the ``top_n`` tracks most similar to the first track whose
    'duration_ms' equals ``duration`` exactly.

    Raises ValueError if no track has that exact duration.
    """
    return _similar_to_first_match('duration_ms', duration, sim_df, top_n)


def get_similar_popularities(popularity, sim_df, top_n=5):
    """Return the ``top_n`` tracks most similar to the first track whose
    'popularity' equals ``popularity``.

    Requires a 'popularity' column in all_tracks.csv and raises KeyError when
    it is missing. NOTE(review): the frame displayed earlier in this notebook
    has no such column -- confirm the intended data source.
    """
    return _similar_to_first_match('popularity', popularity, sim_df, top_n)


def get_similar_acousticness(acousticness, sim_df, top_n=5):
    """Return the ``top_n`` tracks most similar to the first track whose
    'acousticness' equals ``acousticness`` exactly (no tolerance).

    Raises ValueError if no track has that exact value.
    """
    return _similar_to_first_match('acousticness', acousticness, sim_df, top_n)


def get_similar_danceability(danceability, sim_df, top_n=5):
    """Return the ``top_n`` tracks most similar to the first track whose
    'danceability' equals ``danceability`` exactly (no tolerance).

    Raises ValueError if no track has that exact value.
    """
    return _similar_to_first_match('danceability', danceability, sim_df, top_n)


def get_similar_energy(energy, sim_df, top_n=5):
    """Return the ``top_n`` tracks most similar to the first track whose
    'energy' equals ``energy`` exactly (no tolerance).

    Raises ValueError if no track has that exact value.
    """
    return _similar_to_first_match('energy', energy, sim_df, top_n)


def get_similar_instrumentalness(instrumentalness, sim_df, top_n=5):
    """Return the ``top_n`` tracks most similar to the first track whose
    'instrumentalness' equals ``instrumentalness`` exactly (no tolerance).

    Raises ValueError if no track has that exact value.
    """
    return _similar_to_first_match('instrumentalness', instrumentalness, sim_df, top_n)


def get_similar_liveness(liveness, sim_df, top_n=5):
    """Return the ``top_n`` tracks most similar to the first track whose
    'liveness' equals ``liveness`` exactly (no tolerance).

    Raises ValueError if no track has that exact value.
    """
    return _similar_to_first_match('liveness', liveness, sim_df, top_n)


def get_similar_loudness(loudness, sim_df, top_n=5):
    """Return the ``top_n`` tracks most similar to the first track whose
    'loudness' equals ``loudness`` exactly (no tolerance).

    Raises ValueError if no track has that exact value.
    """
    return _similar_to_first_match('loudness', loudness, sim_df, top_n)


def get_similar_speechiness(speechiness, sim_df, top_n=5):
    """Return the ``top_n`` tracks most similar to the first track whose
    'speechiness' equals ``speechiness`` exactly (no tolerance).

    Raises ValueError if no track has that exact value.
    """
    return _similar_to_first_match('speechiness', speechiness, sim_df, top_n)


def get_similar_tempo(tempo, sim_df, top_n=5):
    """Return the ``top_n`` tracks most similar to the first track whose
    'tempo' equals ``tempo`` exactly (no tolerance).

    Raises ValueError if no track has that exact value.
    """
    return _similar_to_first_match('tempo', tempo, sim_df, top_n)


def get_similar_valence(valence, sim_df, top_n=5):
    """Return the ``top_n`` tracks most similar to the first track whose
    'valence' equals ``valence`` exactly (no tolerance).

    Raises ValueError if no track has that exact value.
    """
    return _similar_to_first_match('valence', valence, sim_df, top_n)

In [16]:
# Smoke test: seed from the first catalogue row titled 'The Box' and list the
# five rows with the highest similarity to it (the seed row is excluded).
get_similar_tracks('The Box', sim_df)

[{'trackName': 'Ego Death (feat. Kanye West, FKA twigs & Skrillex)',
  'artistName': 'Ty Dolla $ign',
  'similarity': 0.8509783837086426},
 {'trackName': 'Obsessed',
  'artistName': 'Mariah Carey',
  'similarity': 0.8404544824040168},
 {'trackName': 'Party On My Mind',
  'artistName': 'KK',
  'similarity': 0.8352338361188558},
 {'trackName': 'Gangsta Nation',
  'artistName': 'Westside Connection',
  'similarity': 0.8315412222624571},
 {'trackName': 'Good Goodbye (feat. Pusha T and Stormzy)',
  'artistName': 'Linkin Park',
  'similarity': 0.8313713837528154}]

In [12]:
# Get the most similar tracks to 'Shape of You'. The lookup is by title only
# and uses the first matching row, so the artist is not part of the query
# (presumably Ed Sheeran's recording -- verify against the CSV).
get_similar_tracks('Shape of You', sim_df)

[{'trackName': 'Guitar Sikhda',
  'artistName': 'Jassie Gill',
  'similarity': 0.705971341836273},
 {'trackName': 'How Long',
  'artistName': 'Charlie Puth',
  'similarity': 0.7049208855787651},
 {'trackName': 'Know Me Too Well',
  'artistName': 'New Hope Club',
  'similarity': 0.7037148232099697},
 {'trackName': "Stop Callin' Me - Radio Edit",
  'artistName': 'Shakaya',
  'similarity': 0.6940196901274142},
 {'trackName': 'Love You Like A Love Song',
  'artistName': 'Selena Gomez & The Scene',
  'similarity': 0.6930565226451737}]

In [13]:
# Get the tracks most similar to the first catalogue row whose artistName is
# 'Tame Impala'. The results are tracks, not artists.
# NOTE(review): this was labelled as a lookup for 'The Less I Know The
# Better', but the seed is whichever Tame Impala track appears first in the
# CSV -- confirm which one that is.
get_similar_artists('Tame Impala', sim_df)

[{'trackName': "Beggin'",
  'artistName': 'Måneskin',
  'similarity': 0.55445878379639},
 {'trackName': 'Southern Palm (Your Lifeline)',
  'artistName': 'PETER LAKE',
  'similarity': 0.5167007964017288},
 {'trackName': 'Hands On You',
  'artistName': 'Austin George',
  'similarity': 0.5123517691282937},
 {'trackName': 'Are You Gonna Go My Way',
  'artistName': 'Lenny Kravitz',
  'similarity': 0.5120620115820766},
 {'trackName': 'June Gloom',
  'artistName': 'RiSi',
  'similarity': 0.5030228331787167}]

In [15]:
# Tracks most similar to the first catalogue row whose genre is 'Pop'. The
# seed is a single track, so this is not a genre-level comparison.
get_similar_genres('Pop', sim_df)

[{'trackName': '29',
  'artistName': 'Chris James',
  'similarity': 0.6369298092392782},
 {'trackName': 'Chaand Baaliyan',
  'artistName': 'Aditya A',
  'similarity': 0.6294556131674326},
 {'trackName': 'Cafe at last',
  'artistName': 'RADWIMPS',
  'similarity': 0.6042717017500187},
 {'trackName': 'not that inna you',
  'artistName': 'cookii',
  'similarity': 0.577522718085524},
 {'trackName': 'Pain',
  'artistName': 'PinkPantheress',
  'similarity': 0.5701283262652322}]

In [6]:
# Persist the similarity table so it can be reused without recomputing it.
# `pickle` is imported in the notebook's import cell, so this cell and the
# other pickle cells no longer depend on each other's execution order.
with open('similarity.pkl', 'wb') as f:
    pickle.dump(sim_df, f)

# Round-trip check: load the table back from disk, rebinding `sim_df` to the
# unpickled copy. pickle.load can execute arbitrary code, so only load files
# produced by this notebook or another trusted source.
with open('similarity.pkl', 'rb') as f:
    sim_df = pickle.load(f)
    

In [7]:
# save all track names to a file
# (relies on `pickle` having been imported earlier in the notebook)
track_names = df['trackName'].values.tolist()
with open('track_names.pkl', 'wb') as f:
    pickle.dump(track_names, f)
    
# load track names from file
# Rebinds `track_names` to the unpickled list. pickle.load can execute
# arbitrary code, so only load files from a trusted source.
with open('track_names.pkl', 'rb') as f:
    track_names = pickle.load(f)


In [16]:
# pickling the preprocessor
# Stores the fitted ColumnTransformer (imputers, scaler and encoder state).
# NOTE(review): unpickling scikit-learn objects is presumably only reliable
# with the library version that wrote them -- confirm before reusing the
# file in another environment.
with open('preprocessor.pkl', 'wb') as f:
    pickle.dump(preprocessor, f)
    
# load the preprocessor
# Rebinds `preprocessor` to the unpickled copy. pickle.load can execute
# arbitrary code, so only load files from a trusted source.
with open('preprocessor.pkl', 'rb') as f:
    preprocessor = pickle.load(f)