# Import liabraries

In [1]:
!pip install spotipy



In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import sigmoid_kernel
from sklearn.feature_extraction.text import TfidfVectorizer

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import skimage
from skimage import io

# Data Cleaning and Preprocessing

In [3]:
def clean_dataframe(csv_file):
    # Read the CSV file
    df = pd.read_csv(csv_file)

    # Drop the "Unnamed: 0" column
    df = df.drop("Unnamed: 0", axis=1)

    # Check for missing values
    print(df.isna().sum())

    # Drop missing values
    df = df.dropna()

    # Drop duplicate rows
    df = df.drop_duplicates()

    # Keep only the first occurrence of each track name
    df.drop_duplicates(subset=['track_name'], keep='first', inplace=True)

    # Convert the 'explicit' column to int type
    df['explicit'] = df['explicit'].astype(int)

    return df

In [4]:
df = clean_dataframe("dataset.csv")

track_id            0
artists             1
album_name          1
track_name          1
popularity          0
duration_ms         0
explicit            0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
track_genre         0
dtype: int64


In [5]:
df = clean_dataframe("dataset.csv")
random_state = 42
sample_df = df.sample(n=10000, random_state=random_state)
sample_df.reset_index(drop=True, inplace=True)
sample_df

track_id            0
artists             1
album_name          1
track_name          1
popularity          0
duration_ms         0
explicit            0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
track_genre         0
dtype: int64


Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,2gpz5SQb1NEPPxjbSglAZK,Pappo,Buscando un Amor,Trabajo Forzado,23,248066,0,0.547,0.6990,9,-4.634,1,0.0357,0.021300,0.008350,0.1310,0.622,75.814,4,heavy-metal
1,3oYj1UaQtkX0R9iuaV2jMj,Eterna Inocencia,No bien abran las flores,Hojas Amarillas,31,255905,0,0.576,0.7090,2,-5.722,1,0.0327,0.005140,0.000087,0.1250,0.579,140.003,4,hardcore
2,2WkjVlKU47qVKyPUehA5eH,SONNY NITEZ,MORNING 2007,MORNING 2007,52,118763,1,0.752,0.4850,5,-10.243,1,0.0432,0.255000,0.000000,0.3420,0.819,95.003,4,sad
3,4XBcShO4h4X55x9yMmyKHL,Allman Brown,1000 Years,Sweetest Thing,54,218502,0,0.546,0.4110,2,-9.308,1,0.0475,0.701000,0.000097,0.0955,0.528,79.785,4,acoustic
4,7pvMOx4cYIamyNeYLVhTWS,Tren Loco,Venas De Acero,Venas De Acero,25,362520,0,0.534,0.9490,4,-3.948,1,0.0665,0.001020,0.000177,0.5370,0.334,114.988,4,heavy-metal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1IrPSbGITPaMwMMEXfb2DJ,Ali Farka Touré;Toumani Diabaté,Ali & Toumani,Be Mankan,9,306960,0,0.583,0.2060,4,-12.297,1,0.0416,0.900000,0.380000,0.1230,0.483,123.362,3,afrobeat
9996,6r7FXNO57mlZCBY6PXcZZT,Deorro,Five Hours,Five Hours,61,326887,0,0.880,0.8210,5,-2.545,0,0.1180,0.019700,0.661000,0.2340,0.575,127.907,4,progressive-house
9997,7bu0znpSbTks0O6I98ij0W,The Smashing Pumpkins,Mellon Collie And The Infinite Sadness (Deluxe...,"Tonight, Tonight - Remastered 2012",69,254626,0,0.277,0.6700,6,-9.381,1,0.0376,0.000063,0.751000,0.2370,0.217,148.342,4,grunge
9998,1LZik4eONOZd2wuvHVdbmz,Bebe Neuwirth,Chicago The Musical (New Broadway Cast Recordi...,I Can't Do It Alone (Reprise),24,40533,0,0.334,0.0551,10,-18.918,1,0.0491,0.862000,0.000000,0.1190,0.448,164.618,1,show-tunes


# Recommendation System

## Feature Generation
One-hot Encoding  
Normalization

In [6]:
def one_hot_encoding(df, column, new_name): 
    ''' 
    Create One Hot Encoded features of a specific column
    ---
    Input: 
    df (pandas dataframe): Spotify Dataframe
    column (str): Column to be processed
    new_name (str): new column name to be used
        
    Output: 
    tf_df: One-hot encoded features 
    '''
    
    tf_df = pd.get_dummies(df[column])
    feature_names = tf_df.columns
    tf_df.columns = [new_name + "|" + str(i) for i in feature_names]
    tf_df.reset_index(drop = True, inplace = True)    
    return tf_df

In [7]:
def create_feature_set(df):
    '''
    Process spotify df to create a final set of features that will be used to generate recommendations
    ---
    Input: 
    df (pandas dataframe): Spotify Dataframe
            
    Output: 
    final (pandas dataframe): Final set of features 
    '''

    # One-hot Encoding
    genre_ohe = one_hot_encoding(df, 'track_genre', 'genre_encode')

    # Normalization
    feature_cols = ['popularity', 'duration_ms', 'explicit', 'danceability', 'energy', 'key', 
                'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 
                'liveness', 'valence', 'tempo', 'time_signature']
    scaler = MinMaxScaler()
    normalized_df =scaler.fit_transform(df[feature_cols])
    normalized_df = pd.DataFrame(normalized_df, columns=feature_cols)

    # Concatenate all features
    final = pd.concat([normalized_df, genre_ohe], axis=1)
    
    return final

In [8]:
complete_feature_set = create_feature_set(sample_df)
complete_feature_set.head()

Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,...,genre_encode|spanish,genre_encode|study,genre_encode|swedish,genre_encode|synth-pop,genre_encode|tango,genre_encode|techno,genre_encode|trance,genre_encode|trip-hop,genre_encode|turkish,genre_encode|world-music
0,0.247312,0.053714,0.0,0.557594,0.699,0.818182,0.809185,1.0,0.037226,0.021386,...,False,False,False,False,False,False,False,False,False,False
1,0.333333,0.055569,0.0,0.587156,0.709,0.181818,0.786535,1.0,0.034098,0.005161,...,False,False,False,False,False,False,False,False,False,False
2,0.55914,0.02311,1.0,0.766565,0.485,0.454545,0.692418,1.0,0.045047,0.256024,...,False,False,False,False,False,False,False,False,False,False
3,0.580645,0.046717,0.0,0.556575,0.411,0.181818,0.711883,1.0,0.049531,0.703815,...,False,False,False,False,False,False,False,False,False,False
4,0.268817,0.080803,0.0,0.544343,0.949,0.363636,0.823466,1.0,0.069343,0.001024,...,False,False,False,False,False,False,False,False,False,False


## content_based recommendation system
cosine similarity

In [9]:
# Create a pandas series with song titles as indices and indices as series values 
indices = pd.Series(sample_df.index, index=sample_df['track_name']).drop_duplicates()

# Create cosine similarity matrix based on given matrix
cosine = cosine_similarity(complete_feature_set)

In [10]:
def generate_recommendation(track_name, model_type):
    index=indices[track_name]
    score=list(enumerate(model_type[index]))
    similarity_score = sorted(score,key = lambda x:x[1],reverse = True)
    similarity_score = similarity_score[1:11]
    top_songs_index = [i[0] for i in similarity_score]
    top_songs=sample_df['track_name'].iloc[top_songs_index]
    return top_songs

In [11]:
# Song Recommendations using Cosine Similarity
print("Recommended Songs:")
print(generate_recommendation('Tearz',cosine).values)

Recommended Songs:
['Todo se transforma' 'A Vida É Curta Pra Viver Depois' 'Cria de Favela'
 'Happy' 'I Do It' "Eye Nyam Nam 'A' Mensuro - Henrik Schwarz Blend"
 'Princesa bacana' 'Samba Sambei' 'Quiero Desintegrar a Tu Novio'
 'La Carga']


## spotify api

In [12]:
with open('spotify_api.txt', 'r') as file:
    client_id = file.readline().strip()
    client_secret = file.readline().strip()

client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

## output df

In [13]:
def create_outputs(recommend_playlist_name):
    playlist = pd.DataFrame()
    for name in recommend_playlist_name:
        results = sp.search(q=name, type='track', limit=1)
        if results['tracks']['total'] > 0:
            for ix, i in enumerate(results['tracks']['items']):
                playlist = playlist.append({
                    'artist': i['artists'][0]['name'],
                    'name': i['name'],
                    'id': i['id'],
                    'url': i['album']['images'][1]['url']
                }, ignore_index=True)
        else:
            print('No results found for track:', name)
    return playlist

In [14]:
recommended_songs = generate_recommendation('Tearz',cosine).values
df_recommended_songs = create_outputs(recommended_songs)
df_recommended_songs

AttributeError: 'DataFrame' object has no attribute 'append'

## visulization

In [None]:
def visualize_songs(df):
    """ 
    Visualize cover art of the songs in the inputted dataframe

    Parameters: 
        df (pandas dataframe): Playlist Dataframe
    """
    
    temp = df['url'].values
    plt.figure(figsize=(15,int(0.625 * len(temp))))
    columns = 5
    
    for i, url in enumerate(temp):
        plt.subplot(int(len(temp) / columns) + 1, columns, i + 1)

        image = io.imread(url)
        plt.imshow(image)
        plt.xticks(color = 'w', fontsize = 0.1)
        plt.yticks(color = 'w', fontsize = 0.1)
        plt.xlabel(df['name'].values[i], fontsize = 12)
        plt.tight_layout(h_pad=0.4, w_pad=0)
        plt.subplots_adjust(wspace=None, hspace=None)
    
    plt.show()

In [None]:
visualize_songs(df_recommended_songs)