# Song recommender

## imports

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import config

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from matplotlib import pyplot
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
%matplotlib inline


In [2]:
# extract song names from top 100
def get_songs(containers):
    """Return the song title found in each chart-row container.

    Args:
        containers: Sequence of parsed HTML elements, one per chart row.
            Each must expose ``find('h3')`` returning an element with
            ``get_text()``.

    Returns:
        A list with one title per container, in the same order, with all
        newline and tab characters removed.
    """
    # Deleting '\n' and '\t' in one pass strips the layout whitespace the
    # markup wraps around the title text.
    layout_chars = str.maketrans('', '', '\n\t')
    return [
        row.find('h3').get_text().translate(layout_chars)
        for row in containers
    ]


In [3]:
# extract artists from top 100
def get_artists(containers):
    """Return the artist label found in each chart-row container.

    Args:
        containers: Sequence of parsed HTML elements, one per chart row.
            Each must expose ``find(name, attrs)`` returning an element
            with ``get_text()``.

    Returns:
        A list with one artist string per container, in the same order,
        with all newline and tab characters removed.
    """
    # NOTE(review): 'a-no-trucate' is reproduced verbatim; presumably it
    # mirrors the class name used in the page markup - confirm before
    # "correcting" the spelling.
    selector = {'class': 'a-no-trucate'}
    layout_chars = str.maketrans('', '', '\n\t')

    artists = []
    for row in containers:
        label = row.find('span', selector).get_text()
        artists.append(label.translate(layout_chars))
    return artists

In [4]:
# Scrape the Billboard Hot 100 page into a DataFrame with one
# (Artist, Song) row per chart entry.
url = 'https://www.billboard.com/charts/hot-100/'
# Without a timeout a stalled connection would block this cell forever.
res = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page into an
# empty chart.
res.raise_for_status()

soup = BeautifulSoup(res.text, 'html.parser')
top100 = soup.find_all('div', {'class': 'o-chart-results-list-row-container'})

artists = get_artists(top100)
songs = get_songs(top100)

top100_df = pd.DataFrame(zip(artists, songs), columns=['Artist', 'Song'])
# Use a 1-based index (rows stay in the order they were scraped).
top100_df.index = np.arange(1, len(top100_df) + 1)


In [5]:
def recommender(df):
    """Ask for a song and, if it is in ``df``, recommend a different one.

    The artist and song names are read interactively with ``input`` and
    compared case-insensitively (ignoring surrounding whitespace) against
    the 'Artist' and 'Song' columns.

    Args:
        df: DataFrame with 'Artist' and 'Song' columns. Any index is
            accepted; rows are addressed by position.

    Returns:
        ``'<artist> - <song>'`` for a randomly chosen row that does not
        match the entered song, or an explanatory message when the song
        is not in ``df`` or when there is no other row to recommend.
    """
    artist = input('Artist name: ').strip().lower()
    song = input('Song name: ').strip().lower()

    def _normalise(value):
        return str(value).strip().lower()

    # One flag per row. Every matching row is excluded, so a song listed
    # more than once can neither crash the lookup nor be recommended back.
    is_match = [
        _normalise(row_artist) == artist and _normalise(row_song) == song
        for row_artist, row_song in zip(df['Artist'], df['Song'])
    ]

    if not any(is_match):
        return 'Unfortunately, the song is not in the hot list!'

    # Positions rather than labels: works for any index, not only 1..n.
    candidates = [pos for pos, hit in enumerate(is_match) if not hit]
    if not candidates:
        return 'Unfortunately, there is no other song to recommend!'

    pick = df.iloc[np.random.choice(candidates)]
    return f"{pick['Artist']} - {pick['Song']}"


## 2. Spotify API

In [6]:
# Shared Spotify client used by the helpers below; the client id and
# secret are read from the local `config` module.
sp = spotipy.Spotify(
    auth_manager=SpotifyClientCredentials(
        client_id=config.client_id,
        client_secret=config.client_secret,
    )
)


In [7]:
# get playlist tracks
def get_tracks(playlist_id):
    """Return every item of a playlist, following pagination to the end.

    Args:
        playlist_id: Id of the playlist to read (requested for user
            'spotify' with market "GB").

    Returns:
        The list of playlist items from the first page, extended with the
        items of each following page.
    """
    page = sp.user_playlist_tracks('spotify', playlist_id, market="GB")
    collected = page['items']

    # 'next' is falsy on the last page; until then keep requesting pages.
    while page['next']:
        page = sp.next(page)
        collected += page['items']

    return collected


In [8]:
# extract song ids
def extract_ids(playlist):
    """Return the track id of every playlist item, in order.

    Args:
        playlist: Sequence of playlist items, each a mapping whose
            'track' entry is a mapping with an 'id' key.

    Returns:
        A list of the ``item['track']['id']`` values.
    """
    return [item['track']['id'] for item in playlist]


In [9]:
# extract song features for each id
def extract_features(ids):
    """Look up the audio features of each track id, one request per id.

    Args:
        ids: Iterable of track ids.

    Returns:
        A list with exactly one entry per id, in input order: the first
        element of ``sp.audio_features(id)``, or ``None`` when that lookup
        raised ``TypeError``. Keeping a placeholder (instead of skipping
        the id) keeps the result positionally aligned with ``ids``.
    """
    features_by_id = []

    for track_id in ids:
        try:
            track_features = sp.audio_features(track_id)[0]
        except TypeError:
            # NOTE(review): presumably raised when the lookup yields
            # nothing subscriptable for this id - confirm against the
            # client library.
            track_features = None
        features_by_id.append(track_features)

    return features_by_id


In [10]:
# playlist to dataframe
def to_df(playlist_id):
    """Build a DataFrame of audio features for every track of a playlist.

    Args:
        playlist_id: Id of the playlist to load.

    Returns:
        DataFrame with an 'id' column followed by one column per audio
        feature, excluding 'type', 'uri', 'track_href', 'analysis_url',
        'time_signature' and 'duration_ms'. Tracks for which no features
        were returned are left out, so the frame has no all-missing rows.

    Raises:
        ValueError: If the number of feature records differs from the
            number of track ids, since they could not be paired reliably.
    """
    playlist = get_tracks(playlist_id)
    ids = extract_ids(playlist)
    features = extract_features(ids)

    if len(ids) != len(features):
        raise ValueError(
            f'got {len(features)} feature records for {len(ids)} track ids')

    # Keep only tracks that actually have a feature record.
    kept = [(track_id, feats) for track_id, feats in zip(ids, features) if feats]

    id_frame = pd.DataFrame({'id': [track_id for track_id, _ in kept]})
    # The records carry their own 'id'; drop it so the column is not
    # duplicated. errors='ignore' tolerates an empty playlist.
    feature_frame = pd.DataFrame([feats for _, feats in kept]).drop(
        columns=['id', 'type', 'uri', 'track_href', 'analysis_url',
                 'time_signature', 'duration_ms'],
        errors='ignore',
    )

    return pd.concat([id_frame, feature_frame], axis=1)


In [11]:
# Load both source playlists and stack them into one frame with a fresh
# 0..n-1 index.
df = pd.concat(
    [
        to_df(playlist_id)
        for playlist_id in ('0iAeUtwINlqfjwAyQ4ykur', '37i9dQZF1DWXWbLEOaHnU3')
    ],
    ignore_index=True,
)

# to_df('5S8SJdl1BDc0ugpkEvFsIL')  - 10k 
# to_df('7beGd4yYY1qpsBv6K3clFZ') - 4.5k


## 3. Clustering

In [12]:
# data frame of scaled features
def get_scaled_features(df):
    """Return the columns of ``df`` other than 'id', standardised.

    Args:
        df: DataFrame with an 'id' column; every other column is passed to
            ``StandardScaler``.

    Returns:
        A new DataFrame with the same feature column names holding the
        scaled values (default integer index).
    """
    feature_columns = df.drop(columns=['id'])
    standardised = StandardScaler().fit(feature_columns).transform(feature_columns)
    return pd.DataFrame(standardised, columns=feature_columns.columns)



In [13]:
# draw plot for elbow method
def draw_elbow(scaled_features):
    """Plot K-Means inertia against k for k = 2..20 (elbow method).

    Args:
        scaled_features: Feature matrix to cluster.
    """
    candidate_ks = range(2, 21)
    # One fitted model per k; only its inertia is kept.
    inertias = [
        KMeans(n_clusters=k, random_state=1234).fit(scaled_features).inertia_
        for k in candidate_ks
    ]

    plt.figure(figsize=(16, 8))
    plt.plot(candidate_ks, inertias, 'bx-')
    plt.xlabel('k')
    plt.ylabel('inertia')
    # One tick per candidate k.
    plt.xticks(np.arange(candidate_ks.start, candidate_ks.stop, 1.0))
    plt.title('Elbow Method showing the optimal k')
    

# Standardise df's feature columns (everything except 'id') for the elbow plot.
scaled_features = get_scaled_features(df)
# Plot call is disabled; uncomment to draw it. The trailing note is
# presumably the elbow range the author read off the plot.
#draw_elbow(scaled_features)  # k = 4-6

In [14]:
# draw plot for silhouette method
def draw_silhouette(scaled_features):
    """Plot the silhouette score against k for k = 2..20.

    Args:
        scaled_features: Feature matrix to cluster.
    """
    candidate_ks = range(2, 21)
    scores = []

    for k in candidate_ks:
        model = KMeans(n_clusters=k, random_state=1234).fit(scaled_features)
        # Score the labels the fitted model assigns to the same data.
        labels = model.predict(scaled_features)
        scores.append(silhouette_score(scaled_features, labels))

    plt.figure(figsize=(16,8))
    plt.plot(candidate_ks, scores, 'bx-')
    plt.xlabel('k')
    plt.ylabel('silhouette score')
    # One tick per candidate k.
    plt.xticks(np.arange(candidate_ks.start, candidate_ks.stop, 1.0))
    plt.title('Silhouette Method showing the optimal k')


# Standardise df's feature columns (everything except 'id') for the silhouette plot.
scaled_features = get_scaled_features(df)
# Plot call is disabled; uncomment to draw it. The trailing note is
# presumably the k the author picked from the plot.
#draw_silhouette(scaled_features)    # k = 6

In [16]:
def clustering(k, scaled_features):
    """Fit K-Means with ``k`` clusters and label the same data.

    Args:
        k: Number of clusters.
        scaled_features: Feature matrix to fit on and to label.

    Returns:
        The cluster label predicted for each row of ``scaled_features``.
    """
    # Fixed seed so repeated runs produce the same clusters.
    model = KMeans(n_clusters=k, random_state=1234).fit(scaled_features)
    return model.predict(scaled_features)

In [17]:
# Cluster the tracks with k = 6 and store each track's label on df.
# A 'cluster' column left over from an earlier run of this cell is dropped
# first; otherwise re-running the cell would feed the previous labels back
# into the model as a feature.
scaled_features = get_scaled_features(
    df.drop(columns=['cluster'], errors='ignore'))
clusters = clustering(6, scaled_features)

df['cluster'] = clusters