In [6]:
import requests
import dill
from bs4 import BeautifulSoup
from datetime import datetime
import simplejson as json
import pandas as pd
import numpy as np
from urllib.parse import quote
from urllib.request import urlopen
import spotipy
import time
import re

In [7]:
from spotipy.oauth2 import SpotifyClientCredentials
client_credentials_manager = spotipy.oauth2.SpotifyClientCredentials('d6967ce2057448d4aab3ad9898119c97',  'ad7f82cc26a64f1595b6b3c4cd917243')
spotify = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [2]:
import dill
model = dill.load(open('model.pkd', 'rb'))

In [21]:
import heapq
def hit_song_predictor(album, model):
    X, y = album.train_X_y()
    y_est = model.predict(X)
    indices = np.arange(album.total_tracks)[y_est]
    if len(indices):
        return album.tracks_df[[ 'track_number',  'name']].values[indices], True
    else:
        y_est_prob_mtx = model.predict_proba(X)
        y_est_prob_mtx[:, 0] = np.arange(album.total_tracks)
        indices = np.array(heapq.nlargest(3, y_est_prob_mtx, key=lambda x: x[1]))[:,0].astype(int)
        return album.tracks_df[[ 'track_number',  'name']].values[indices], False

In [3]:
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler

class Album(object):
    def __init__(self, album_json):
        self.id, self.name, self.genres, self.popularity, self.total_tracks, self.artists_list = [album_json[k] for k in ['id', 'name', 'genres', 'popularity', 'total_tracks', 'artists_list']]
        self.tracks_df = pd.read_json(album_json['tracks_info'], orient='split')

    def unit_transf(self):
        self.tracks_df['tempo'] = self.tracks_df['tempo'] / 60
        self.tracks_df['duration_ms'] = self.tracks_df['duration_ms'] / 1000 / 60
        self.tracks_df['loudness'] = self.tracks_df['loudness'] / 10
        self.tracks_df['ordering'] = MinMaxScaler().fit_transform(self.tracks_df['track_number'].values.reshape(-1,1)) - 0.5
        self.tracks_df['total_tracks'] = self.total_tracks
        
    def classification_label(self):
        self.tracks_df['label'] = self.tracks_df['popularity'] >= self.popularity
        return self.tracks_df['label'].values
        
    def train_X_y(self):
        self.unit_transf()        
        X = self.tracks_df[['mode', 'tempo', 'duration_ms', 'ordering', 'acousticness', 'danceability', 'energy', 'liveness', 'speechiness','valence']].values
        y = self.classification_label()
        return X, y

In [13]:
def strip_Feat(artists):
    pos = artists.find('Featuring')
    if pos == -1:
        return artists
    else:
        return artists[:pos].strip()
    
def get_track_artists(year, domain = 'https://www.billboard.com/charts/year-end/'):
    path = domain + str(year) + '/hot-100-songs'
    response = requests.get(path)
    soup = BeautifulSoup(response.text, "lxml")

    parents = soup.findAll(class_=r'ye-chart-item__text')

    track_artists = []
    for parent in parents:
        track = parent.find('div',attrs={'class':'ye-chart-item__title'}).get_text().strip()
        artists = parent.find('div',attrs={'class':'ye-chart-item__artist'}).get_text().strip()
        track_artists.append((track, artists))
    return track_artists


def get_album_artists(year, domain='https://www.billboard.com/archive/charts/'):
    response = requests.get(domain + str(year) + "/top-album-sales")
    soup = BeautifulSoup(response.text, "lxml")
    table = soup.find('table',attrs={'class':'archive-table'}).findAll('tr')[1:]

    album_artists=[]
    for tr in table:
        td = tr.findAll('td')
        try:
            album_artists.append((td[-2].get_text(),td[-1].get_text()))
        except:
            pass
    return album_artists

def get_track_artists_arxiv(year, domain='https://www.billboard.com/archive/charts/'):
    path = domain + str(year) + '/hot-100'
    response = requests.get(path)
    soup = BeautifulSoup(response.text, "lxml")

    table = soup.find('table',attrs={'class':'archive-table'}).findAll('tr')[1:]

    track_artists=[]
    for tr in table:
        td = tr.findAll('td')
        try:
            track_artists.append((td[-2].get_text(),td[-1].get_text()))
        except:
            pass
    return track_artists

def get_album_id_from_track(track, artists):
    try:
        track = track.replace("'",'')
        artists = ", ".join(re.split(r' & | x | X | With | with ', artists))
        q = 'track:' + track + ' artist:' + artists
        result = spotify.search(q, limit=1)
        the_first_album = result['tracks']['items'][0]['album']
        album_id = the_first_album.get('id')
        return album_id
    except:
#         try:
#             artists = ' '.join(artists.split(' x '))
#             artists = ' '.join(artists.split(' X '))
#             q = track + ' ' + artists
#             result = spotify.search(q, limit=1)
#             the_first_album = result['tracks']['items'][0]['album']
#             album_id = the_first_album.get('id')
#             return album_id
#         except:
        return track, artists

def get_album_id_from_album(album, artists):

    try:
        album = album.replace("'",'')        
        artists = ", ".join(re.split(r' & | x | X | With | with ', artists))
        q = 'album:' + album + ' artist:' + artists
        result = spotify.search(q, type='album', limit=1)
        the_first_album = result['albums']['items'][0]
        album_id = the_first_album.get('id')
        return album_id
    
    except:
#         try:
#             q =  album
#             result = spotify.search(q, type='album', limit=1)
#             the_first_album = result['albums']['items'][0]
#             album_id = the_first_album.get('id')
#             return album_id
#         except:
        return album, artists

def get_albums_id_list(year, get_album_id_func, scraping_func):
    albums_id = []    


    trackOrAlbum_artists = scraping_func(year)
    for trackOrAlbum, artists in trackOrAlbum_artists:
        albums_id.append(get_album_id_func(trackOrAlbum,strip_Feat(artists)))
        
    return albums_id

def album_json(album_id):

    d = spotify.album(album_id)
    subkeys = ['id', 'name', 'genres',  'popularity', 'total_tracks']

    subkeys = ['id', 'name', 'genres',  'popularity', 'total_tracks']
    
    album_info = {k: d[k] for k in subkeys if k in d}
    tracks = d['tracks']['items']
    tracks_id = [t['id'] for t in tracks]
    tracks_df_json = pd.DataFrame(audio_features(tracks_id),columns=['id',  'track_number', 'popularity','name', 'duration_ms', 'tempo','time_signature', 'key',
       'valence', 'mode', 'acousticness', 'danceability', 'energy', 
       'instrumentalness',  'liveness', 'loudness', 'speechiness']).to_json(orient='split')
    album_info.update({'artists_list': [a['name'] for a in d['artists']], 'tracks_info': tracks_df_json })
    
    return album_info

def track_json(track_id):
    d = spotify.track(track_id)
    subkeys = ['id', 'popularity', 'name', 'track_number']
    subdict = {k: d[k] for k in subkeys if k in d}
    return subdict


def audio_feature_decorator(spotify_audio_features_func):

    def wrapper_func(track_id_list):
        subkeys = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature']
        
        
        features_list = spotify_audio_features_func(track_id_list)
        
        my_features_list = []
        for i, features in enumerate(features_list):
            sub_features = {k: features[k] for k in subkeys if k in features}
            other_info = track_json(track_id_list[i])
            other_info.update(sub_features)
            
            my_features_list.append(other_info)
            
        return my_features_list
    
    return wrapper_func

@audio_feature_decorator
def audio_features(track_id_list):
    return spotify.audio_features(track_id_list)


In [None]:
album_id = get_album_id_from_album('Ricky Sings Again', 'Ricky Nelson')

In [15]:
abj = album_json(album_id)
a = Album(abj)

In [17]:
X, y = a.train_X_y()



In [22]:
hit_song_predictor(a, model)

  if diff:


(array([[3, 'Believe What You Say - Remastered'],
        [1, "It's Late"],
        [12, 'Restless Kid - Remastered']], dtype=object), False)