In [29]:
import simplejson as json
import pandas as pd
import numpy as np

In [30]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score

In [5]:
def load_files(year):
    """Load the best-seller and top-song album dumps for one year.

    Opens ``best_seller_albums_v3_year_<year>`` and
    ``top_songs_albums_v3_year_<year>`` (paths relative to the working
    directory), parses each as JSON and wraps every parsed item in ``Album``.

    Args:
        year: Suffix of the dump files; it is converted with ``str``.

    Returns:
        A tuple ``(best_seller_albums, top_songs_albums)`` of ``Album`` lists.
    """
    def read_albums(prefix):
        # One Album is built per item of the parsed JSON document.
        with open(prefix + str(year)) as json_file:
            records = json.load(json_file)
            return [Album(record) for record in records]

    best_sellers = read_albums("best_seller_albums_v3_year_")
    top_songs = read_albums("top_songs_albums_v3_year_")
    return best_sellers, top_songs

In [31]:
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler

class Album(object):
    """An album's metadata plus a DataFrame of its tracks, with model-input helpers.

    Attributes:
        id, name, genres, popularity, total_tracks, artists_list: copied
            verbatim from the same-named keys of ``album_json``.
        tracks_df: per-track DataFrame parsed from ``album_json['tracks_info']``.
    """

    # Columns of ``tracks_df`` used as model features, in training order.
    FEATURE_COLUMNS = ['mode', 'tempo', 'duration_ms', 'ordering', 'acousticness',
                       'danceability', 'energy', 'liveness', 'speechiness', 'valence']

    def __init__(self, album_json):
        """Build the album from one parsed JSON record.

        Args:
            album_json: Mapping with the keys ``id``, ``name``, ``genres``,
                ``popularity``, ``total_tracks``, ``artists_list`` and
                ``tracks_info`` (track table serialised with ``orient='split'``).

        Raises:
            KeyError: If any of the required keys is missing.
        """
        (self.id, self.name, self.genres, self.popularity,
         self.total_tracks, self.artists_list) = [
            album_json[k]
            for k in ['id', 'name', 'genres', 'popularity', 'total_tracks', 'artists_list']
        ]
        tracks_info = album_json['tracks_info']
        # Recent pandas deprecates passing literal JSON text to read_json, so
        # wrap text that looks like JSON in a file-like object. Anything else
        # (path, URL, buffer) is forwarded untouched.
        if isinstance(tracks_info, str) and tracks_info.lstrip()[:1] in ('{', '['):
            from io import StringIO
            tracks_info = StringIO(tracks_info)
        self.tracks_df = pd.read_json(tracks_info, orient='split')
        # Set once unit_transf has rescaled tracks_df in place.
        self._units_transformed = False

    def unit_transf(self):
        """Rescale raw track columns in place and add derived columns.

        * ``tempo`` is divided by 60 (presumably BPM -> beats per second).
        * ``duration_ms`` is divided by 1000 and by 60 (milliseconds -> minutes).
        * ``loudness`` is divided by 10.
        * ``ordering`` is the min-max scaled ``track_number`` shifted by -0.5.
        * ``total_tracks`` repeats the album-level track count on every row.

        The rescaling runs only once per instance. Without the guard every
        extra call (for example re-running a notebook cell) would divide the
        columns again and silently change the features.
        """
        if getattr(self, '_units_transformed', False):
            return
        tracks = self.tracks_df
        tracks['tempo'] = tracks['tempo'] / 60
        tracks['duration_ms'] = tracks['duration_ms'] / 1000 / 60
        tracks['loudness'] = tracks['loudness'] / 10
        # The scaler returns an (n, 1) array; flatten it before assigning.
        scaled_position = MinMaxScaler().fit_transform(
            tracks['track_number'].values.reshape(-1, 1))
        tracks['ordering'] = scaled_position.ravel() - 0.5
        tracks['total_tracks'] = self.total_tracks
        self._units_transformed = True

    def classification_label(self):
        """Add a boolean ``label`` column and return it as an array.

        A track is labelled ``True`` when its ``popularity`` is greater than
        or equal to the album's ``popularity``.
        """
        self.tracks_df['label'] = self.tracks_df['popularity'] >= self.popularity
        return self.tracks_df['label'].values

    def train_X_y(self):
        """Return ``(X, y)`` for this album.

        ``X`` holds the ``FEATURE_COLUMNS`` of every track (one row per track)
        and ``y`` the matching labels from ``classification_label``. Calling
        this repeatedly returns the same values.
        """
        self.unit_transf()
        X = self.tracks_df[self.FEATURE_COLUMNS].values
        y = self.classification_label()
        return X, y

In [32]:
# Gather the albums from the yearly dumps "2014" through "2018" into one list:
# best sellers first, then top-song albums, for each year in turn.
train_list = []
for year in range(14, 19):
    album_list, song_list = load_files(f'20{year}')
    train_list.extend(album_list)
    train_list.extend(song_list)

In [37]:
len(train_list)

669

In [33]:
# Build one (features, labels) pair per album, then stack all albums into a
# single feature matrix X (rows = tracks) and a flat label vector y.
features_and_labels = [album.train_X_y() for album in train_list]
X_ls, y_ls = zip(*features_and_labels)
X = np.vstack(X_ls)
y = np.hstack(y_ls)

In [34]:
y.shape, np.count_nonzero(y)

((8776,), 751)

In [35]:
# Hold out 33% of the tracks for testing; random_state is fixed so the split is repeatable.
# NOTE(review): the split is not stratified although positives are rare
# (751 of 8776 labels in the recorded output) — consider `stratify=y`.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [36]:
(y_train.shape, np.count_nonzero(y_train)), (y_test.shape, np.count_nonzero(y_test))

(((5879,), 518), ((2897,), 233))

In [10]:
# Baseline: XGBoost classifier with the library's default hyper-parameters.
# NOTE: the name `model` is rebound later by the grid-search cell.
model = XGBClassifier()
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [11]:
# Tiny model (only 2 boosting rounds) used to inspect per-round log loss.
param_dist = {'objective':'binary:logistic', 'n_estimators':2}

clf = XGBClassifier(**param_dist)

# Log loss is reported on both the training split (validation_0) and the
# held-out split (validation_1) after each boosting round.
clf.fit(X_train, y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        eval_metric='logloss',
        verbose=True)

# Dict of metric histories keyed by evaluation-set name.
evals_result = clf.evals_result()
evals_result

[0]	validation_0-logloss:0.626001	validation_1-logloss:0.625036
[1]	validation_0-logloss:0.571185	validation_1-logloss:0.569424


{'validation_0': {'logloss': [0.626001, 0.571185]},
 'validation_1': {'logloss': [0.625036, 0.569424]}}

In [12]:
y_est1 = clf.predict(X_test)

In [13]:
from sklearn.metrics import precision_score
import numpy as np

def precision_eval(y_pred, dtrain):
    """Custom evaluation metric: one minus precision of the rounded predictions.

    Args:
        y_pred: Predicted scores; they are rounded with ``np.round`` to get
            hard labels (assumes probabilities in [0, 1] — TODO confirm).
        dtrain: Object exposing ``get_label()`` that returns the true labels.

    Returns:
        The tuple ``('precision_err', 1 - precision)``.
    """
    hard_labels = np.round(y_pred)
    precision = precision_score(dtrain.get_label(), hard_labels)
    return 'precision_err', 1 - precision

# Default-size classifier trained with 8 threads.
# NOTE(review): the name `clf_wine` looks like a leftover from another example — consider renaming.
clf_wine = XGBClassifier(objective='binary:logistic', n_jobs=8)


# NOTE(review): no `eval_set` is passed, so the custom metric presumably is
# never evaluated during this fit (no per-round log was recorded) — confirm.
clf_wine.fit(X_train, y_train,eval_metric=precision_eval,verbose=True)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=8,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [14]:
y_est2 = clf_wine.predict(X_test)

In [15]:
clf_wine.predict_proba(X_test)

array([[0.962747  , 0.03725303],
       [0.9489279 , 0.05107209],
       [0.9197687 , 0.08023134],
       ...,
       [0.9490032 , 0.05099678],
       [0.78019834, 0.21980163],
       [0.8955257 , 0.10447431]], dtype=float32)

In [16]:
precision_score(y_test, y_est2) 

0.631578947368421

In [17]:
precision_score(y_test, y_est1)  

0.2

In [18]:
np.count_nonzero(y_est2), len(y_est2)

(19, 2897)

In [19]:
type(y_est2)

numpy.ndarray

In [21]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

In [22]:
# Tune the number of boosting rounds with 10-fold stratified cross-validation.
model = XGBClassifier(objective='binary:logistic', n_jobs=8)
# Candidates: 50, 100, ..., 350 trees.
n_estimators = range(50, 400, 50)
param_grid = dict(n_estimators=n_estimators)
# Shuffled folds with a fixed seed so the search is repeatable.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
# Candidates are ranked by negative log loss, not by precision.
grid_search = GridSearchCV(model, param_grid, scoring="neg_log_loss",  cv=kfold)
# Extra keyword arguments are forwarded to the estimator's fit.
# NOTE(review): without an eval_set the custom eval_metric presumably has no effect — confirm.
grid_search.fit(X_train, y_train,eval_metric=precision_eval,verbose=True)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=7, shuffle=True),
             error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=8,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid='warn', n_jobs=None,
             param_grid={'n_estimators': range(50, 400, 50)},
             pre_dispatch='2*n_jobs

In [23]:
y_est3 = grid_search.predict(X_test)

In [24]:
np.count_nonzero(y_est3), len(y_est3)

(68, 2897)

In [25]:
precision_score(y_test, y_est3) 

0.7941176470588235

In [28]:
import dill
# Persist the fitted grid search. The context manager closes the file
# deterministically instead of leaving the handle open until garbage collection.
with open('model.pkd', 'wb') as model_file:
    dill.dump(grid_search, model_file)

In [16]:
import heapq
def hit_song_predictor(album, model=None):
    """Pick the tracks of ``album`` that the classifier predicts to be hits.

    Args:
        album: Object providing ``train_X_y()`` and a ``tracks_df`` with
            ``track_number`` and ``name`` columns (an ``Album``).
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
            Defaults to the notebook-level ``grid_search``.

    Returns:
        ``(rows, True)`` where ``rows`` are the ``[track_number, name]`` rows
        of all tracks predicted positive, in track order; or, when no track is
        predicted positive, ``(rows, False)`` with the (up to) three tracks
        that have the highest positive-class probability, best first.
    """
    if model is None:
        model = grid_search
    X, _ = album.train_X_y()
    track_rows = album.tracks_df[['track_number', 'name']].values

    # Convert predictions to an explicit boolean mask and derive positions
    # from it. This works whether the estimator returns bools or 0/1 ints,
    # and does not rely on album.total_tracks matching the number of rows.
    predicted_hit = np.asarray(model.predict(X)).astype(bool)
    hit_indices = np.flatnonzero(predicted_hit)
    if len(hit_indices):
        return track_rows[hit_indices], True

    # Fallback: rank by positive-class probability. A stable sort keeps the
    # original track order among ties.
    hit_probability = np.asarray(model.predict_proba(X))[:, 1]
    top_indices = np.argsort(-hit_probability, kind='stable')[:3]
    return track_rows[top_indices], False

In [22]:
# Reload the 2014 dumps so the sample album starts from fresh, untransformed track data.
album_list, song_list = load_files(2014)
# Pick one album (position 8 of the combined list) as a demo input for hit_song_predictor.
a = (album_list + song_list)[8]

In [23]:
a.name, a.artists_list

('Platinum', ['Miranda Lambert'])

In [24]:
hit_song_predictor(a)

(array([[12, "Somethin' Bad - (Duet with Carrie Underwood)"]], dtype=object),
 True)

In [33]:
a.tracks_df

Unnamed: 0,id,track_number,popularity,name,duration_ms,tempo,time_signature,key,valence,mode,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,ordering,total_tracks,label
0,7rxxROM7x948jnVwD0ORVN,1,38,Walk The Line,3.64135,1.247633,4,7,0.751,1,0.27,0.515,0.904,0.0,0.28,-0.6103,0.302,-0.5,12,False
1,4m2k8jQNbNsAINiip5qKSN,2,36,Don't Need Y'all,3.550267,1.8992,4,4,0.429,0,0.564,0.798,0.678,1.3e-05,0.145,-0.5341,0.238,-0.409091,12,False
2,598B7nC3ukFWx5eqv1Ft6i,3,33,100,4.161933,2.49935,4,2,0.574,1,0.011,0.775,0.714,2e-06,0.0753,-0.5307,0.0566,-0.318182,12,False
3,4gbfxlzThi6Hi0KnV1wH4t,4,46,Change Your Life,3.67915,2.199633,4,6,0.688,0,0.0208,0.678,0.763,0.0,0.117,-0.559,0.208,-0.227273,12,False
4,3H9DlQEsjB0whMWy9g4MCX,5,43,Fancy,3.3323,1.5829,4,10,0.374,0,0.107,0.911,0.71,0.0,0.049,-0.4137,0.0697,-0.136364,12,False
5,5CRZfDIaFlVMW61WzW2qVu,6,41,New Bitch,3.630783,2.23305,4,8,0.358,1,0.027,0.583,0.742,0.0,0.232,-0.5711,0.172,-0.045455,12,False
6,3yx6eryOZgO54bt3B671cn,7,58,Work,3.71995,2.33355,4,7,0.443,1,0.0604,0.697,0.808,0.0,0.435,-0.4928,0.155,0.045455,12,False
7,0wjP81vqcXgjkCZDdIdwoL,8,36,Impossible Is Nothing,3.1755,1.2911,4,9,0.382,1,0.156,0.665,0.602,0.0,0.111,-0.6656,0.0596,0.136364,12,False
8,7cwxv3nrySXbOoXr9Xl2F3,9,37,Goddess,3.16695,2.28995,5,1,0.61,1,0.0817,0.516,0.748,3e-06,0.129,-0.4519,0.309,0.227273,12,False
9,7pNC5ZIKtwUK0ReSpM3P9f,10,66,Black Widow,3.490383,2.733183,4,3,0.527,0,0.181,0.741,0.726,0.000189,0.111,-0.377,0.143,0.318182,12,True
