In [8]:
import simplejson as json
import pandas as pd
import numpy as np

In [84]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score

In [9]:
def load_files(year):

    with open("best_seller_albums_v3_year_" + str(year)) as json_file:  
        best_seller_albums_data = json.load(json_file)
        best_seller_albums_albums = [Album(d) for d in best_seller_albums_data]

    with open("top_songs_albums_v3_year_" + str(year)) as json_file:  
        top_songs_albums_data = json.load(json_file)
        top_songs_albums_albums = [Album(d) for d in top_songs_albums_data]

    return best_seller_albums_albums, top_songs_albums_albums

In [70]:
train_list = []
for year in range(14,19):    
    album_list, song_list = load_files('20' + str(year))
    train_list += (album_list + song_list)

In [77]:
X_ls, y_ls = list(zip(*[a.train_X_y() for a in train_list]))
X = np.vstack(X_ls)
y = np.hstack(y_ls)



In [78]:
y.shape, np.count_nonzero(y)

((8776,), 751)

In [79]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [82]:
(y_train.shape, np.count_nonzero(y_train)), (y_test.shape, np.count_nonzero(y_test))

(((5879,), 518), ((2897,), 233))

In [85]:
model = XGBClassifier()
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

In [87]:
param_dist = {'objective':'binary:logistic', 'n_estimators':2}

clf = XGBClassifier(**param_dist)

clf.fit(X_train, y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        eval_metric='logloss',
        verbose=True)

evals_result = clf.evals_result()
evals_result

[0]	validation_0-logloss:0.626001	validation_1-logloss:0.625036
[1]	validation_0-logloss:0.571185	validation_1-logloss:0.569424


{'validation_0': {'logloss': [0.626001, 0.571185]},
 'validation_1': {'logloss': [0.625036, 0.569424]}}

In [88]:
y_est1 = clf.predict(X_test)

  if diff:


In [89]:
from sklearn.metrics import precision_score
import numpy as np

def precision_eval(y_pred, dtrain):
    y_true = dtrain.get_label()
    err = 1-precision_score(y_true, np.round(y_pred))
    return 'precision_err', err

clf_wine = XGBClassifier(objective='binary:logistic', n_jobs=8)


clf_wine.fit(X_train, y_train,eval_metric=precision_eval,verbose=True)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=8, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

In [90]:
y_est2 = clf_wine.predict(X_test)

  if diff:


In [100]:
clf_wine.predict_proba(X_test)

array([[0.962747  , 0.03725303],
       [0.9489279 , 0.05107209],
       [0.9197687 , 0.08023134],
       ...,
       [0.9490032 , 0.05099678],
       [0.78019834, 0.21980163],
       [0.8955257 , 0.10447431]], dtype=float32)

In [91]:
precision_score(y_test, y_est2) 

0.631578947368421

In [92]:
precision_score(y_test, y_est1)  

0.2

In [98]:
np.count_nonzero(y_est2), len(y_est2)

(19, 2897)

In [99]:
type(y_est2)

numpy.ndarray

In [64]:
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler

class Album(object):
    def __init__(self, album_json):
        self.id, self.name, self.genres, self.popularity, self.total_tracks, self.artists_list = [album_json[k] for k in ['id', 'name', 'genres', 'popularity', 'total_tracks', 'artists_list']]
        self.tracks_df = pd.read_json(album_json['tracks_info'], orient='split')

    def unit_transf(self):
        self.tracks_df['tempo'] = self.tracks_df['tempo'] / 60
        self.tracks_df['duration_ms'] = self.tracks_df['duration_ms'] / 1000 / 60
        self.tracks_df['loudness'] = self.tracks_df['loudness'] / 10
        self.tracks_df['ordering'] = MinMaxScaler().fit_transform(self.tracks_df['track_number'].values.reshape(-1,1)) - 0.5
        self.tracks_df['total_tracks'] = self.total_tracks
        
    def classification_label(self):
        self.tracks_df['label'] = self.tracks_df['popularity'] >= self.popularity
        return self.tracks_df['label'].values
        
    def train_X_y(self):
        self.unit_transf()        
        X = self.tracks_df[['mode', 'tempo', 'duration_ms', 'ordering', 'acousticness', 'danceability', 'energy', 'liveness', 'speechiness','valence']].values
        y = self.classification_label()
        return X, y

In [32]:
X_train, y_train = a.train_X_y()



(16, 10)

In [101]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

In [163]:
import dill
dill.dump(grid_search, open('model.pkd', 'wb'))

In [106]:
model = XGBClassifier(objective='binary:logistic', n_jobs=8)
n_estimators = range(50, 400, 50)
param_grid = dict(n_estimators=n_estimators)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_grid, scoring="neg_log_loss",  cv=kfold)
grid_search.fit(X_train, y_train,eval_metric=precision_eval,verbose=True)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=7, shuffle=True),
       error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=8, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': range(50, 400, 50)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_log_loss', verbose=0)

In [108]:
y_est3 = grid_search.predict(X_test)

  if diff:


In [111]:
np.count_nonzero(y_est3), len(y_est3)

(68, 2897)

In [113]:
precision_score(y_test, y_est3) 

0.7941176470588235

In [154]:
import heapq
def hit_song_predictor(album):
    X, y = album.train_X_y()
    y_est = grid_search.predict(X)
    indices = np.arange(album.total_tracks)[y_est]
    if len(indices):
        return album.tracks_df[[ 'track_number',  'name']].values[indices], True
    else:
        y_est_prob_mtx = grid_search.predict_proba(X)
        y_est_prob_mtx[:, 0] = np.arange(album.total_tracks)
        indices = np.array(heapq.nlargest(3, y_est_prob_mtx, key=lambda x: x[1]))[:,0].astype(int)
        return album.tracks_df[[ 'track_number',  'name']].values[indices], False

In [161]:
a = list_2014[10]
print(hit_song_predictor(a))
a.tracks_df

(array([[5, 'West Coast'],
       [11, 'The Other Woman'],
       [1, 'Cruel World']], dtype=object), False)


  if diff:


Unnamed: 0,id,track_number,popularity,name,duration_ms,tempo,time_signature,key,valence,mode,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,ordering,total_tracks,label
0,6ayKgleoMF26lvU1Z7wVm7,1,58,Cruel World,5.132252999999999e-19,1.571039e-07,4,5,0.177,0,0.274,0.33,0.507,5e-06,0.122,-6.4e-05,0.0339,-0.5,14,False
1,1y3r6RXiJZNBV1EI0NggpS,2,65,Ultraviolence,3.2340529999999995e-19,1.019316e-07,3,0,0.0899,1,0.278,0.144,0.541,1e-05,0.101,-7.1e-05,0.038,-0.423077,14,False
2,4VSg5K1hnbmIg4PwRdY6wV,3,64,Shades Of Cool,4.399343999999999e-19,1.773637e-07,3,2,0.0878,0,0.554,0.262,0.482,0.0161,0.244,-6.9e-05,0.0302,-0.346154,14,False
3,1NZs6n6hl8UuMaX0UC0YTz,4,66,Brooklyn Baby,4.522119e-19,1.385635e-07,4,1,0.0949,1,0.542,0.404,0.664,0.00402,0.11,-6.7e-05,0.0407,-0.269231,14,False
4,5Y6nVaayzitvsD5F7nr3DV,5,68,West Coast,3.3019549999999997e-19,1.583938e-07,4,6,0.461,0,0.194,0.527,0.591,0.0194,0.0907,-7.7e-05,0.0396,-0.192308,14,False
5,11MyiSGZSYSmhhqwGUTtAq,6,61,Sad Girl,4.0864199999999995e-19,1.53651e-07,4,5,0.258,1,0.539,0.345,0.587,0.0731,0.119,-8.3e-05,0.0364,-0.115385,14,False
6,6PnluwP0fjGnpIBsqTdUTq,7,61,Pretty When You Cry,3.01115e-19,1.407356e-07,4,10,0.197,0,0.399,0.509,0.547,0.0,0.215,-6.5e-05,0.0417,-0.038462,14,False
7,7Ms58r8G6Y0r1XLtpwoGxN,8,62,Money Power Glory,3.480967e-19,1.758771e-07,5,8,0.288,0,0.583,0.349,0.525,0.0,0.35,-6.8e-05,0.0451,0.038462,14,False
8,70Laus6ozJIHDpfTtUSmAZ,9,59,Fucked My Way Up To The Top,2.729257e-19,1.66974e-07,4,5,0.182,1,0.601,0.52,0.69,0.201,0.137,-5.9e-05,0.0554,0.115385,14,False
9,6IfPyMb0Sxptpx6jBUATOS,10,63,Old Money,3.4912549999999996e-19,1.390368e-07,1,10,0.154,0,0.888,0.312,0.168,7e-06,0.147,-0.000125,0.0332,0.192308,14,False
