## Section 0. Import Packages

In [61]:
# Import Basic Packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import spotipy
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import PolynomialFeatures
import spotipy.util as util
from spotipy.oauth2 import SpotifyClientCredentials
import random
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', 200)

# Import Regression Packages
from sklearn.ensemble import GradientBoostingRegressor,AdaBoostRegressor,RandomForestRegressor,ExtraTreesRegressor,BaggingRegressor
from sklearn.linear_model import LinearRegression,HuberRegressor,ElasticNet,LassoCV,RidgeCV,PassiveAggressiveRegressor,SGDRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.grid_search import GridSearchCV

## Section 1. Data Pre-Processing

### Section 1.1 Reading Dataframe

In [3]:
# Read-in the full data set
data = pd.read_csv('Final_Dataframe.csv')

In [4]:
# Drop name column
data = data.drop("Unnamed: 0", axis=1)

In [5]:
word_lis = []
for word in data.columns:
    word = word.lstrip(' ').strip('\'')
    word_lis.append(word)
    
data.columns = word_lis

In [6]:
# Remove duplicate columns by name
def remove_dup_columns(frame):
    keep_names = set()
    keep_icols = list()
    for icol, name in enumerate(frame.columns):
        if name not in keep_names:
            keep_names.add(name)
            keep_icols.append(icol)
    return frame.iloc[:, keep_icols]

data_sub = remove_dup_columns(data)

In [7]:
data_master = pd.read_csv('../spotify_data_master.csv').drop("Unnamed: 0", axis=1)
data_master.head()

Unnamed: 0,acousticness_mean,acousticness_std,dance_mean,dance_std,energy_mean,energy_std,instrumentalness_mean,instrumentalness_std,key_mean,key_std,liveness_mean,liveness_std,loudness_mean,loudness_std,mode_mean,mode_std,speech_mean,speech_std,tempo_mean,tempo_std,time_mean,time_std,valence_mean,valence_std,Followers,ID,followers_mean,followers_std,popularity_mean,popularity_std,top_0_10,top_10_20,top_20_30,top_30_40,top_40_50,Playlist_Followers,"""children's christmas""","""children's christmas"".1","""children's music""","""children's music"".1",'acid house','acid techno','adult standards','afrobeat','afrobeats','album rock','alternative country','alternative country'.1,'alternative dance','alternative hip hop','alternative hip hop'.1,'alternative metal','alternative pop','alternative rock','alternative roots rock','ambeat','ambient idm','ambient','anime score','anthem emo','anthem emo'.1,'anthem worship','anti-folk','antiviral pop','appalachian folk','art rock','athens indie','atmospheric post-metal','aussietronica','austindie','australian alternative rock','australian dance','australian dance'.1,'australian hip hop','australian indie','australian pop','australian pop'.1,'avant-garde jazz','avant-garde','avantgarde metal','azonto','azonto'.1,'azontobeats','azontobeats'.1,'balearic','ballroom','baroque ensemble','bass music','bass trap','bassline','bay area indie','bay area indie'.1,'bebop','belly dance','big band','big beat','big room','big room'.1,'black death','black metal',...,'swing','swiss rock','symphonic black metal','synthpop','talent show','tango','tango'.1,'tech house','technical brutal death metal','technical brutal death metal'.1,'technical death metal','teen pop','teen pop'.1,'texas blues','texas country','theme','thrash-groove metal','throat singing','tin pan alley','tin pan alley'.1,'tone','tracestep','tracestep'.1,'traditional british folk','traditional country','traditional folk','traditional funk','traditional rockabilly','traditional scottish folk','traditional soul','traditional swing','trap francais','trap latino','trap music','trash rock','triangle indie','tribal house','tribal house'.1,'tribute','trip hop','tropical house','trova','turbo folk','turkish folk','turkish jazz','turkish jazz'.1,'turntablism','twee pop','tzadik','uk drill','uk hip hop','ukulele','unblack metal','underground hip hop','underground latin hip hop','underground pop rap','underground power pop','underground rap','vancouver indie','vapor house','vapor pop','vapor pop'.1,'vapor soul','vapor soul'.1,'vapor twitch','vaporwave','vegas indie','video game music','vintage jazz','vintage reggae','vintage swedish pop','vintage swing','vintage western','vintage western'.1,'violin','viral pop','vocal house','vocal jazz','vocaloid','voidgaze','warm drone','welsh rock','west coast trap','wind ensemble','wonky','world chill','world christmas','world fusion','world meditation','world','wrestling','wrock','ye ye','yoik','zapstep','zeuhl','zim','zolo','zydeco','no_genre'
0,0.641282,0.326942,0.467911,0.241057,0.27594,0.225821,0.11965,0.277109,0.27594,0.225821,0.19944,0.163102,-18.000646,8.98404,0.630769,0.486352,0.383051,0.403365,101.045969,51.857504,3.338462,1.553996,0.319263,0.246235,24.0,01WIu4Rst0xeZnTunWxUL7,134413.666667,365459.0,42.833333,19.575645,0,0,0,0,0,24.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,0.278816,0.262749,0.634392,0.14027,0.596,0.166902,0.192559,0.34146,0.596,0.166902,0.16449,0.128725,-9.525804,3.561653,0.509804,0.504878,0.08221,0.131105,122.768255,28.215783,4.0,0.2,0.656235,0.245299,330.0,05dTMGk8MjnpQg3bKuoXcc,103320.580645,332015.0,48.903226,15.029648,0,0,0,0,0,330.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,0.22881,0.251421,0.6004,0.178801,0.6122,0.192433,0.179571,0.336604,0.6122,0.192433,0.192563,0.129795,-7.845333,3.252999,0.666667,0.479463,0.05215,0.025935,114.439167,21.997673,4.0,0.262613,0.481787,0.251199,73.0,070FVPBKvfu6M5tf4I9rt2,566814.56,1427308.0,60.28,15.512146,0,0,0,1,0,73.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,0.394114,0.362573,0.599424,0.151256,0.541097,0.289705,0.203059,0.332371,0.541097,0.289705,0.211488,0.1909,-9.764303,5.521358,0.606061,0.496198,0.106724,0.112448,110.134788,25.125111,4.0,0.353553,0.511997,0.243171,6173.0,08vPKM3pmoyF6crB2EtASQ,199831.484848,295385.9,58.69697,15.62747,0,0,0,0,0,6173.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,0.194509,0.27847,0.531067,0.150001,0.7594,0.249805,0.115499,0.25802,0.7594,0.249805,0.234787,0.143076,-6.465367,4.054067,0.666667,0.479463,0.12972,0.223108,124.7895,26.993835,3.933333,0.253708,0.443407,0.253724,145.0,08ySLuUm0jMf7lJmFwqRMu,223253.774194,491843.8,49.516129,19.489948,0,0,0,0,0,145.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [8]:
# ID and Password for accessing Spotify API
client_id = "14aed07d00e342a6a989d64d6c7f3d02"
client_secret = "19f0f88834b6425aa9b8bc129279426f"
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

### Section 1.2 Training/Test Data Split

In [9]:
# A train/test split is constructed where 90% of the subsample is 
# the train data set and 10% the test data set.

# Set train and test sizes
train_size = 0.9
test_size = 1-train_size

# Function to return random train and test sets
def data_splitter(df, train, validate=False, seed=9001):
    
    if validate:
        np.random.seed(seed)
        perm = np.random.permutation(df.index)
        m = len(df)
        train_end = int(train * m)
        validate_end = int(validate * m) + train_end
        train = df.ix[perm[:train_end]]
        validate = df.ix[perm[train_end:validate_end]]
        test = df.ix[perm[validate_end:]]
        return train, validate, test
    else:
        np.random.seed(seed)
        perm = np.random.permutation(df.index)
        m = len(df)
        train_end = int(train * m)
        train = df.ix[perm[:train_end]]
        test = df.ix[perm[train_end:]]
        return train, test

In [36]:
# Create train and test dataframes from subsample
train_df, test_df = data_splitter(data_sub, train_size)

# Return shapes of train and test dataframes
print("Train Size: {}".format(train_df.shape))
print("Test Size: {}".format(test_df.shape))

Train Size: (1278, 729)
Test Size: (142, 729)


### Section 1.3 Standardization

In [37]:
# List all numerical columns to be used for classification
numerical_columns = ['acousticness_mean','acousticness_std','dance_mean','dance_std',\
                    'energy_mean','energy_std','instrumentalness_mean','instrumentalness_std',\
                    'key_mean','key_std','liveness_mean','liveness_std','loudness_mean',\
                    'loudness_std','mode_mean','mode_std','speech_mean','speech_std',\
                    'tempo_mean','tempo_std','valence_mean','valence_std','followers_mean',\
                    'followers_std','popularity_mean','popularity_std',\
                    'house_acousticness_mean', 'hip hop_acousticness_std','pop_liveness_std', \
                     'dance_liveness_std', 'r&b_acousticness_std','rap_energy_std', 'rap_key_std',\
                     'acoustic_acousticness_std','acoustic_acousticness_mean', 'acoustic_energy_std',\
                     'acoustic_key_std']

In [38]:
# The numerical columns are standardized next
mean = train_df[numerical_columns].mean()
std = train_df[numerical_columns].std()

train_df[numerical_columns] = (train_df[numerical_columns] - mean)/std
test_df[numerical_columns] = (test_df[numerical_columns] - mean)/std

### Section 1.4 Imputation

In [39]:
# Find Missing Columns
null_vals = train_df.isnull().sum()
missing_vals = null_vals[null_vals > 0].index.tolist()

#### Median-Based Imputation

In [40]:
# Median imputation of missing values
imp = Imputer(missing_values='NaN', strategy='median', axis=1)
train_df = pd.DataFrame(imp.fit_transform(train_df), columns=data_sub.columns)
test_df = pd.DataFrame(imp.transform(test_df), columns=data_sub.columns)

In [41]:
# Split training and test data
train_df = train_df[train_df['Followers'] != 0]
test_df = test_df[test_df['Followers'] != 0]

In [42]:
# Final step: create y_train/x_train and y_test/x_test dataframes

# Initialize the training data
y_train = np.log(train_df['Followers'])
x_train = train_df.drop('Followers', axis=1)

# Initialize the testing data
y_test = np.log(test_df['Followers'])
x_test = test_df.drop('Followers', axis=1)

In [62]:
# GradientBoostingRegressor
model = GradientBoostingRegressor(alpha=0.99, 
                                  loss='huber', 
                                  max_depth=5, 
                                  learning_rate=0.04, 
                                  n_estimators=200, 
                                  max_features='auto')
model.fit(x_train, y_train)

GradientBoostingRegressor(alpha=0.99, criterion='friedman_mse', init=None,
             learning_rate=0.04, loss='huber', max_depth=5,
             max_features='auto', max_leaf_nodes=None,
             min_impurity_split=1e-07, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=200, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

## Generating Successful Playlists

### Methodology

- According to Ben Gorman, if Linear Regression were a Toyota Camry, the Gradient Boosting Regressor would easily be a UH-60 Blackhawk Helicopter
- Gradient Boosting Regressor is an ensemble machine learning procedure that fits new models consecutively to provide a more reliable estimate of the response variable. It constructs new base-learners to be correlated with the negative gradient of the loss function 
 - least square regression (ls), 
 - least absolute deviation (lad), 
 - huber (a combination of ls and lad), 
 - quantile - which allows for quantile regression
- The choice of the loss function allows for great flexibility in Gradient Boosting and the best error function is huber for our model based on trial and error / cross-validation

In [59]:
def optimized_playlist(style, song_count):
    '''Returns playlist songs most-likely to be popular in style'''
    
    play_index = np.argmax(model.predict(x_train[x_train[style] == 1.0]))
    data_index = x_train[x_train[style] == 1.0].index.tolist()[play_index]
    playlist_id = data_master.iloc[data_index]["ID"]
    
    results = sp.user_playlist_tracks('spotify', playlist_id)
    tracks = results['items']
    while results['next']:
        results = sp.next(results)
        tracks.extend(results['items'])
    
    songs_playlist = []
    for item,song in enumerate(tracks):
        song_name = tracks[item]['track']['name']
        songs_playlist.append(song_name)
    
    sample = random.sample(songs_playlist,song_count)
    return sample

### Examples

In [63]:
optimized_playlist("hip hop",5)

['Bad Girls',
 'Independent Women, Pt. 1',
 'Before He Cheats',
 'White Flag',
 "Bitch I'm Madonna"]