In [2]:
import spotipy
import spotipy.util as util
import time
import timeit
import pandas as pd 
import numpy as np
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures
import json
import statsmodels.api as sm
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Preprocessing

In [9]:
# load data csv

# Read the per-playlist table from the CSV in the working directory and
# report its (rows, columns) shape as a quick sanity check.
csv_path = 'df_by_playlist.csv'
df = pd.read_csv(csv_path)
print('dataframe has shape', df.shape)

dataframe has shape (1602, 63)


In [10]:
# normalize the (un-normalized) continuous data

# Divide each listed column by its own maximum value.
# Series.max() is used instead of the builtin max(): it is vectorized and
# skips NaN, whereas builtin max() walks the values one by one and returns an
# order-dependent result (possibly NaN) when the column contains NaN.
colnames = ['duration_ms','popularity','tempo','num_tracks']
for name in colnames:
    df[name] = df[name] / df[name].max()

In [11]:
# add polynomial features for continuous data

# Columns that are expanded into polynomial / interaction terms.
colnames = ['acousticness','danceability','duration_ms','energy','instrumentalness','liveness','loudness','popularity','speechiness','tempo','valence','num_tracks']
polydf = df[colnames]

deg = 2

poly = PolynomialFeatures(degree=deg)
polydftrans = poly.fit_transform(polydf)

# Build one name per output column from the exponent rows in poly.powers_:
# each input column with a non-zero exponent contributes '<column>^<exponent>',
# and the factors are joined with 'x' (e.g. 'energy^1xtempo^1', 'tempo^2').
# The row with all-zero exponents (the bias column) therefore gets the name ''.
target_feature_names = [
    'x'.join(
        '{}^{}'.format(column, power)
        for column, power in zip(polydf.columns, powers)
        if power != 0
    )
    for powers in poly.powers_
]

# Carry df's index over to the transformed frame: DataFrame.join aligns on the
# index, so a fresh default index would only line up when df itself has a
# default RangeIndex.
polydf = pd.DataFrame(polydftrans, columns=target_feature_names, index=df.index)

# Replace the raw continuous columns with their polynomial expansion.
df = df.join(polydf)
df = df.drop(colnames,axis=1)
print('dataframe with continuous variable polynomial features has size', df.shape)

dataframe with continuous variable polynomial features has size (1602, 142)


In [12]:
# 50/50 train/test split

# Seed the legacy NumPy RNG, draw one uniform number per row, and send rows
# whose draw is below 0.5 to the training set; the remaining rows form the
# test set.
np.random.seed(9)
msk = np.random.rand(df.shape[0]) < 0.5
dftr, dfte = df[msk], df[~msk]

print('training set has shape', dftr.shape)
print('test set has shape', dfte.shape)

training set has shape (792, 142)
test set has shape (810, 142)


# Fit model

In [13]:
# do base line fit using only popularity

# Predictors: the single 'popularity^1' column, passed through sm.add_constant.
X_train = sm.add_constant(dftr['popularity^1'])
X_test = sm.add_constant(dfte['popularity^1'])

# Response: natural log of the follower counts.
y_train = np.log(dftr['followers'])
y_test = np.log(dfte['followers'])

# The timed section covers the cross-validated ridge fit and both prediction passes.
start = timeit.default_timer()
model = RidgeCV(alphas=(1e-5,1e-4,1e-3,1e-2,1e-1,1,1e1,1e2,1e3,1e4,1e5))
model.fit(X_train, y_train)
predtrain, predtest = model.predict(X_train), model.predict(X_test)
stop = timeit.default_timer()

# R^2 on each split, computed from the predictions made above.
R2train = r2_score(y_train, predtrain)
R2test = r2_score(y_test, predtest)

print('Ridge alpha value is',model.alpha_)
print('R2 on training set is',R2train)
print('R2 on test set is',R2test)
print('took',stop - start,'seconds')

Ridge alpha value is 0.1
R2 on training set is 0.190246414835
R2 on test set is 0.186230526451
took 0.003281777258962393 seconds


In [16]:
# do fit using all predictors

# Predictors: every column except the playlist name and the follower count,
# passed through sm.add_constant.
X_train = sm.add_constant(dftr.drop(columns=['playlist_name','followers']))
X_test = sm.add_constant(dfte.drop(columns=['playlist_name','followers']))

# Response: natural log of the follower counts.
y_train = np.log(dftr['followers'])
y_test = np.log(dfte['followers'])

# The timed section covers the cross-validated ridge fit and both prediction passes.
start = timeit.default_timer()
model = RidgeCV(alphas=(1e-5,1e-4,1e-3,1e-2,1e-1,1,1e1,1e2,1e3,1e4,1e5))
model.fit(X_train, y_train)
predtrain, predtest = model.predict(X_train), model.predict(X_test)
stop = timeit.default_timer()

# R^2 on each split, computed from the predictions made above.
R2train = r2_score(y_train, predtrain)
R2test = r2_score(y_test, predtest)

print('Ridge alpha value is',model.alpha_)
print('R2 on training set is',R2train)
print('R2 on test set is',R2test)
print('took',stop - start,'seconds')

Ridge alpha value is 0.1
R2 on training set is 0.577605754863
R2 on test set is 0.464835865076
took 0.013502806425094604 seconds
