In [45]:
import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
import random

# Seed both NumPy's global RNG and Python's built-in `random` module with the
# same fixed value so that stochastic steps repeat on a fresh top-to-bottom run.
np.random.seed(420)
random.seed(420)

In [2]:
def loadData(name="data/spotifyCatalog.csv"):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    name : str or file-like, optional
        Location of the CSV, forwarded unchanged to ``pd.read_csv``.
        Defaults to ``"data/spotifyCatalog.csv"``.

    Returns
    -------
    pandas.DataFrame
        The parsed table, using ``pd.read_csv``'s default settings.
    """
    catalog = pd.read_csv(name)
    return catalog

# Load the catalog from loadData's default CSV path.
data = loadData()

In [3]:
# Preview the first rows of the loaded catalog.
data.head()

Unnamed: 0.1,Unnamed: 0,album_id,album_name,artist_ids,artist_names,duration_ms,id,name,us_available,year,...,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,genreArrStr
0,354801,50hRYWK49z2kBf6cMxHmZe,ジムノペディ サティ作品集 II,"['459INk8vcC0ebEef82WjIK', '6ltV1dxtdNmFBvpZz2...","['Erik Satie', '白石光隆']",68268,6VKX9Dheh0qLmjfK2Npq4z,ばら十字教団の最初の思想,True,2012,...,874,0,107,-30.795,1000,57,139.804,4,188,['classical']
1,553689,51WBTsiitqhIE9DkHNvZqR,Necessary Roughness,['5CxwOBCoGSvWxfDgNoa1Nv'],['The Lady Of Rage'],237026,3ptH0tXq7HEaIWxyn7vkOy,Get With Da Wickedness,True,2001,...,0,11,322,-5.39,0,357,88.183,4,802,"['pop', 'rap', 'hip', 'funk']"
2,290293,57CBQ2KFwHoY3vJGHgofCo,Retreat,['4qDGDPGMIJuIvPfUGe0Ngg'],['Cutty Ranks'],230866,0SPxr6sGUeRlTHndjOryKT,Me Fit,True,1991,...,0,6,39,-17.489,0,98,172.787,4,967,['dance']
3,321077,6HT1eWnFxuB1apcSuOzpd2,Puccini: Boheme (La),"['0OzxPXyowUEQ532c9AmHUR', '7oPmR7kujiCfv7EjD1...","['Giacomo Puccini', 'Stanislav Beňačka', 'Carm...",115066,5q9ClErLj0ZHXiAkUU0Rsh,"La boheme: Act II: Signorina Mimi (Marcello, M...",True,1990,...,0,9,667,-19.135,1000,49,125.942,3,202,"['classical', 'folk']"
4,560873,2RmEcwZVXG5Z1rHED7MYhs,Kompilation,['51qSeH9HimuYMMQ7qbWGrk'],['Jürgen Paape'],357921,1T3UB1f8rkB72u6GeTRKrp,Fruity Loops #2,True,2011,...,775,10,81,-12.442,1000,48,123.996,4,354,"['electr', 'house']"


# Predicting Track Release Year

### Selecting Features and Splitting the Data

In [5]:
# List the column names available in the catalog.
data.columns

Index(['Unnamed: 0', 'album_id', 'album_name', 'artist_ids', 'artist_names',
       'duration_ms', 'id', 'name', 'us_available', 'year', 'acousticness',
       'danceability', 'energy', 'instrumentalness', 'key', 'liveness',
       'loudness', 'mode', 'speechiness', 'tempo', 'time_signature', 'valence',
       'genreArrStr'],
      dtype='object')

In [18]:
# Columns pulled out of the catalog for modelling. The order matters: the
# final entry, 'year', is the regression target and is split off from the
# other twelve (the predictors) by position further down.
features = [
    'duration_ms',
    'acousticness',
    'danceability',
    'energy',
    'instrumentalness',
    'key',
    'liveness',
    'loudness',
    'mode',
    'speechiness',
    'tempo',
    'valence',
    'year',
]

In [37]:
# Explicit seed for the splits: without `random_state`, train_test_split draws
# from NumPy's global RNG, so re-running this cell on its own (or after any
# other cell that consumed random numbers) would yield a different partition.
SPLIT_SEED = 420

# `features` holds the predictor columns followed by the target ('year') as the
# last column, so predictors and target travel together through both splits.
x = data[features].values

# First hold out 10% of the rows as the test set ...
train_val, test = train_test_split(x, train_size=0.9, random_state=SPLIT_SEED)
# ... then divide the remaining rows 70/30 into training and validation sets.
train, val = train_test_split(train_val, train_size=0.7, random_state=SPLIT_SEED)

# Separate predictors (all but the last column) from the target (last column).
x_train, y_train = train[:, :-1], train[:, -1]
x_val, y_val = val[:, :-1], val[:, -1]
x_test, y_test = test[:, :-1], test[:, -1]

In [38]:
# (rows, columns) of the training, validation and test predictor matrices.
x_train.shape, x_val.shape, x_test.shape

((216475, 12), (92776, 12), (34362, 12))

### Scaling Data (dividing each feature by its training-set maximum)

In [137]:
# Per-column scaling factors. They are captured from the first array passed to
# standardize() and then reused for every later call; reset to None to refit.
coeffs = None

def standardize(data):
    """Scale each column of ``data`` by a cached per-column maximum.

    On the first call after ``coeffs`` is None, the column-wise maxima of
    ``data`` are computed and stored in the module-level ``coeffs``. Every
    later call divides by those same stored maxima rather than by the maxima
    of its own argument, so the array passed first determines the scale
    applied to all subsequent arrays.

    Columns whose stored maximum is exactly 0 are left unscaled (divided by 1)
    instead of producing NaN/inf from a division by zero. Note that dividing
    by the maximum does not bound columns that contain negative values.

    Parameters
    ----------
    data : numpy.ndarray
        Array whose columns (axis 1) are features.

    Returns
    -------
    numpy.ndarray
        A new array of the same shape; ``data`` itself is not modified.
    """
    global coeffs
    if coeffs is None:
        coeffs = np.max(data, axis=0)
    # Swap zero divisors for 1 so an all-zero column stays all-zero.
    divisors = np.where(coeffs == 0, 1, coeffs)
    return data / divisors

def featurize(data):
    """Feature-engineering hook; currently hands ``data`` back unchanged.

    An earlier experiment, now disabled, appended the elementwise square of
    every column as additional columns (``np.multiply`` + ``np.hstack``).
    The function is kept as a pass-through so callers already route their
    arrays through one place where such terms could be added.
    """
    return data

# The training split must be processed first: standardize() stores its scaling
# factors on the first call and reuses them for the validation and test splits.
x_train_std, x_val_std, x_test_std = [
    featurize(standardize(split)) for split in (x_train, x_val, x_test)
]

In [138]:
# Check that scaling/featurizing left the training matrix's shape as expected.
x_train_std.shape

(216475, 12)

### Fitting a Linear Regression Model (Ridge with alpha = 0) To Predict Song Year

In [139]:
# A ridge penalty of alpha=0 applies no regularisation, i.e. the fit is plain
# ordinary least squares. scikit-learn advises against Ridge(alpha=0) for
# numerical reasons and recommends LinearRegression for this case, so the
# equivalent unregularised model is fitted directly.
year_model = linear_model.LinearRegression()
year_model.fit(x_train_std, y_train)

# Mean absolute error on the training split (same units as the target).
mean_absolute_error(y_train, year_model.predict(x_train_std))

12.217835815856255

In [140]:
# Mean absolute error on the held-out validation split.
val_predictions = year_model.predict(x_val_std)
mean_absolute_error(y_val, val_predictions)

12.138356977749075

In [143]:
# Predict for a single row whose twelve scaled features are all zero; for a
# linear model every coefficient term vanishes, leaving only the intercept.
all_zero_row = [[0] * 12]
year_model.predict(all_zero_row)

array([2008.1122257])

In [144]:
# Fitted intercept of the model, i.e. its prediction when every feature is 0.
year_model.intercept_

2008.1122257010538