In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.api import OLS
import sklearn.metrics as metrics
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.metrics import r2_score
import warnings
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# Load in data
df = pd.read_csv('total_info.csv', encoding = 'latin-1')

# Clean Variables
del df['Unnamed: 0']
df['artist_popularity'] = pd.to_numeric(df['artist_popularity'], errors = 'coerce')
df['artist_followers'] = pd.to_numeric(df['artist_followers'], errors = 'coerce')

# Create pop indicator variable array
pop_ind = []
rap_ind = []
rock_ind = []
for idx, row in enumerate(df['artist_genres'].values):
    if 'pop' in row:
        pop_ind.append(1)
    else:
        pop_ind.append(0)
        
    if 'rap' in row or 'hip' in row:
        rap_ind.append(1)
    else:
        rap_ind.append(0)
        
    if 'rock' in row:
        rock_ind.append(1)
    else:
        rock_ind.append(0)
        
# Append array to dataframe
df['pop'] = pop_ind
df['rap'] = rap_ind
df['rock'] = rock_ind

# Artist Popularity 75% is a 75, use for indicator 
df['artist_popularity'].describe()
df['popular_artist_ind'] = (df['artist_popularity'] >= 75)*1

In [4]:
# Sum Up the 100 tracks of info (or less) for each playlist
unique_plists = df['playlist_id'].unique()
columns = ['playlist','name','followers','num_songs','av_song_pop','pct_explicit','avg_dur','av_artist_followers', 'pop_pct', 'rap_pct', 'rock_pct', 'popular_artist_pct']
eda_prelim = pd.DataFrame(index=range(0,len(unique_plists)), columns=columns)

for idx,plist in enumerate(unique_plists):
    eda_prelim.loc[idx]['playlist'] = plist
    eda_prelim.loc[idx]['name'] = df.loc[df['playlist_id'] == plist]['playlist_name'].iloc[0]
    eda_prelim.loc[idx]['followers'] = np.mean(df.loc[df['playlist_id'] == plist]['followers'])
    eda_prelim.loc[idx]['num_songs'] = len(df.loc[df['playlist_id'] == plist])
    eda_prelim.loc[idx]['av_song_pop'] = np.mean(df.loc[df['playlist_id'] == plist]['popularity'])
    eda_prelim.loc[idx]['pct_explicit'] = np.mean(df.loc[df['playlist_id'] == plist]['explicit'])
    eda_prelim.loc[idx]['avg_dur'] = np.mean(df.loc[df['playlist_id'] == plist]['duration_ms'])
    eda_prelim.loc[idx]['av_artist_followers'] = np.mean(df.loc[df['playlist_id'] == plist]['artist_followers'])
    eda_prelim.loc[idx]['pop_pct'] = np.mean(df.loc[df['playlist_id'] == plist]['pop'])
    eda_prelim.loc[idx]['rap_pct'] = np.mean(df.loc[df['playlist_id'] == plist]['rap'])
    eda_prelim.loc[idx]['rock_pct'] = np.mean(df.loc[df['playlist_id'] == plist]['rock'])
    eda_prelim.loc[idx]['avg_num_artists'] = np.mean(df.loc[df['playlist_id'] == plist]['number_of_artists'])
    eda_prelim.loc[idx]['popular_artist_pct'] = np.mean(df.loc[df['playlist_id'] == plist]['popular_artist_ind'])

In [7]:
# log followers
eda_prelim['log_followers'] = np.log(eda_prelim['followers'].astype(float))
eda_prelim['log_artist_follow'] = np.log(eda_prelim['av_artist_followers'].astype(float))
eda_prelim = eda_prelim.dropna(0)
eda_prelim= eda_prelim.replace([np.inf, -np.inf], 0)

#baseline lasso regression 

#splitting into train and test
y_data = eda_prelim['log_followers']
x_data = eda_prelim.drop(['followers', 'log_followers', 'name', 'playlist'], axis = 1)

ind_train, ind_test = train_test_split(range(eda_prelim.shape[0]), train_size=0.75)

#x's
x_train = pd.concat([x_data.iloc[ind_train, :]])
x_test = pd.concat([x_data.iloc[ind_test, :]])

#y's
y_train = y_data.iloc[ind_train]
y_test = y_data.iloc[ind_test]

# Lasso regression and CV
ridge_reg = RidgeCV()
ridge_reg.fit(x_train, y_train)
yhat_train_ridge = ridge_reg.predict(x_train)
yhat_test_ridge = ridge_reg.predict(x_test)
print("Ideal Ridge Alpha: ", ridge_reg.alpha_)
print("Ridge Betas: ", ridge_reg.coef_)
print("Ridge Train R^2: " ,r2_score(y_train, yhat_train_ridge))
print("Ridge Test R^2: " ,r2_score(y_test, yhat_test_ridge))

Ideal Ridge Alpha:  10.0
Ridge Betas:  [  1.83449050e-02   5.69704745e-02   3.18969571e-01   7.68210157e-07
  -6.51925802e-08  -8.46515562e-01  -1.85000675e-01  -7.93473883e-01
   2.60954917e-01   1.48101126e-02]
Ridge Train R^2:  0.101702950264
Ridge Test R^2:  0.0483386774452
