In [1]:
import pandas as pd
import os
import numpy as np
from math import sqrt

from sklearn.utils import shuffle
from sklearn.linear_model import Lasso, ElasticNet, Ridge
from sklearn.svm import SVR, LinearSVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, make_scorer
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate, KFold, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import make_pipeline
#from nltk.stem.snowball import SnowballStemmer
from tqdm import tqdm_notebook
import IPython.display as ipd



In [2]:

DATASET_DIR = 'D:\Study\PMEmo2019\PMEmo2019'

features = pd.read_csv(os.path.join(DATASET_DIR, 'features', 'dynamic_features.csv'))
annotations = pd.read_csv(os.path.join(DATASET_DIR, 'annotations', 'dynamic_annotations.csv'))
dataset = pd.merge(features, annotations, on=['musicId', 'frameTime'])



In [3]:
def rmse(y, y_pred):
    return sqrt(mean_squared_error(y, y_pred))

regressors = {
    'Lasso': Lasso(),
    'ElasticNet': ElasticNet(),
    'Ridge': Ridge(),
    'kNN': KNeighborsRegressor(),
    'SVRrbf': SVR(kernel='rbf', gamma='scale'),
    'SVRpoly': SVR(kernel='poly', gamma='scale'),
    'SVRlinear': SVR(kernel='linear', gamma='scale'),
    'DT': DecisionTreeRegressor(max_depth=5),
    'RF': RandomForestRegressor(max_depth=5, n_estimators=10, max_features=1),
     'MLP': MLPRegressor(hidden_layer_sizes=(200,50), max_iter=2000),
     'AdaBoost': AdaBoostRegressor(n_estimators=10),
}

In [22]:
def cross_val_regression(regressors, features, labels, preprocessfunc):
    columns = list(regressors.keys())
    scores = pd.DataFrame(columns=columns, index=['RMSE'])

    for reg_name, reg in tqdm_notebook(regressors.items(), desc='regressors'):
        scorer = {'rmse': make_scorer(rmse)}
        reg = make_pipeline(*preprocessfunc, reg)
        reg_score = cross_validate(reg, features, labels, scoring=scorer, cv=10, return_train_score=False) 
        scores.loc['RMSE', reg_name] = reg_score['test_rmse'].mean()
#         scores.loc['R', reg_name] = reg_score['test_r'].mean()
    return scores

def format_scores(scores):
    def highlight(s):
        is_min = s == min(s)
#         is_max = s == max(s)
#         is_max_or_min = (is_min | is_max)
        return ['background-color: yellow' if v else '' for v in is_min]
    scores = scores.style.apply(highlight, axis=1, subset=pd.IndexSlice[:, :scores.columns[-2]])
    return scores.format('{:.3f}')


In [23]:
def regression_results(regressors, trainset, testset, featureNames, labelName, filePrefix, preprocessfunc):
    X_train = trainset[featureNames]
    y_train = trainset[labelName]
    X_test = testset[featureNames]
    y_test = testset[labelName]

    columns = ['musicId', 'y_test'] + list(regressors.keys())
    results = pd.DataFrame(columns=columns)
    results['musicId'] = testset['musicId']
    results['y_test'] = y_test.values
    
    for reg_name, reg in tqdm_notebook(regressors.items(), desc='regressors'):
        reg = make_pipeline(*preprocessfunc, reg)
        reg.fit(X_train, y_train)
        y_pred = reg.predict(X_test)
        results[reg_name] = y_pred
        results.to_csv(os.path.join('temp_results',f'{filePrefix}_regression_results_{labelName}.csv'))

In [24]:
def compute_rmse_across_songs(resultsFile):
    results = pd.read_csv(resultsFile,index_col=0).dropna(axis=1, how='any')
    columns = results.columns[2:]
    scores = pd.DataFrame(columns=columns, index=['rmse_across_segments', 'rmse_across_songs'])
    rmse_across_songs = {}
    testsongs_num = len(results['musicId'].unique())

    for reg_name in columns:
        scores.loc['rmse_across_segments', reg_name] = rmse(results['y_test'], results[reg_name])
        rmse_across_songs[reg_name] = 0

    for i, g in results.groupby('musicId'):
        for reg_name in columns:
            rmse_across_songs[reg_name] += rmse(g['y_test'], g[reg_name])

    for reg_name in columns:
        scores.loc['rmse_across_songs', reg_name] = rmse_across_songs[reg_name]/testsongs_num
    
    mean_rmse = scores.mean(axis=1)
    std_rmse = scores.std(axis=1)
    
    scores['Mean'] = mean_rmse
    scores['std'] = std_rmse
    ipd.display(format_scores(scores))

In [25]:
songs = dataset['musicId'].unique()
songs = shuffle(songs, random_state=3)
test_num = round(len(songs)*0.1)
testsongs = songs[:test_num]
print(list(testsongs))

[63, 490, 34, 743, 104, 177, 79, 894, 14, 668, 683, 151, 504, 516, 355, 98, 97, 579, 892, 837, 152, 169, 388, 391, 561, 850, 985, 958, 572, 514, 625, 791, 517, 507, 501, 1000, 803, 457, 403, 670, 51, 798, 59, 531, 466, 503, 794, 568, 279, 103, 350, 917, 428, 417, 393, 571, 354, 283, 906, 149, 56, 128, 742, 993, 94, 754, 199, 57, 576, 463, 284, 126, 488, 253, 227, 730, 861]


In [27]:
ftestset = dataset['musicId'].apply(lambda x: x in testsongs)
testset = dataset[iftestset]
trainset = dataset[~iftestset]
prefunc = [StandardScaler()]
featureNames = dataset.columns[2:262]

print('In Arousal dimension...')
regression_results(regressors, trainset, testset, featureNames, 'Arousal(mean)', 'audio', prefunc)

print('In Valence dimension...')
regression_results(regressors, trainset, testset, featureNames, 'Valence(mean)', 'audio', prefunc)

In Arousal dimension...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  if sys.path[0] == '':


HBox(children=(FloatProgress(value=0.0, description='regressors', max=11.0, style=ProgressStyle(description_wi…


In Valence dimension...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  if sys.path[0] == '':


HBox(children=(FloatProgress(value=0.0, description='regressors', max=11.0, style=ProgressStyle(description_wi…




In [28]:
rint('In Arousal dimension...')
compute_rmse_across_songs(os.path.join('temp_results','audio_regression_results_Arousal(mean).csv'))
print('In Valence dimension...')
compute_rmse_across_songs(os.path.join('temp_results','audio_regression_results_Valence(mean).csv'))

NameError: name 'rint' is not defined

In [29]:
print('In Arousal dimension...')
compute_rmse_across_songs(os.path.join('temp_results','audio_regression_results_Arousal(mean).csv'))
print('In Valence dimension...')
compute_rmse_across_songs(os.path.join('temp_results','audio_regression_results_Valence(mean).csv'))

In Arousal dimension...


Unnamed: 0,Lasso,ElasticNet,Ridge,kNN,SVRrbf,SVRpoly,SVRlinear,DT,RF,MLP,AdaBoost,Mean,std
rmse_across_segments,0.187,0.187,0.131,0.149,0.128,0.159,0.131,0.134,0.154,0.143,0.133,0.148,0.021
rmse_across_songs,0.158,0.158,0.108,0.128,0.107,0.136,0.108,0.119,0.13,0.126,0.117,0.127,0.018


In Valence dimension...


Unnamed: 0,Lasso,ElasticNet,Ridge,kNN,SVRrbf,SVRpoly,SVRlinear,DT,RF,MLP,AdaBoost,Mean,std
rmse_across_segments,0.157,0.157,0.139,0.147,0.131,0.158,0.138,0.141,0.14,0.148,0.14,0.145,0.009
rmse_across_songs,0.128,0.128,0.115,0.123,0.109,0.134,0.113,0.118,0.114,0.129,0.116,0.121,0.008
