## Tabular Model Testing:
---

- Linear Regression
- Random Forest
- XGBoost

- Data Features:
    - Tempo
    - Beat Length
    - Beat Difference
    - 30 seconds of wav data (30 columns)
- Target = Valence

---

In [1]:
# Imports
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
from catboost import CatBoostRegressor
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pickle

from audiologic.utils import cv_test, score_model


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load Data
#data = pd.read_csv('data/full_data_df.csv')
data = pd.read_csv('data/final_feature_df.csv')
music = data.copy()[(data['tempo'] != 0)].dropna().drop_duplicates().reset_index(drop=True)

# Split Data
info_cols = ['id', 'file', 'source']
predictor_cols = ['tempo', 'beat_length', 'beat_diff',
                    'centroid', 'd_centroid', 'rolloff', 'd_rolloff', 'rolloff_mid',
                    'd_rolloff_mid', 'contrast_0', 'd_contrast_0', 'contrast_1',
                    'd_contrast_1', 'contrast_2', 'd_contrast_2', 'contrast_3',
                    'd_contrast_3', 'contrast_4', 'd_contrast_4', 'contrast_5',
                    'd_contrast_5', 'contrast_6', 'd_contrast_6', 'mfcc_0', 'd_mfcc_0',
                    'mfcc_1', 'd_mfcc_1', 'mfcc_2', 'd_mfcc_2', 'mfcc_3', 'd_mfcc_3',
                    'mfcc_4', 'd_mfcc_4', 'mfcc_5', 'd_mfcc_5', 'mfcc_6', 'd_mfcc_6',
                    'mfcc_7', 'd_mfcc_7', 'mfcc_8', 'd_mfcc_8', 'mfcc_9', 'd_mfcc_9', 'rms', 'd_rms']
target = ['valence']
scaler = MinMaxScaler()#
#scaler = StandardScaler()

X = scaler.fit_transform(music[predictor_cols])
y = music[target]

xtrain, xvv, ytrain, yvv = train_test_split(X, y, test_size=0.3, random_state=12)
xval, xtest, yval, ytest = train_test_split(xvv, yvv, test_size=0.5, random_state=12)
xx = pd.concat([pd.DataFrame(xtrain), pd.DataFrame(xval)])
yy = pd.concat([ytrain, yval])


In [6]:
# Initial Model Tests
def quick_test(model, name="Model", params=None, xys=(xtrain, ytrain, xval, yval), cross_val=False):
    if params is not None:
        mod = model(**params)
    else:
        mod = model()
    print(f"-- {name} --")
    #print(f"RMSE = {rmse}")
    #print(f"MAE = {mae}")
    #print(f"R2 = {r2_score(xys[3], preds)}")
    #print('----')
    if cross_val:
        cv_test(mod, xys[0], np.ravel(xys[1]))
    else:
        mod.fit(xys[0], np.ravel(xys[1]))
        preds = mod.predict(xys[2])
        score_model(xys[3], preds)
    return mod
    

# Linear Regression
#for m, n in zip(
#        [LinearRegression, RandomForestRegressor],#, xgb.XGBRegressor, CatBoostRegressor],
#        ['LinReg', 'RandomForest']#, 'XGB', 'CatBoost']
#    ):
#    if n == "CatBoost":
#        quick_test(m, n, params={'verbose':0}, cross_val=True)
#    #elif n == 'RandomForest':
#    #    quick_test(m, n, params={'n_estimators': 300, 'max_depth': 12,'random_state': 31}, cross_val=True)
#    else:
#        quick_test(m, n, cross_val=True)
#

In [19]:
# Feature Selection
for m, n in zip([LinearRegression, RandomForestRegressor, xgb.XGBRegressor, CatBoostRegressor], ['LinReg', 'RandomForest', 'XGB', 'CatBoost']):
    print(f"-- {n} --")
    for thresh in [0.01, 0.025]:
        xtcols = predictor_cols
        if n == 'CatBoost':
            selector = SelectFromModel(estimator=m(verbose=0), threshold=thresh).fit(xtrain, np.ravel(ytrain))
        else:
            selector = SelectFromModel(estimator=m(), threshold=thresh).fit(xtrain, np.ravel(ytrain))
        xtrain_trans = selector.transform(xtrain)
        xval_trans = selector.transform(xval)
        mask = selector.get_support()
        cols = np.array(predictor_cols)[mask]
        newmod = m().fit(xtrain_trans, np.ravel(ytrain))
        predictions = newmod.predict(xval_trans)
        print(f"--Thresh={thresh} --> Droppped columns = {list(set(predictor_cols) - set(cols))}")
        mod_scores = score_model(yval, predictions, ['mae'])
    print('')

-- LinReg --
--Thresh=0.01 --> Droppped columns = ['d_contrast_3', 'mfcc_6']
Mean Absolute Error = 1.24623882549769
--Thresh=0.025 --> Droppped columns = ['d_contrast_3', 'contrast_2', 'mfcc_5', 'd_contrast_6', 'd_mfcc_8', 'd_mfcc_1', 'rolloff', 'd_contrast_0', 'mfcc_6', 'contrast_1', 'd_mfcc_9', 'd_mfcc_5']
Mean Absolute Error = 1.2462333958654659

-- RandomForest --
--Thresh=0.01 --> Droppped columns = ['tempo']
Mean Absolute Error = 1.215892694961561
--Thresh=0.025 --> Droppped columns = ['d_contrast_6', 'd_mfcc_1', 'beat_length', 'contrast_2', 'contrast_6', 'd_mfcc_7', 'd_contrast_1', 'd_rms', 'd_centroid', 'contrast_4', 'd_mfcc_2', 'd_rolloff', 'd_mfcc_9', 'mfcc_2', 'tempo', 'rolloff', 'd_contrast_0', 'mfcc_6', 'd_rolloff_mid', 'contrast_1', 'd_mfcc_6', 'd_contrast_5', 'mfcc_8', 'd_contrast_3', 'mfcc_5', 'd_contrast_4', 'mfcc_1', 'd_mfcc_4', 'd_contrast_2', 'd_mfcc_8', 'd_mfcc_3', 'contrast_3', 'mfcc_3', 'beat_diff', 'd_mfcc_5']
Mean Absolute Error = 1.2427939895706404

-- XGB --


In [4]:
# RF Tuning
rf_params = {
    'n_estimators': [300],
    'max_depth': [12], 
    'random_state': [31],
    #'bootstrap': [True],
    #'max_samples': [0.9]
} 

rf_clf = GridSearchCV(RandomForestRegressor(), rf_params, 
                      n_jobs=-1, 
                      cv=5, 
                      refit=False, 
                      verbose=3, 
                      scoring='neg_root_mean_squared_error')
rf_clf.fit(xx, np.ravel(yy))

print(f"RMSE = {rf_clf.best_score_} --> {rf_clf.best_params_}")

# score history - initial Data
# untuned
# -1.575059
# BEST = RMSE = 1.5688025457719175 --> {'max_depth': 20, 'n_estimators': 400, 'random_state': 12}

# score history - more features
# untuned
# 1.57 / 1.26

# RMSE = 1.57 --> {'max_depth': 8, 'n_estimators': 150, 'random_state': 31}
# RMSE = 1.5447598732825507 --> {'max_depth': 12, 'n_estimators': 300, 'random_state': 31}
# RMSE = 1.5683283374462396 --> {'max_depth': 10, 'n_estimators': 250, 'random_state': 31}

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 3/5] END bootstrap=True, max_depth=12, max_samples=0.9, n_estimators=300, random_state=31;, score=-1.553 total time= 2.6min
[CV 1/5] END bootstrap=True, max_depth=12, max_samples=0.9, n_estimators=300, random_state=31;, score=-1.563 total time= 2.6min
[CV 4/5] END bootstrap=True, max_depth=12, max_samples=0.9, n_estimators=300, random_state=31;, score=-1.584 total time= 2.6min
[CV 2/5] END bootstrap=True, max_depth=12, max_samples=0.9, n_estimators=300, random_state=31;, score=-1.589 total time= 2.6min
[CV 5/5] END bootstrap=True, max_depth=12, max_samples=0.9, n_estimators=300, random_state=31;, score=-1.551 total time= 1.3min
RMSE = -1.568233803660233 --> {'bootstrap': True, 'max_depth': 12, 'max_samples': 0.9, 'n_estimators': 300, 'random_state': 31}


In [11]:
# XGB Tuning
xgb_params = {
    'n_estimators': [200],
    'max_depth': [14], 
    'learning_rate': [0.025],
    'subsample': [0.9],
    'colsample_bytree': [0.8],
    'random_state': [31]
} 

xgb_clf = GridSearchCV(xgb.XGBRegressor(), xgb_params,
                       n_jobs=-1, 
                       cv=5, 
                       refit=False, 
                       verbose=3, 
                       scoring='neg_root_mean_squared_error')

#xgb_clf.fit(xx, np.ravel(yy))
#print(f"RMSE = {xgb_clf.best_score_} --> {xgb_clf.best_params_}")

# BEST = RMSE = 1.576640708543763 --> {'colsample_bytree': 0.8, 'learning_rate': 0.025, 'max_depth': 14, 'n_estimators': 200, 'random_state': 31, 'subsample': 0.9}
# -1.5798379974304275 --> {'colsample_bytree': 0.8, 'learning_rate': 0.025, 'max_depth': 16, 'n_estimators': 300, 'random_state': 31, 'subsample': 0.9}
# -1.5803891048337926 --> {'colsample_bytree': 0.8, 'learning_rate': 0.025, 'max_depth': 13, 'n_estimators': 250, 'random_state': 31, 'subsample': 0.9}


Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 3/5] END colsample_bytree=0.8, learning_rate=0.025, max_depth=14, n_estimators=200, random_state=31, subsample=0.9;, score=-1.599 total time=  51.7s
[CV 4/5] END colsample_bytree=0.8, learning_rate=0.025, max_depth=14, n_estimators=200, random_state=31, subsample=0.9;, score=-1.532 total time=  51.9s
[CV 1/5] END colsample_bytree=0.8, learning_rate=0.025, max_depth=14, n_estimators=200, random_state=31, subsample=0.9;, score=-1.576 total time=  51.9s
[CV 2/5] END colsample_bytree=0.8, learning_rate=0.025, max_depth=14, n_estimators=200, random_state=31, subsample=0.9;, score=-1.582 total time=  51.9s
[CV 5/5] END colsample_bytree=0.8, learning_rate=0.025, max_depth=14, n_estimators=200, random_state=31, subsample=0.9;, score=-1.528 total time=  23.5s
RMSE = -1.5634333106583571 --> {'colsample_bytree': 0.8, 'learning_rate': 0.025, 'max_depth': 14, 'n_estimators': 200, 'random_state': 31, 'subsample': 0.9}


In [7]:
# Model Saving
rf_final = quick_test(RandomForestRegressor, 'Random Forest', 
                      {'max_depth': 12, 'n_estimators': 300, 'random_state': 31}, 
                      xys=(xx, yy, xtest, ytest))
#xgb_final = quick_test(xgb.XGBRegressor, 'XGB', 
#                       {'colsample_bytree': 0.8, 'learning_rate': 0.025, 'max_depth': 14, 'n_estimators': 200, 'random_state': 31, 'subsample': 0.9}, 
#                       xys=(xx, yy, xtest, ytest))


-- Random Forest --
Mean Absolute Error = 1.2956118063416027
Root Mean Squared Error = 1.5693321403871818
R-Squared = 0.10064779279466174


In [8]:
#pickle.dump(rf_final, open('audiologic/models/rf_audio_model.pkl', 'wb'))
#pickle.dump(xgb_final, open('audiologic/models/xgb_audio_model.pkl', 'wb'))