# In [3]:
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.svm import LinearSVR
from sklearn.model_selection import TimeSeriesSplit
import yaml
import joblib

# Load the run configuration from params.yaml. The context manager closes the
# file handle even if YAML parsing raises, which the manual open()/close()
# pair did not guarantee.
with open("params.yaml", "r") as f:
    params = yaml.load(f, Loader=yaml.SafeLoader)

def read_data(params):
    """Load the four joblib dumps whose paths are listed in ``params``.

    Args:
        params: Mapping that provides the keys ``DUMP_TRAIN``,
            ``Y_PATH_TRAIN``, ``DUMP_VALID`` and ``Y_PATH_VALID``; each value
            is handed to ``joblib.load``.

    Returns:
        Tuple ``(x_train, y_train, x_valid, y_valid)`` of the loaded objects,
        in that order.
    """
    # The key order defines both the load order and the order of the result.
    path_keys = ('DUMP_TRAIN', 'Y_PATH_TRAIN', 'DUMP_VALID', 'Y_PATH_VALID')
    return tuple(joblib.load(params[key]) for key in path_keys)
# Load the train/validation splits once at module level; the training pair is
# what gets passed to model_search further down.
x_train, y_train, x_valid, y_valid = read_data(params)

def model_search(x_train, y_train):
    """Cross-validate the candidate regressors and persist the best one's name.

    Every candidate is scored with ``cross_val_score`` over a 5-split
    ``TimeSeriesSplit`` using the metric named by the module-level
    ``params["scoring"]``. The name of the candidate with the highest mean
    score is printed and dumped to ``output/best_name.pkl``.

    Args:
        x_train: Training features forwarded to ``cross_val_score``.
        y_train: Training targets forwarded to ``cross_val_score``.
    """
    ts_cv = TimeSeriesSplit(
        n_splits=5,
        max_train_size=None,
    )

    # The names are persisted as-is below, so they are kept exactly as they
    # were (including the 'RandomForrest' spelling).
    models = [
        ('RandomForrest', RandomForestRegressor()),
        ('lasso', Lasso()),
        ('ridge', Ridge()),
        ('SVR', LinearSVR()),
    ]
    scoring = params["scoring"]

    rows = []
    for name, model in models:
        cv_results = model_selection.cross_val_score(
            model, x_train, y_train, cv=ts_cv, scoring=scoring
        )
        # Average over the folds actually returned instead of dividing by a
        # literal 5 that would have to be kept in sync with n_splits.
        rows.append((name, np.mean(cv_results)))

    df = pd.DataFrame(rows, columns=['model', 'score']).set_index('model')
    # Pick the winner by column label; positional [0] on the label-indexed
    # result of DataFrame.idxmax() is deprecated in recent pandas.
    # NOTE(review): idxmax assumes a greater-is-better metric, which holds for
    # scikit-learn's built-in scoring strings (errors are negated) -- confirm
    # for whatever params["scoring"] is set to.
    best_model = df['score'].idxmax()
    print(best_model)
    joblib.dump(best_model, 'output/best_name.pkl')
# Run the model comparison on the training split; this prints the winning
# model's name and writes it to output/best_name.pkl.
model_search(x_train, y_train)

# Output: ridge
