In [17]:
# import libraries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pickle
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score

# evaluate regression models 
def main(dataset=None):
    """Compare four regression pipelines on a hold-out split.

    Each pipeline standardises the features and then fits one estimator
    (OLS, Ridge, random forest, gradient boosting). The R2 score of every
    pipeline on the 20% test split is printed in descending order, and the
    (name, score) pairs that reach the highest score are pickled to
    ``best_models.pickle`` in the current working directory.

    Args:
        dataset: object exposing ``data``, ``target`` and ``feature_names``
            attributes. When omitted, the module-level ``load_dataset``
            variable is used, which is how this function was always driven.

    Returns:
        None.
    """
    # set dataframe
    if dataset is None:
        dataset = load_dataset
    X = pd.DataFrame(dataset.data, columns=dataset.feature_names)
    y = pd.DataFrame(dataset.target, columns=['y'])

    # cross-validation(holdout)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=0)

    # DataFrame.as_matrix() was removed in pandas 1.0; .values is available
    # on both old and new pandas and yields the same ndarray.
    y_fit = y_train.values.ravel()
    y_true = y_test.values.ravel()

    # Estimators in their original evaluation order; the labels are the
    # dictionary keys / printed names and are kept exactly as they were.
    estimators = [
        ('OLS', LinearRegression()),
        ('Ridge', Ridge(random_state=1)),
        ('RandomForest', RandomForestRegressor(random_state=1)),
        ('GradinetBoosting', GradientBoostingRegressor(random_state=1)),
    ]

    # make models and get R2 score
    pipe_scores = {}
    for name, estimator in estimators:
        pipe = Pipeline([('scl', StandardScaler()), ('est', estimator)])
        pipe.fit(X_train, y_fit)
        pipe_scores[name] = r2_score(y_true, pipe.predict(X_test))

    # print scores sorted descend
    for name, score in sorted(pipe_scores.items(), key=lambda kv: kv[1],
                              reverse=True):
        print('%s: %.3f' % (name, score))

    # output best model: compute the top score once instead of per item
    top_score = max(pipe_scores.values())
    best_scores = [kv for kv in pipe_scores.items() if kv[1] == top_score]

    # dump pkl
    # NOTE: the payload is the list of (name, score) tuples, not the fitted
    # pipelines themselves.
    with open('best_models.pickle', mode='wb') as f:
        pickle.dump(best_scores, f)
    
if __name__ == '__main__':

    # set parameter
    # import Sample Data to learn models
    try:
        load_dataset = datasets.load_boston()
    except (ImportError, AttributeError):
        # load_boston was removed in scikit-learn 1.2. Fall back to a
        # regression dataset bundled with scikit-learn (no download needed)
        # that exposes the same data / target / feature_names attributes.
        # Scores will differ from those obtained on the Boston data.
        print('load_boston is unavailable; using load_diabetes instead')
        load_dataset = datasets.load_diabetes()

    # main proc
    main()


GradinetBoosting: 0.779
RandomForest: 0.707
OLS: 0.589
Ridge: 0.588
