In [None]:
# Part 1: Performance Metrics in Regression

### Chanil Park

## regression methods
- linear regression
- k-neighbors regression
- Ridge regression
- decision tree regression
- random forest regression
- gradient Boosting regression
- SGD regression
- support vector regression (SVR)
- linear SVR
- multi-layer perceptron regression.

In [261]:
# magic commands, sets the backend of matplotlib to the 'inline' backend
%matplotlib inline

In [293]:
"""

Performance Metrics in Regression
x = weight and y = height.

"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import random

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.svm import LinearSVR
from sklearn.neural_network import MLPRegressor

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

#from utilities.losses import compute_loss
#from utilities.optimizers import gradient_descent, pso, mini_batch_gradient_descent
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

# General settings
#from utilities.visualization import visualize_train, visualize_test


In [292]:
# Initialize seed
seed = 1000
# Freeze the random seed
random.seed(seed)
np.random.seed(seed)
train_test_split_test_size = 0.3

# Training settings
alpha = 0.9  # step size
max_iters = 100  # max iterations
tol = 0.1 #SGDRegressor, Ridge
verbose = 0


In [280]:
def load_data():
    """
    Load Data from CSV
    :return: df    a panda data frame
    """
    # File Path is dependent on the starting directory path of Jupyter.
    # df = pd.read_csv("../data/diamonds.csv")
    df = pd.read_csv("../Part 1 Performance Metrics in Regression/data/diamonds.csv")
    return df

In [281]:
def standardize(train_data, test_data):
    """
    Standardize Data Set
    """
    train_mean = train_data.mean()
    train_std = train_data.std()
    train_data = (train_data - train_mean) / train_std
    test_data = (test_data - train_mean) / train_std
    return train_data, test_data
    

In [282]:
def firstPreparation(data):
    """
    First preperation, drop the class and prepare data
    """
    data_full = data.copy()
    data = data.drop(["price"], axis = 1)
    labels = data_full["price"]
    return data_full, data, labels

In [283]:
def data_preprocess(data):
    """
    Data preprocess:
        1. Split the entire dataset into train and test
        2. Split outputs and inputs
        3. Standardize train and test
        4. Add intercept dummy for computation convenience
    :param data: the given dataset (format: panda DataFrame)
    :return: train_data       train data contains only inputs
             train_labels     train data contains only labels
             test_data        test data contains only inputs
             test_labels      test data contains only labels
             train_data_full       train data (full) contains both inputs and labels
             test_data_full       test data (full) contains both inputs and labels
    """
    # Split the data into train and test
    train_data, test_data = train_test_split(data, test_size = train_test_split_test_size, random_state=seed)

    # Pre-process data (both train and test)
    train_data_full, train_data, train_labels = firstPreparation(train_data)
    test_data_full, test_data, test_labels = firstPreparation(test_data)
    
    # Handling categorized data
    train_data = pd.get_dummies(train_data, columns=['cut', 'color', 'clarity'])
    # print(train_data.head())
    test_data = pd.get_dummies(test_data, columns=['cut', 'color', 'clarity'])    
    # print(test_data.head())
    
    # Standardize the inputs
    train_data, test_data = standardize(train_data, test_data)

    # Tricks: add dummy intercept to both train and test
    train_data['intercept_dummy'] = pd.Series(1.0, index = train_data.index)
    test_data['intercept_dummy'] = pd.Series(1.0, index = test_data.index)
    # print(train_data.head())
    # print(test_data.head())    
    return train_data, train_labels, test_data, test_labels, train_data_full, test_data_full

In [284]:
def predict(x, thetas):
    return x.dot(thetas)

In [285]:
def applyModelThenResult(model, train_data, train_labels, test_data_full, test_data, test_labels):
    start_time = datetime.datetime.now()  # Track learning starting time
    model.fit(train_data, train_labels)
    prediction = model.predict(test_data)
    #pred_label = predict(test_data, gradient_ws[-1])
    end_time = datetime.datetime.now()  # Track learning ending time
    exection_time = round((end_time - start_time).total_seconds(), 2)  # Track execution time
    mse = round(mean_squared_error(test_labels, prediction), 2)
    rmse = round(np.sqrt(mse), 2)
    r2_error = round(r2_score(test_labels, prediction), 2)
    mae = round(mean_absolute_error(test_labels, prediction), 2)
    print(model)
    print("Exection time: ", exection_time)
    #print("Coefficients: ", model.coef_)
    #print("Intercept: ", model.intercept_)
    print("MSE: ", mse)
    print("RMSE: ", rmse)
    print("R2: ", r2_error)  # R2 should be maximize
    print("MAE: ", mae)
    print("\n")
    #plt.scatter(x=test_data_full["carat"], y=test_data_full["price"], color='blue')
    #plt.plot(test_data_full["carat"], pred_label, color='red', linewidth=2)

In [294]:
if __name__ == '__main__':
    # load data 
    data = load_data()

    # Preprocess the data
    train_data, train_labels, test_data, test_labels, train_data_full, test_data_full = data_preprocess(data)

    # Build baseline model
    model = LinearRegression()
    applyModelThenResult(model, train_data, train_labels, test_data_full, test_data, test_labels)
    model = KNeighborsRegressor()
    applyModelThenResult(model, train_data, train_labels, test_data_full, test_data, test_labels)
    model = Ridge(max_iter=max_iters, alpha=alpha, tol=tol)
    applyModelThenResult(model, train_data, train_labels, test_data_full, test_data, test_labels)
    model = DecisionTreeRegressor()
    applyModelThenResult(model, train_data, train_labels, test_data_full, test_data, test_labels)
    model = RandomForestRegressor()
    applyModelThenResult(model, train_data, train_labels, test_data_full, test_data, test_labels)
    model = GradientBoostingRegressor(alpha=alpha)
    applyModelThenResult(model, train_data, train_labels, test_data_full, test_data, test_labels)
    model = SGDRegressor(max_iter=100, alpha=alpha, tol=0.0001)
    applyModelThenResult(model, train_data, train_labels, test_data_full, test_data, test_labels)
    model = SVR(max_iter=max_iters, tol=tol)
    applyModelThenResult(model, train_data, train_labels, test_data_full, test_data, test_labels)
    model = LinearSVR(max_iter=max_iters, tol=tol)
    applyModelThenResult(model, train_data, train_labels, test_data_full, test_data, test_labels)
    model = MLPRegressor(max_iter=max_iters, alpha=alpha, tol=tol)
    applyModelThenResult(model, train_data, train_labels, test_data_full, test_data, test_labels)
    

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
Exection time:  0.05
MSE:  1302827.0
RMSE:  1141.41
R2:  0.92
MAE:  746.96


KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform')
Exection time:  9.55
MSE:  751250.25
RMSE:  866.75
R2:  0.95
MAE:  399.62


Ridge(alpha=0.9, copy_X=True, fit_intercept=True, max_iter=100,
   normalize=False, random_state=None, solver='auto', tol=0.1)
Exection time:  0.02
MSE:  1302617.9
RMSE:  1141.32
R2:  0.92
MAE:  747.44


DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')
Exection time:  0.37
MSE:  800.72
RMSE:  28.3
R2:  1.0
MAE:  2.67


RandomForestRegressor(bo



SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=100, shrinking=True, tol=0.1, verbose=False)
Exection time:  0.68
MSE:  45123282.17
RMSE:  6717.39
R2:  -1.81
MAE:  6235.42


LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=100,
     random_state=None, tol=0.1, verbose=0)
Exection time:  0.05
MSE:  2260896.52
RMSE:  1503.63
R2:  0.86
MAE:  793.17


MLPRegressor(activation='relu', alpha=0.9, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=100, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.1, validation_fraction=0.1,
       verbose=False, warm_start=False)
Exection time:  19.61
MSE:  532593.91
RMSE:  729.79
R2:  0.97
MAE:  423.45




