In [1]:
import sys

# Put the parent directory on the import path; presumably that is where the
# `diamonds` package imported below lives (verify against the repo layout).
sys.path.append('../')

import numpy as np
from diamonds import experiments, normal_equation

import warnings
# Keep DeprecationWarning messages out of the cell outputs.
warnings.filterwarnings("ignore", category=DeprecationWarning)
# Metrics handed to the evaluation helper: display label -> scorer name.
scoring = {
    'Negative MSE': 'neg_mean_squared_error',
    'Negative MAE': 'neg_mean_absolute_error',
    'R2': 'r2',
}

# Presumably the fraction of data held out for validation; it is not
# referenced by any cell visible in this part of the notebook.
val_size = 0.15

# Empty parameter dict passed to experiments.get_sklearn_sgd below.
params = {}

  from numpy.core.umath_tests import inner1d


## Discussion 1
  - Using the sklearn SGDRegressor with basic params, we are now comparing the results.
  - Kept only the best results from the first experiment to run the GridSearch for the parameters.
  - Log(Y) made the algorithm more robust, reducing the mean value of the errors.
  - Scaling made the algorithm more robust, reducing the mean value of the errors.
  - The synthetic features, wherever they appeared, reduce the standard deviation of the MAE, MSE and RMSE.


In [2]:
# Build the baseline SGD regressor through the project helper; `params` is
# still the empty dict defined in the setup cell.
regr = experiments.get_sklearn_sgd(params)
# Show the estimator repr so the hyper-parameters in use are recorded in the output.
regr

SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01,
       fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
       loss='squared_loss', max_iter=None, n_iter=None, penalty=None,
       power_t=0.25, random_state=None, shuffle=True, tol=None,
       verbose=True, warm_start=False)

## Adding synthetic features ['volume', 'ratio_xy']

In [3]:
# Reload the training data, then build the CV folds and the train/test split.
# 'ratio_xz' is excluded, leaving 'volume' and 'ratio_xy' as the synthetic
# features for this run (per the section heading).
X = experiments.load_train_data()
folds, (X_train, X_test, y_train, y_test) = experiments.gen_splits(
    X,
    scale=True,
    exclude_features=['ratio_xz'],
)

### Log(price)

In [4]:
# Evaluate `regr` over the folds built above and print the summary metrics.
# log_y=True presumably fits against log(price), per this section's heading —
# confirm in experiments.kfold_evaluate.
experiments.kfold_evaluate(regr, folds, scoring, log_y=True)

Evaluating 0
Evaluating 1
Evaluating 2
Evaluating 3
Evaluating 4
RMSE: 	 1623.5299 +/- 98.0607
MSE:  	 2645465.1322 +/- 314672.4479
MAE:  	 719.4283 +/- 23.8322
R2:   	 0.8138 +/- 0.0222




## Adding synthetic features ['volume', 'ratio_xz']

In [5]:
# Reload the training data, then build the CV folds and the train/test split.
# 'ratio_xy' is excluded, leaving 'volume' and 'ratio_xz' as the synthetic
# features for this run (per the section heading).
X = experiments.load_train_data()
folds, (X_train, X_test, y_train, y_test) = experiments.gen_splits(
    X,
    scale=True,
    exclude_features=['ratio_xy'],
)

### Log(price)

In [6]:
# Evaluate `regr` over the folds built above and print the summary metrics.
# log_y=True presumably fits against log(price), per this section's heading —
# confirm in experiments.kfold_evaluate.
experiments.kfold_evaluate(regr, folds, scoring, log_y=True)

Evaluating 0
Evaluating 1
Evaluating 2
Evaluating 3
Evaluating 4
RMSE: 	 1588.3106 +/- 101.2143
MSE:  	 2532974.9834 +/- 336302.1876
MAE:  	 705.9682 +/- 29.9954
R2:   	 0.8218 +/- 0.0224




## Adding synthetic features ['volume', 'ratio_xy', 'ratio_xz']

In [7]:
# Reload the training data, then build the CV folds and the train/test split.
# Nothing is excluded here, so every feature stays in. The X_train / y_train
# produced by this cell are the ones the grid searches below fit on.
X = experiments.load_train_data()
folds, (X_train, X_test, y_train, y_test) = experiments.gen_splits(
    X,
    scale=True,
    exclude_features=[],
)

### Log(price)

In [8]:
# Evaluate `regr` over the folds built above and print the summary metrics.
# log_y=True presumably fits against log(price), per this section's heading —
# confirm in experiments.kfold_evaluate.
experiments.kfold_evaluate(regr, folds, scoring, log_y=True)

Evaluating 0
Evaluating 1
Evaluating 2
Evaluating 3
Evaluating 4
RMSE: 	 1589.3804 +/- 111.7284
MSE:  	 2538613.2083 +/- 366494.7774
MAE:  	 710.8763 +/- 31.5879
R2:   	 0.8214 +/- 0.0244




----

## Discussion 2
  - The results don't appear to have a statistically significant difference between them, since the mean and standard deviation are practically the same, although the result with the features included has a better R2.
  - The SGD reaches almost the same minimum as the normal equation results.
  - We are now running GridSearchCV for the SGD to look for better parameters, using the last dataset above.


In [None]:
from sklearn.model_selection import GridSearchCV

# Start from a fresh baseline estimator. This cell rebinds `regr` to the
# GridSearchCV at the bottom, so reusing `regr` as the base estimator would
# nest one search inside another whenever the cell is re-run.
base_regr = experiments.get_sklearn_sgd({})

# Hyper-parameter grid: 3 learning-rate schedules x 3 eta0 x 3 penalties = 27 candidates.
params = {
    'learning_rate': ['invscaling', 'optimal', 'constant'],
    'eta0': [0.1, 0.05, 0.01],  # 0.01 gave good results in the previous runs
    'penalty': ['l2', 'l1', None],  # these penalties are easier to implement if needed
    # Squared loss, since the custom implementation optimises the MSE loss.
    # NOTE(review): newer scikit-learn releases rename this to 'squared_error';
    # update if the environment is upgraded.
    'loss': ['squared_loss'],
    'max_iter': [5000],  # fixed iteration count to avoid long execution times
}

# Multi-metric scoring; the keys become the suffixes of the cv_results_ columns.
scoring = {
    '-MSE': 'neg_mean_squared_error',
    '-MAE': 'neg_mean_absolute_error',
    'R2': 'r2',
}

# Refit on R2 because it gave a clearer view of the results above than MSE and MAE did.
regr = GridSearchCV(base_regr, params, cv=5, scoring=scoring, refit='R2', n_jobs=-1, verbose=True)
# The target is log-transformed, matching the log_y=True evaluations above.
regr.fit(X_train, np.log(y_train))


Fitting 5 folds for each of 27 candidates, totalling 135 fits


In [None]:
# Estimator refit with the best-ranked parameters for the refit metric ('R2').
regr.best_estimator_

In [None]:
# Parameter combination that ranked first on the refit metric ('R2').
regr.best_params_

In [None]:
# List the available cv_results_ fields; the summary table below selects its columns by these names.
regr.cv_results_.keys()

In [None]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# cv_results_ fields kept for the summary: searched parameters, then rank / std / mean
# of each test metric, then the mean fit time.
columns = [
    'param_learning_rate',
    'param_max_iter',
    'param_loss',
    'param_eta0',
    'param_penalty',
    'rank_test_-MSE',
    'rank_test_-MAE',
    'rank_test_R2',
    'std_test_-MSE',
    'std_test_-MAE',
    'std_test_R2',
    'mean_test_-MSE',
    'mean_test_-MAE',
    'mean_test_R2',
    'mean_fit_time',
]

results = pd.DataFrame(regr.cv_results_)

# Ten best candidates by R2 rank, indexed by that rank. The original cell also
# called top10.sort_values(...) without using the result (a no-op) and mutated
# the frame in place; one chain yields the same table.
top10 = (
    results[columns]
    .sort_values(by=['rank_test_R2', 'mean_test_R2'])
    .head(10)
    .set_index('rank_test_R2')
)
top10

In [None]:
# Mean fit time of each of the top candidates (x axis: R2 rank).
top10['mean_fit_time'].plot.bar()

In [None]:
import matplotlib.pyplot as plt

# Mean cross-validated R2 per candidate, with its standard deviation as error bars.
plt.figure(figsize=(10, 5))
top10['mean_test_R2'].plot.bar(yerr=top10['std_test_R2'])

In [None]:
# Standard deviation of the cross-validated R2 for each of the top candidates.
plt.figure(figsize=(10, 5))
top10['std_test_R2'].plot.bar()

In [None]:
# Show the ranked table again, next to the plots above.
top10

## Refining the search
   - Given the ranking above, we select the top 3 configurations and re-run the grid search with more iterations.

In [None]:
# Fresh baseline estimator with per-iteration logging switched off.
regr = experiments.get_sklearn_sgd({})
regr.verbose = False

# Narrowed grid: 2 learning-rate schedules x 3 eta0 values = 6 candidates.
params = {
    'learning_rate': ['invscaling', 'constant'],
    'eta0': [0.1, 0.05, 0.01],  # 0.01 gave good results in the previous runs
    'penalty': [None],  # only the unpenalised setting is kept in this search
    'loss': ['squared_loss'],  # squared loss, since the custom implementation uses the MSE loss
    'max_iter': [100000],  # one fixed, larger iteration budget than the first search (5000)
}

# Same three metrics as the first search.
scoring = {
    '-MSE': 'neg_mean_squared_error',
    '-MAE': 'neg_mean_absolute_error',
    'R2': 'r2',
}

# Refit on R2 because it gave a clearer view of the results above than MSE and MAE did.
regr = GridSearchCV(
    estimator=regr,
    param_grid=params,
    cv=5,
    scoring=scoring,
    refit='R2',
    n_jobs=-1,
    verbose=True,
)
regr.fit(X_train, np.log(y_train))

In [None]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# cv_results_ fields kept for the summary: searched parameters, then rank / std / mean
# of each test metric, then the mean fit time.
columns = [
    'param_learning_rate',
    'param_max_iter',
    'param_loss',
    'param_eta0',
    'param_penalty',
    'rank_test_-MSE',
    'rank_test_-MAE',
    'rank_test_R2',
    'std_test_-MSE',
    'std_test_-MAE',
    'std_test_R2',
    'mean_test_-MSE',
    'mean_test_-MAE',
    'mean_test_R2',
    'mean_fit_time',
]

results = pd.DataFrame(regr.cv_results_)

# Up to ten best candidates by R2 rank, indexed by that rank. The original cell
# also called top10.sort_values(...) without using the result (a no-op) and
# mutated the frame in place; one chain yields the same table.
top10 = (
    results[columns]
    .sort_values(by=['rank_test_R2', 'mean_test_R2'])
    .head(10)
    .set_index('rank_test_R2')
)
top10

In [None]:
# Mean fit time of each candidate in the refined search (x axis: R2 rank).
top10['mean_fit_time'].plot.bar()

In [None]:
import matplotlib.pyplot as plt

# Mean cross-validated R2 per candidate, with its standard deviation as error bars.
plt.figure(figsize=(10, 5))
top10['mean_test_R2'].plot.bar(yerr=top10['std_test_R2'])

In [None]:
# Standard deviation of the cross-validated R2 for each candidate.
plt.figure(figsize=(10, 5))
top10['std_test_R2'].plot.bar()

In [None]:
# `rank_test_R2` became the index of `top10` (set_index in the cell that builds it),
# so order by the index directly. Looking it up through sort_values depends on the
# installed pandas resolving index level names as sort keys.
top10.sort_index()

In [None]:
# Same table, ordered by the MAE ranking instead of the R2 ranking.
top10.sort_values(by='rank_test_-MAE')

In [None]:
# Same table, ordered by the MSE ranking instead of the R2 ranking.
top10.sort_values(by='rank_test_-MSE')

## Best parameters found 

In [None]:
# Parameter combination from the refined search that ranked first on the refit metric ('R2').
regr.best_params_