In [None]:
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import linear_model as lm
from sklearn.preprocessing import MinMaxScaler
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_diabetes
from sklearn import tree
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import RidgeCV

In [None]:
# Load the scikit-learn diabetes regression dataset and keep the feature
# matrix and target vector under their own names.
dataset = load_diabetes()
data_x, data_y = dataset.data, dataset.target

# Quick sanity checks: array shapes and a count of missing feature values.
print(data_x.shape)
print(data_y.shape)
print("Number of nan's in dataset:", np.isnan(data_x).sum())

In [None]:
# Plot each (feature column, target) pair to eyeball the marginal relationships.
# data_x is a plain NumPy array (it has no .columns), so the x-axis label is
# taken from the dataset bunch's feature_names list instead.
for i in range(data_x.shape[1]):
    print("Feature %d" % (i))
    plt.plot(data_x[:, i], data_y, 'x')
    plt.xlabel(dataset.feature_names[i])
    plt.ylabel("Target")
    plt.grid()
    plt.show()

In [None]:
# Hold out 20% of the samples as a test set (shuffled, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    data_x,
    data_y,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

# The min-max scaler is fitted on the training split only and then applied to
# both splits, so no test-set statistics enter the scaling.
scaler = MinMaxScaler()
scaler.fit(X_train)

X_train_norm = scaler.transform(X_train)
X_test_norm = scaler.transform(X_test)

In [None]:
# Ordinary least squares baseline, scored with 10-fold cross-validation on the
# (unscaled) training split. The scorer returns *negative* MSE per fold, hence
# the sign flip when reporting the mean.
lr = lm.LinearRegression()
mse_train_ols = cross_val_score(
    lr,
    X_train,
    y_train,
    scoring='neg_mean_squared_error',
    cv=10,
)
print("Mean OLS Train MSE: %f (%f)" % (-mse_train_ols.mean(), mse_train_ols.std()))

In [None]:
# Select the ridge penalty strength by 10-fold CV over a decade grid
# (1e-3 ... 1e3) on the min-max scaled training features.
ridge_alphas = [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]
clf = RidgeCV(alphas=ridge_alphas, cv=10)
clf.fit(X_train_norm, y_train)
print("Ridge alpha=", clf.alpha_)

In [None]:
# Ridge regression with a fixed penalty (alpha=1.0), scored with 10-fold CV on
# the min-max scaled training features.
# The old `normalize=False` argument is intentionally not passed: it was the
# default, and the parameter was deprecated in scikit-learn 1.0 and removed in
# 1.2, where passing it raises a TypeError. Scaling is done explicitly upstream.
ridge = lm.Ridge(alpha=1.0)
# The scorer yields negative MSE per fold; flip the sign for reporting.
mse_train_ridge = cross_val_score(ridge, X_train_norm, y_train, cv=10, scoring='neg_mean_squared_error')
print("Mean Ridge Train MSE: %f (%f)" % (-np.mean(mse_train_ridge), np.std(mse_train_ridge)))

In [None]:
# Logistic Regression
# NOTE(review): LogisticRegression is a classifier, so every distinct target
# value is treated as its own class here and the MSE below is computed on the
# predicted class labels - confirm this is the intended comparison.
logr = lm.LogisticRegression(
    penalty='l2',
    C=100.0,
    multi_class='multinomial',
    solver='lbfgs',
    max_iter=10000,
    tol=1e-3,
    verbose=1,
)
# 5-fold CV on the scaled training features; the scorer returns negative MSE.
mse_train_logr = cross_val_score(
    logr,
    X_train_norm,
    y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)
print("Mean Logistic Regression Train MSE", -mse_train_logr.mean())

In [None]:
# TUNE SVR
# Exhaustive search over powers of two:
#   C       in 2^-3  ... 2^15
#   epsilon in 2^-15 ... 2^3
Cs = [2**k for k in range(-3, 16)]
eps = [2**k for k in range(-15, 4)]

parameters = dict(C=Cs, epsilon=eps)
svc = svm.SVR(gamma='scale', kernel='rbf')

# 5-fold CV on the scaled training features, using all cores. The reported
# best score is a *negative* MSE (closer to zero is better).
clf = GridSearchCV(
    estimator=svc,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
clf.fit(X_train_norm, y_train)

best_summary = (clf.best_params_, clf.best_score_)
print("The best parameters are %s with a score of %0.2f" % best_summary)


In [None]:
# SVR with hard-coded hyper-parameters (C=64, epsilon=6e-5), trained on the
# min-max scaled training features.
clf2 = svm.SVR(C=64, epsilon=6*1e-5, gamma='scale')
clf2.fit(X_train_norm, y_train)

# Hold-out error. Printed explicitly: a bare expression in the middle of a
# cell is not displayed by the notebook.
y_hat = clf2.predict(X_test_norm)
mse_svr = np.mean((y_test - y_hat)**2)
print("SVR TEST MSE:", mse_svr)

# 10-fold CV estimate on the training split; the scorer returns negative MSE.
mse_train_svm = cross_val_score(clf2, X_train_norm, y_train, cv=10, scoring='neg_mean_squared_error')
# Use % formatting here; a comma would print the raw format string followed
# by the tuple instead of the formatted numbers.
print("Mean SVR Train MSE: %f (%f)" % (-np.mean(mse_train_svm), np.std(mse_train_svm)))

In [None]:
# Random Forests tune
# Grid over the split / leaf size fractions and the number of features tried
# per split; forest size (50 trees) and depth (unbounded) are held fixed.
min_samples_split = [0.1*x for x in range(1, 6)]
max_depth = [None]
# Try every feature count from 1 up to the number of columns in the data.
max_features = list(range(1, X_train.shape[1] + 1))
min_samples_leaf = [0.1*x for x in range(1, 6)]
n_estimators = [50]
parameters = {'n_estimators': n_estimators, 'max_depth': max_depth, 'min_samples_split': min_samples_split, 'max_features': max_features, 'min_samples_leaf': min_samples_leaf}

# `criterion` and `min_impurity_split` are deliberately left at their defaults:
# criterion='mse' was renamed to 'squared_error' (old name removed in
# scikit-learn 1.2) and min_impurity_split was removed in 1.0, so passing them
# fails on current releases. The default criterion is squared error in every
# version, so the fitted model is unchanged.
rrforest = ensemble.RandomForestRegressor(min_weight_fraction_leaf=0.0,
                                          max_leaf_nodes=None, min_impurity_decrease=0.0,
                                          bootstrap=True, oob_score=True,
                                          n_jobs=1, random_state=0, verbose=0,
                                          warm_start=False)

# 10-fold CV on the unscaled training features; best_score_ is a negative MSE.
clf = GridSearchCV(rrforest, parameters, cv=10, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)
clf.fit(X_train, y_train)
print("The best parameters are %s with a score of %0.2f"
      % (clf.best_params_, clf.best_score_))

In [None]:
# Random Forests with hard-coded hyper-parameters.
# `criterion` and `min_impurity_split` are deliberately left at their defaults:
# criterion='mse' was renamed to 'squared_error' (old name removed in
# scikit-learn 1.2) and min_impurity_split was removed in 1.0, so passing them
# fails on current releases. The default criterion is squared error in every
# version, so the fitted model is unchanged.
rrforest = ensemble.RandomForestRegressor(n_estimators=50, max_depth=None,
                                          min_samples_split=0.1, min_samples_leaf=0.1,
                                          min_weight_fraction_leaf=0.0, max_features=6,
                                          max_leaf_nodes=None, min_impurity_decrease=0.0,
                                          bootstrap=True, oob_score=True,
                                          n_jobs=1, random_state=0, verbose=0,
                                          warm_start=False)

# 10-fold CV estimate on the training split (sign flipped to a positive MSE).
rrforest_scores = -cross_val_score(rrforest, X_train, y_train, cv=10,
                                   scoring='neg_mean_squared_error')
print("Random forests on dataset, estimated MSE: %f (%f)" % (np.mean(rrforest_scores), np.std(rrforest_scores)))

# Refit on the whole training split and report the hold-out error.
rrforest.fit(X_train, y_train)
y_hat = rrforest.predict(X_test)
mse_rf = np.mean((y_test - y_hat)**2)
print("Random Forests TEST MSE:", mse_rf)

In [None]:
# Model comparisons with results in test
#Define function for MAPE calculation
def mape(y_true,y_pred):
    """Return the mean absolute relative error between targets and predictions.

    Both arguments are converted to NumPy arrays. The result is the mean of
    ``|y_true - y_pred| / |y_true|`` expressed as a fraction (it is not
    multiplied by 100).
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_error = (actual - predicted) / actual
    return np.mean(np.abs(relative_error))

# Each model is refitted on the training split and scored on the held-out
# test split. Linear/kernel models that were tuned on min-max scaled features
# are also evaluated on scaled features.

# OLS (unscaled features)
lr = lm.LinearRegression().fit(X_train, y_train)
yhat = lr.predict(X_test)
mse_ols = np.mean((y_test - yhat)**2)
# MAE and MAPE are computed but not printed in this cell.
mae_ols = np.mean(np.fabs(y_test - yhat))
my_mape = mape(y_test, yhat)
print('MSE TEST OLS: %f' % mse_ols)

# RIDGE (scaled features)
# `normalize=False` is not passed: it was the default, and the parameter was
# deprecated in scikit-learn 1.0 and removed in 1.2, where it raises TypeError.
ridge = lm.Ridge(alpha=1.0)
ridge.fit(X_train_norm, y_train)
yhat = ridge.predict(X_test_norm)
mse_ridge = np.mean((y_test - yhat)**2)
mae_ridge = np.mean(np.fabs(y_test - yhat))
my_mape = mape(y_test, yhat)
print('MSE TEST RIDGE: %f' % (mse_ridge))

# RANDOM FORESTS (unscaled features); reuses the `rrforest` estimator
# configured in the Random Forests cell.
rrforest.fit(X_train, y_train)
y_hat = rrforest.predict(X_test)
mse_rrf = np.mean((y_test - y_hat)**2)
print("MSE TEST RAND FOR", mse_rrf)

# SVR (scaled features)
clf2 = svm.SVR(C=64.0, epsilon=6*1e-5, gamma='scale')
clf2.fit(X_train_norm, y_train)
y_hat = clf2.predict(X_test_norm)
mse_svr = np.mean((y_test - y_hat)**2)
print("MSE TEST SVR", mse_svr)
print("MAPE TEST SVR", mape(y_test, y_hat))



In [None]:
# SVR CHOSEN
# FIT AND PREPARE FOR USE
# MY ESTIMATION FOR MSE IS 3376
# The SVR hyper-parameters were tuned and evaluated on min-max scaled
# features, so the final model must be trained on scaled features as well.
# The scaler is refitted on the full dataset to match the final training set;
# new samples must go through `final_scaler.transform` before `clf2.predict`.
final_scaler = MinMaxScaler().fit(data_x)
clf2.fit(final_scaler.transform(data_x), data_y)