In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import linear_model as lm
from sklearn.preprocessing import MinMaxScaler
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_diabetes
from sklearn import tree
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LassoCV


In [None]:
# Load the diabetes dataset shipped with scikit-learn and unpack it into the
# feature matrix (data_x) and the target vector (data_y).
dataset = load_diabetes()
data_x, data_y = dataset.data, dataset.target

# Sanity check: count how many entries of the feature matrix are NaN.
nan_count = np.isnan(data_x).sum()
print("Number of nan's in dataset:", nan_count)

In [None]:
# Positional hold-out split (no shuffling): the first 300 rows form the
# training set, every remaining row goes to the test set.
X_train, X_test = data_x[:300], data_x[300:]
y_train, y_test = data_y[:300], data_y[300:]

# Min-max scaling. The scaler is fitted on the training rows only, and the
# same fitted transform is then applied to both splits.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_norm, X_test_norm = (scaler.transform(part) for part in (X_train, X_test))


In [None]:
# Ordinary least squares baseline: 10-fold cross-validated MSE on the
# training split (this cell uses X_train, not X_train_norm).
from sklearn.model_selection import KFold  # not referenced anywhere in this cell

lr = lm.LinearRegression()
mse_train_ols = cross_val_score(
    lr,
    X_train,
    y_train,
    scoring='neg_mean_squared_error',
    cv=10,
)

# The scorer returns negated MSEs, so the mean is sign-flipped for display;
# the value in parentheses is the standard deviation across the folds.
ols_mean, ols_std = -np.mean(mse_train_ols), np.std(mse_train_ols)
print("Mean OLS Train MSE: %f (%f)" % (ols_mean, ols_std))

In [None]:
# Ridge regression.
# Step 1: choose alpha by 10-fold cross-validation over a fixed candidate list.
clf = RidgeCV(cv=10, alphas=[1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]).fit(X_train_norm, y_train)
print("Ridge alpha=", clf.alpha_)

# Step 2: report the 10-fold cross-validated MSE of a Ridge model using the
# selected alpha.
# `normalize` is deliberately not passed: False was its default value, and the
# keyword no longer exists in recent scikit-learn releases, so passing it
# explicitly makes the cell fail there without changing anything on old ones.
ridge = lm.Ridge(alpha=clf.alpha_)
mse_train_ridge = cross_val_score(ridge, X_train_norm, y_train, cv=10, scoring='neg_mean_squared_error')
# Scores are negated MSEs; flip the sign of the mean for display.
print("Mean Ridge Train MSE: %f (%f)" % (-np.mean(mse_train_ridge), np.std(mse_train_ridge)))

In [None]:
# Lasso regression.
# Step 1: let LassoCV (5-fold) pick alpha from a fixed candidate list.
lasso_alphas = [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]
reg = LassoCV(alphas=lasso_alphas, cv=5)
reg.fit(X_train_norm, y_train)
print("Alpha for Lasso:", reg.alpha_)

# Step 2: fit a Lasso with the selected alpha on the scaled training split,
# then report its 10-fold cross-validated MSE (scores are negated MSEs).
lasso = lm.Lasso(alpha=reg.alpha_)
lasso.fit(X_train_norm, y_train)
mse_train_lasso = cross_val_score(
    lasso,
    X_train_norm,
    y_train,
    scoring='neg_mean_squared_error',
    cv=10,
)
lasso_mean, lasso_std = -np.mean(mse_train_lasso), np.std(mse_train_lasso)
print("Mean Lasso Train MSE: %f (%f)" % (lasso_mean, lasso_std))

In [None]:
# Logistic regression (L2 penalty, multinomial, lbfgs solver) fitted on the
# scaled training split and scored with 5-fold CV using negated MSE.
# NOTE(review): y_train is the same target the regression cells use; fitting
# a classifier to it treats every distinct target value as a class label.
# Confirm that this is intended before relying on the number printed below.
logr = lm.LogisticRegression(
    penalty='l2',
    C=10.0,
    multi_class='multinomial',
    solver='lbfgs',
    max_iter=10000,
    tol=1e-3,
    verbose=1,
)
logr.fit(X_train_norm, y_train)

mse_train_logr = cross_val_score(
    logr,
    X_train_norm,
    y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)
logr_mean, logr_std = -np.mean(mse_train_logr), np.std(mse_train_logr)
print("Mean Logistic Regression Train MSE: %f (%f)" % (logr_mean, logr_std))

In [None]:
# Tune the SVR (RBF kernel) hyper-parameters with an exhaustive grid search.
# Both grids are powers of two: C in 2^-3 .. 2^15, epsilon in 2^-15 .. 2^3.
Cs = [2**k for k in range(-3, 16)]
eps = [2**k for k in range(-15, 4)]

parameters = {'C': Cs, 'epsilon': eps}
svc = svm.SVR(kernel='rbf', gamma='scale')

# Use plain 5-fold cross-validation. An integer `cv` selects KFold for
# regressors. The previous StratifiedKFold splitter is meant for class labels:
# applied to this regression target it would treat distinct target values as
# classes, which is not a meaningful stratification.
clf = GridSearchCV(svc, parameters, cv=5, verbose=2, n_jobs=-1, scoring='neg_mean_squared_error')
clf.fit(X_train_norm, y_train)
# best_score_ is a negated MSE (higher is better).
print("The best parameters are %s with a score of %f" % (clf.best_params_, clf.best_score_))

In [None]:
# SVR with the hyper-parameters hard-coded in this cell (C=128, epsilon=8).
svr_model = svm.SVR(C=128.0, epsilon=8, gamma='scale')
# Keep the fitted model: the final cell calls svr_model.predict on the test split.
svr_model.fit(X_train_norm, y_train)

# 10-fold cross-validated MSE on the training split (scores are negated MSEs).
mse_train_svm = cross_val_score(svr_model, X_train_norm, y_train, cv=10, scoring='neg_mean_squared_error')
# Apply the format string with `%`; passing the tuple as a second print
# argument would emit the raw "%f (%f)" template followed by the tuple.
print("Mean SVR Train MSE: %f (%f)" % (-np.mean(mse_train_svm), np.std(mse_train_svm)))

In [None]:
# Tune DecisionTreeRegressor hyper-parameters with an exhaustive grid search.
# min_samples_split / min_samples_leaf are given as fractions of the samples
# (floats); max_depth and max_features are absolute integer values.
min_samples_split = [0.1 * x for x in range(1, 11)]
max_depth = list(range(1, 31))
max_features = list(range(1, 11))
min_samples_leaf = [0.1 * x for x in range(1, 6)]
parameters = {
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'max_features': max_features,
    'min_samples_leaf': min_samples_leaf,
}

# criterion='mse', min_impurity_split=None and presort=False are no longer
# passed explicitly. They were the defaults, so behaviour is unchanged on the
# scikit-learn versions that accepted them, and later versions have renamed or
# removed these keywords and would reject the call.
rtree = tree.DecisionTreeRegressor(splitter='best',
                                   min_weight_fraction_leaf=0.0,
                                   random_state=0, max_leaf_nodes=None,
                                   min_impurity_decrease=0.0)

# 10-fold CV over the full grid; best_score_ is a negated MSE.
clf = GridSearchCV(rtree, parameters, cv=10, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)
clf.fit(X_train, y_train)
print("The best parameters are %s with a score of %f" % (clf.best_params_, clf.best_score_))

In [None]:
# DecisionTreeRegressor with hyper-parameters hard-coded in this cell.
# criterion='mse', min_impurity_split=None and presort=False are omitted: they
# were default values, and later scikit-learn releases renamed or removed
# these keywords, so spelling them out only makes the cell fail there.
rtree = tree.DecisionTreeRegressor(splitter='best', max_depth=4,
                                   min_samples_split=0.1, min_samples_leaf=0.1,
                                   min_weight_fraction_leaf=0.0, max_features=9,
                                   random_state=0, max_leaf_nodes=None,
                                   min_impurity_decrease=0.0)

# 5-fold cross-validated MSE on the training split; the scorer returns negated
# MSEs, hence the leading minus sign.
rtree_scores = -cross_val_score(rtree, X_train, y_train, cv=5,
                                scoring='neg_mean_squared_error')

# Fit on the full training split as well.
rtree.fit(X_train, y_train)

print("Regression trees on train, estimated MSE: %f (%f)" % (np.mean(rtree_scores), np.std(rtree_scores)))

#y_hat = rtree.predict(X_test)
#mse_rtree=np.mean((y_test-y_hat)**2)
#print("DecisionTreeRegressor TEST MSE:",mse_rtree)


In [None]:
# Tune RandomForestRegressor hyper-parameters with an exhaustive grid search.
# min_samples_split is a fraction of the samples (float); the other grids hold
# absolute integer values. n_estimators runs over 1, 11, 21, ..., 91.
min_samples_split = [0.1 * x for x in range(1, 11)]
max_depth = list(range(1, 31))
max_features = list(range(1, 11))
min_samples_leaf = [1]
n_estimators = list(range(1, 100, 10))
parameters = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'max_features': max_features,
    'min_samples_leaf': min_samples_leaf,
}

# criterion='mse' and min_impurity_split=None are no longer passed explicitly.
# They were the defaults, so behaviour is unchanged on the scikit-learn
# versions that accepted them, and later versions have renamed or removed
# these keywords and would reject the call.
rrforest = ensemble.RandomForestRegressor(min_weight_fraction_leaf=0.0,
                                          max_leaf_nodes=None, min_impurity_decrease=0.0,
                                          bootstrap=True, oob_score=True,
                                          n_jobs=1, random_state=0, verbose=0,
                                          warm_start=False)

# 10-fold CV over the full grid; best_score_ is a negated MSE.
# NOTE(review): the grid has 10 * 30 * 10 * 1 * 10 = 30,000 combinations, each
# fitted 10 times -- expect a long run.
clf = GridSearchCV(rrforest, parameters, cv=10, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)
clf.fit(X_train, y_train)
print("The best parameters are %s with a score of %f" % (clf.best_params_, clf.best_score_))

In [None]:
# RandomForestRegressor with hyper-parameters hard-coded in this cell.
#max_features = int(np.ceil(np.sqrt(X_train.shape[1]) / 3.0))
# criterion='mse' and min_impurity_split=None are omitted: they were default
# values, and later scikit-learn releases renamed or removed these keywords.
# random_state is fixed to 0 (the seed the tuning cell uses) so the reported
# cross-validated MSE is reproducible from run to run.
rrforest = ensemble.RandomForestRegressor(n_estimators=30, max_depth=4,
                                          min_samples_split=0.1, min_samples_leaf=1,
                                          min_weight_fraction_leaf=0.0, max_features=6,
                                          max_leaf_nodes=None, min_impurity_decrease=0.0,
                                          bootstrap=True, oob_score=False,
                                          n_jobs=1, random_state=0, verbose=0,
                                          warm_start=False)

# 5-fold cross-validated MSE on the training split; the scorer returns negated
# MSEs, hence the leading minus sign.
rrforest_scores = -cross_val_score(rrforest, X_train, y_train, cv=5,
                                   scoring='neg_mean_squared_error')
print("Random forests on train, estimated MSE: %f (%f)" % (np.mean(rrforest_scores), np.std(rrforest_scores)))

# Fit on the full training split as well.
rrforest.fit(X_train, y_train)
#y_hat = rrforest.predict(X_test)
#mse_rf=np.mean((y_test-y_hat)**2)
#print("Random Forests TEST MSE:",mse_rf)

In [None]:
# Evaluate the chosen model (the fitted SVR) on the held-out test split:
# predict on the scaled test features and average the squared residuals.
y_hat = svr_model.predict(X_test_norm)
squared_errors = (y_test - y_hat) ** 2
mse_svr = squared_errors.mean()
print("SVR TEST MSE:", mse_svr)