# Linear regression models applied to ASOS_alone.pickle

## Load dataset

In [1]:
# import some function we can use later
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
import os
import sys
from IPython.display import display, Image
from six.moves import cPickle as pickle
%matplotlib inline

In [2]:
# Load the pickled dataset splits from disk.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
pickle_file = os.path.join('/home/nfs/mjmu/haiming/data/visibility/', 'ASOS_alone.pickle')

with open(pickle_file, 'rb') as f:
    save = pickle.load(f)

# Pull out each split, then drop the container dict.
train_dataset = save['train_dataset']
validate_dataset = save['validate_dataset']
test_dataset = save['test_dataset']
test_old = save['test_dataset_evan']
del save
    

In [3]:
# Unpack every split into its timestamp, feature and label arrays.
train_time, train_data, train_label = (
    train_dataset['time'], train_dataset['data'], train_dataset['label'])
validate_time, validate_data, validate_label = (
    validate_dataset['time'], validate_dataset['data'], validate_dataset['label'])
test_time, test_data, test_label = (
    test_dataset['time'], test_dataset['data'], test_dataset['label'])
# Only features and labels are read from the older test split.
test_old_data, test_old_label = test_old['data'], test_old['label']

In [4]:
# Show (features, labels) shapes for the train, validate and test splits.
train_shapes = (train_data.shape, train_label.shape)
print(*train_shapes)
validate_shapes = (validate_data.shape, validate_label.shape)
print(*validate_shapes)
test_shapes = (test_data.shape, test_label.shape)
print(*test_shapes)

(176987, 70) (176987,)
(25283, 70) (25283,)
(50569, 70) (50569,)


In [5]:
# Peek at the first ten training labels as a sanity check on the target values.
print(train_label[:10])

[  9.  10.  10.   9.   8.   9.   8.   7.   7.   6.]


## Normalize the dataset

In [6]:
#dataset normalize
# Standardize every split with the per-feature mean and standard deviation of
# the TRAINING set only, so all splits share one scaling.
mean = train_data.mean(axis = 0)
std = train_data.std(axis = 0)
print(mean.shape, std.shape)
train_data_n = (train_data - mean)/std
validate_data_n = (validate_data - mean)/std
test_data_n = (test_data - mean)/std
# Later cells evaluate on `test_old_data_n`, which was never created and
# raised NameError; normalize the older test split the same way.
# assumes test_old_data has the same feature columns as train_data -- TODO confirm
test_old_data_n = (test_old_data - mean)/std

(70,) (70,)


In [7]:
def MAE(clf, n_data, label):
    """Return the mean absolute error of a fitted model on one data split.

    Parameters
    ----------
    clf : object exposing ``predict(n_data)``.
    n_data : feature matrix passed straight to ``clf.predict``.
    label : target values; subtracted element-wise from the predictions.

    Returns
    -------
    The mean of the absolute prediction errors.
    """
    residual = clf.predict(n_data) - label
    return abs(residual).mean()

## simple linear model

In [8]:
# GridSearchCV lives in sklearn.model_selection since scikit-learn 0.18, and
# the old sklearn.grid_search module was removed in 0.20. Prefer the new
# location and fall back to the legacy one on older installations.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

In [9]:
#SGDRegressor
# SGD has been applied to large-scale and sparse machine learning problems;
# it can be used when the training set is larger than 100,000 samples.
from sklearn import linear_model

# Search over the loss function and the regularization strength.
param_grid = {
    'loss': ['squared_loss', 'huber'],
    'alpha': [1e-5, 1e-4, 1e-3, 1e-2],
}
gs = GridSearchCV(estimator=linear_model.SGDRegressor(), param_grid=param_grid)

gs.fit(train_data_n, train_label)

GridSearchCV(cv=None, error_score='raise',
       estimator=SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01,
       fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
       loss='squared_loss', n_iter=5, penalty='l2', power_t=0.25,
       random_state=None, shuffle=True, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'loss': ['squared_loss', 'huber'], 'alpha': [1e-05, 0.0001, 0.001, 0.01]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [10]:
# Best cross-validated score found by the SGD grid search above.
# NOTE(review): the search was built with scoring=None, so this is presumably
# the estimator's default score (R^2 for regressors) rather than MAE -- confirm.
print(gs.best_score_)

0.792249673681


In [11]:
# Mean absolute error of the refitted SGD search on each split; every score is
# printed as soon as it has been computed.
train_mae = MAE(gs, train_data_n, train_label)
print("train MAE = %f" % train_mae)
validate_mae = MAE(gs, validate_data_n, validate_label)
print("validate MAE = %f" % validate_mae)
test_mae = MAE(gs, test_data_n, test_label)
print("test MAE = %f" % test_mae)

train MAE = 0.536110
validate MAE = 0.477010
test MAE = 0.527936


In [12]:
#ridge regression
# Grid-search the regularization strength over several orders of magnitude.
param_grid = {
    'alpha': [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 0.5, 1.0, 1.5, 2.0, 5.0, 10.0, 100.0, 1000.0],
}
gs = GridSearchCV(estimator=linear_model.Ridge(), param_grid=param_grid)
gs.fit(train_data_n, train_label)
print(gs.best_params_)

{'alpha': 100.0}


In [13]:
# Mean absolute error of the refitted ridge search on each split; every score
# is printed as soon as it has been computed.
train_mae = MAE(gs, train_data_n, train_label)
print("train MAE = %f" % train_mae)
validate_mae = MAE(gs, validate_data_n, validate_label)
print("validate MAE = %f" % validate_mae)
test_mae = MAE(gs, test_data_n, test_label)
print("test MAE = %f" % test_mae)

train MAE = 0.523842
validate MAE = 0.462181
test MAE = 0.513353


In [14]:
#linear regression
# Plain LinearRegression with default settings, fitted on the normalized
# training features as an unregularized baseline.
clf = linear_model.LinearRegression()
clf.fit(train_data_n, train_label)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [15]:
# Mean absolute error of the linear-regression baseline on each split; every
# score is printed as soon as it has been computed.
train_mae = MAE(clf, train_data_n, train_label)
print("train MAE = %f" % train_mae)
validate_mae = MAE(clf, validate_data_n, validate_label)
print("validate MAE = %f" % validate_mae)
test_mae = MAE(clf, test_data_n, test_label)
print("test MAE = %f" % test_mae)

train MAE = 0.523641
validate MAE = 0.462749
test MAE = 0.513083


In [19]:
#Lasso model
# Grid-search both the L1 penalty strength and the solver's iteration cap.
param_grid = {
    'alpha': [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 0.5, 1.0, 1.5, 2.0, 5.0, 10.0, 100.0, 1000.0],
    'max_iter': [100, 1000, 5000, 10000],
}
gs = GridSearchCV(estimator=linear_model.Lasso(), param_grid=param_grid)
gs.fit(train_data_n, train_label)

GridSearchCV(cv=None, error_score='raise',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': [1e-05, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 1.5, 2.0, 5.0, 10.0, 100.0, 1000.0], 'max_iter': [100, 1000, 5000, 10000]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [20]:
# Report the winning Lasso hyper-parameters, then the mean absolute error of
# the refitted search on each split.
print(gs.best_params_)
train_mae = MAE(gs, train_data_n, train_label)
print("train MAE = %f" % train_mae)
validate_mae = MAE(gs, validate_data_n, validate_label)
print("validate MAE = %f" % validate_mae)
test_mae = MAE(gs, test_data_n, test_label)
print("test MAE = %f" % test_mae)

{'alpha': 1e-05, 'max_iter': 10000}
train MAE = 0.524026
validate MAE = 0.462929
test MAE = 0.513531


In [None]:
#ElasticNet model
# Fixed alpha=0.1 (no grid search here), fitted on the normalized training set.
clf = linear_model.ElasticNet(alpha=0.1)
clf.fit(train_data_n, train_label)

In [None]:
# Mean absolute error of the current `clf` on each split.
# The line labelled "test MAE" is computed on the older test split
# (test_old_*), not on test_data_n.
train_mae = MAE(clf, train_data_n, train_label)
print("train MAE = %f" % train_mae)
validate_mae = MAE(clf, validate_data_n, validate_label)
print("validate MAE = %f" % validate_mae)
test_old_mae = MAE(clf, test_old_data_n, test_old_label)
print("test MAE = %f" % test_old_mae)

In [None]:
#Lars model
# Lars with default settings, fitted on the normalized training set.
clf = linear_model.Lars()
clf.fit(train_data_n, train_label)

In [None]:
# Mean absolute error of the current `clf` on each split.
# The line labelled "test MAE" is computed on the older test split
# (test_old_*), not on test_data_n.
train_mae = MAE(clf, train_data_n, train_label)
print("train MAE = %f" % train_mae)
validate_mae = MAE(clf, validate_data_n, validate_label)
print("validate MAE = %f" % validate_mae)
test_old_mae = MAE(clf, test_old_data_n, test_old_label)
print("test MAE = %f" % test_old_mae)

In [None]:
#LassoLars model
# Fixed alpha = 0.1 (no grid search here), fitted on the normalized training set.
clf = linear_model.LassoLars(alpha = 0.1)
clf.fit(train_data_n, train_label)

In [None]:
# Mean absolute error of the current `clf` on each split.
# The line labelled "test MAE" is computed on the older test split
# (test_old_*), not on test_data_n.
train_mae = MAE(clf, train_data_n, train_label)
print("train MAE = %f" % train_mae)
validate_mae = MAE(clf, validate_data_n, validate_label)
print("validate MAE = %f" % validate_mae)
test_old_mae = MAE(clf, test_old_data_n, test_old_label)
print("test MAE = %f" % test_old_mae)

In [None]:
#OrthogonalMatchingPursuit
# Default settings, fitted on the normalized training set.
clf = linear_model.OrthogonalMatchingPursuit()
clf.fit(train_data_n, train_label)

In [None]:
# Mean absolute error of the current `clf` on each split.
# The line labelled "test MAE" is computed on the older test split
# (test_old_*), not on test_data_n.
train_mae = MAE(clf, train_data_n, train_label)
print("train MAE = %f" % train_mae)
validate_mae = MAE(clf, validate_data_n, validate_label)
print("validate MAE = %f" % validate_mae)
test_old_mae = MAE(clf, test_old_data_n, test_old_label)
print("test MAE = %f" % test_old_mae)

In [None]:
#BayesianRidge
# Default settings, fitted on the normalized training set.
clf = linear_model.BayesianRidge()
clf.fit(train_data_n, train_label)

In [None]:
# Mean absolute error of the current `clf` on each split.
# The line labelled "test MAE" is computed on the older test split
# (test_old_*), not on test_data_n.
train_mae = MAE(clf, train_data_n, train_label)
print("train MAE = %f" % train_mae)
validate_mae = MAE(clf, validate_data_n, validate_label)
print("validate MAE = %f" % validate_mae)
test_old_mae = MAE(clf, test_old_data_n, test_old_label)
print("test MAE = %f" % test_old_mae)

In [None]:
#PassiveAggressiveRegressor
# Default settings, fitted on the normalized training set.
# NOTE(review): no random_state is passed, so results may vary between runs -- confirm.
clf = linear_model.PassiveAggressiveRegressor()
clf.fit(train_data_n, train_label)

In [None]:
# Mean absolute error of the current `clf` on each split.
# The line labelled "test MAE" is computed on the older test split
# (test_old_*), not on test_data_n.
train_mae = MAE(clf, train_data_n, train_label)
print("train MAE = %f" % train_mae)
validate_mae = MAE(clf, validate_data_n, validate_label)
print("validate MAE = %f" % validate_mae)
test_old_mae = MAE(clf, test_old_data_n, test_old_label)
print("test MAE = %f" % test_old_mae)

In [None]:
#TheilSenRegressor
#clf = linear_model.TheilSenRegressor()
#clf.fit(train_data_n, train_label)

In [None]:
#print("train MAE = %f" % (MAE(clf, train_data_n, train_label)))
#print("validate MAE = %f" % (MAE(clf, validate_data_n, validate_label)))

In [None]:
#try polynomial regression:
# Expand the raw (un-normalized) training features with all degree-2
# polynomial terms; normalization happens in a later cell.
from sklearn.preprocessing import PolynomialFeatures
# NOTE(review): by default PolynomialFeatures also emits a constant bias
# column (include_bias=True per the scikit-learn docs) -- confirm; a constant
# column has zero standard deviation when standardized.
ploy = PolynomialFeatures(degree = 2)
poly_data = ploy.fit_transform(train_data)

In [None]:
# Display the shape of the expanded feature matrix.
poly_data.shape

## nonlinear model

In [None]:
from sklearn.preprocessing import scale
# Standardize the polynomial features column by column.
# A column with zero spread (e.g. a constant bias column) would give
# 0/0 = NaN, so such columns are divided by 1 instead and become all zeros.
poly_mean = poly_data.mean(axis = 0)
poly_std = poly_data.std(axis = 0)
poly_std = np.where(poly_std == 0, 1.0, poly_std)
poly_data_n = (poly_data - poly_mean) / poly_std
print(poly_data_n.shape)

In [None]:
# Fit a default SGDRegressor on the normalized degree-2 polynomial features.
clf = linear_model.SGDRegressor()
clf.fit(poly_data_n, train_label)

In [None]:
# Training-set MAE only: validate/test polynomial features are not built in
# the visible cells.
print("train MAE = %f" % (MAE(clf, poly_data_n, train_label)))

The result is very bad if we add more degrees of freedom (here, the degree-2 polynomial features).

In [21]:
#decision tree
from sklearn.tree import DecisionTreeRegressor
# Candidate tree depths 1 through 13 for the grid search.
param_grid = {'max_depth': list(range(1, 14))}
# Grid-search the tree depth on the normalized training set.
gs = GridSearchCV(DecisionTreeRegressor(), param_grid=param_grid)
gs.fit(train_data_n, train_label)

GridSearchCV(cv=None, error_score='raise',
       estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [23]:
# Report the winning tree depth, then the mean absolute error of the refitted
# search on each split.
print(gs.best_params_)
train_mae = MAE(gs, train_data_n, train_label)
print("train MAE = %f" % train_mae)
validate_mae = MAE(gs, validate_data_n, validate_label)
print("validate MAE = %f" % validate_mae)
test_mae = MAE(gs, test_data_n, test_label)
print("test MAE = %f" % test_mae)

{'max_depth': 6}
train MAE = 0.455459
validate MAE = 0.405424
test MAE = 0.423601


In [None]:
# Sweep the decision-tree depth from 2 to 10 and record the mean absolute
# error on the train, validate and older test splits for each depth.
# `regr_tree` is left holding the last (deepest) fitted tree.
train_MAE_list, validata_MAE_list, test_old_MAE_list = [], [], []
for max_d in range(2, 11):
    regr_tree = DecisionTreeRegressor(max_depth=max_d)
    regr_tree.fit(train_data_n, train_label)
    print("max_depth = %i" % max_d)
    train_MAE_list += [MAE(regr_tree, train_data_n, train_label)]
    validata_MAE_list += [MAE(regr_tree, validate_data_n, validate_label)]
    test_old_MAE_list += [MAE(regr_tree, test_old_data_n, test_old_label)]

In [None]:
# Plot MAE against the actual tree depth. Previously the x-axis was the list
# index (0, 1, ...), which is off by two from max_depth because the sweep
# that fills these lists starts at max_depth=2.
tree_depths = list(range(2, 2 + len(train_MAE_list)))
plt.plot(tree_depths, train_MAE_list, marker='o', label='train MAE')
plt.plot(tree_depths, validata_MAE_list, marker='o', label = 'validate MAE')
plt.plot(tree_depths, test_old_MAE_list, marker='o', label = 'test_old MAE')
plt.xlabel('max_depth')
plt.ylabel('MAE')
plt.legend()

In [None]:
from sklearn import tree
from sklearn.externals.six import StringIO
import pydot


In [None]:
# Write the most recently fitted decision tree (`regr_tree`) to a Graphviz
# DOT file. The result of export_graphviz is intentionally not assigned:
# rebinding `f` inside the `with` block shadowed the open file handle.
with open("tree.dot", 'w') as f:
    tree.export_graphviz(regr_tree, out_file=f)


In [None]:
#random forest
# Random forest of depth-4 trees, fitted on the normalized training set.
# NOTE(review): no random_state is passed, so results may vary between runs -- confirm.
from sklearn.ensemble import RandomForestRegressor
regr = RandomForestRegressor(max_depth=4)
regr.fit(train_data_n, train_label)

In [None]:
# Mean absolute error of the random forest `regr` on each split.
# The line labelled "test MAE" is computed on the older test split
# (test_old_*), not on test_data_n.
train_mae = MAE(regr, train_data_n, train_label)
print("train MAE = %f" % train_mae)
validate_mae = MAE(regr, validate_data_n, validate_label)
print("validate MAE = %f" % validate_mae)
test_old_mae = MAE(regr, test_old_data_n, test_old_label)
print("test MAE = %f" % test_old_mae)

In [None]:
# Sweep the random-forest tree depth from 2 to 20 and record the mean absolute
# error on the train, validate and older test splits for each depth.
# `regr` is left holding the last (deepest) fitted forest.
train_MAE_list, validata_MAE_list, test_old_MAE_list = [], [], []
for max_d in range(2, 21):
    regr = RandomForestRegressor(max_depth=max_d)
    regr.fit(train_data_n, train_label)
    print("max_depth = %i" % max_d)
    train_MAE_list += [MAE(regr, train_data_n, train_label)]
    validata_MAE_list += [MAE(regr, validate_data_n, validate_label)]
    test_old_MAE_list += [MAE(regr, test_old_data_n, test_old_label)]

In [None]:
# Plot validate / older-test MAE against the actual forest depth. Previously
# the x-axis was the list index (0, 1, ...), which is off by two from
# max_depth because the sweep that fills these lists starts at max_depth=2.
plt.figure(figsize = (15, 10))
forest_depths = list(range(2, 2 + len(validata_MAE_list)))
#plt.plot(forest_depths, train_MAE_list, marker='o', label='train MAE')
plt.plot(forest_depths, validata_MAE_list, marker='o', label = 'validate MAE')
plt.plot(forest_depths, test_old_MAE_list, marker='o', label = 'test_old MAE')
plt.xlabel('max_depth')
plt.ylabel('MAE')
plt.legend()