In [25]:
import json
import numpy as np
import pandas as pd
import numpy.linalg as alg

# Read 'winequality-white.csv' (fields are separated by ';') into a DataFrame
# so the raw columns can be inspected before any modelling.
x = pd.read_csv('winequality-white.csv',sep=';')
# Last expression of the cell: Jupyter renders the requested leading rows.
x.head(11)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.2,0.4,0.62,10.8,0.041,70.0,189.0,0.9976,3.08,0.49,8.6,4
1,7.2,0.28,0.54,16.7,0.045,54.0,200.0,0.999,3.08,0.49,9.5,6
2,6.8,0.19,0.58,14.2,0.038,51.0,164.0,0.9975,3.12,0.48,9.6,6
3,6.4,0.3,0.3,2.25,0.038,8.0,210.0,0.9937,3.2,0.62,9.9,6
4,6.5,0.3,0.29,2.25,0.037,8.0,210.0,0.9937,3.19,0.62,9.9,5
5,7.8,0.18,0.31,12.2,0.053,46.0,140.0,0.998,3.06,0.53,8.9,6
6,7.8,0.18,0.31,12.2,0.053,46.0,140.0,0.998,3.06,0.53,8.9,6
7,7.3,0.51,0.26,3.3,0.09,7.0,135.0,0.9944,3.01,0.52,8.8,5
8,6.0,0.24,0.27,1.9,0.048,40.0,170.0,0.9938,3.64,0.54,10.0,7
9,5.9,0.62,0.28,3.5,0.039,55.0,152.0,0.9907,3.44,0.44,12.0,6


In [26]:
def data_processing_linear_regression(filename, non_invertible, mapping, mapping_power):
    """Load a ';'-separated CSV and split it into train / validation / test sets.

    The last column of the file is the regression target; every other column
    is a feature.  Rows are shuffled with a fixed seed and split 80% / 10% /
    remainder.  A leading column of ones (bias term) is prepended to each
    feature matrix.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.
    non_invertible : bool
        When True, one pseudo-randomly chosen column and one pseudo-randomly
        chosen row of ``Xtrain`` are set to zero, which makes
        ``Xtrain.T @ Xtrain`` singular.
    mapping : bool
        When True, the features are expanded with ``mapping_data`` before the
        split.
    mapping_power : int
        Forwarded to ``mapping_data``; ignored when ``mapping`` is not True.

    Returns
    -------
    tuple of np.ndarray
        ``(Xtrain, ytrain, Xval, yval, Xtest, ytest)``.

    Notes
    -----
    The shuffling uses private ``RandomState`` generators seeded with the same
    constants as before (3 for the split, 4 for the zeroed row/column), so the
    produced split is unchanged but the process-wide NumPy random state is no
    longer reseeded as a side effect of calling this function.
    """
    white = pd.read_csv(filename, low_memory=False, sep=';').values
    N = white.shape[0]
    if mapping == True:  # equality test kept on purpose: only True/1 enable the mapping
        maped_X = mapping_data(white[:, :-1], mapping_power)
        # Re-attach the target as the last column of the expanded matrix.
        white = np.insert(maped_X, maped_X.shape[1], white[:, -1], axis=1)

    # Fixed-seed shuffle of the row indices (same stream as np.random.seed(3)).
    split_rng = np.random.RandomState(3)
    ridx = split_rng.permutation(N)
    ntr = int(np.round(N * 0.8))
    nval = int(np.round(N * 0.1))
    ntest = N - ntr - nval

    # Index ranges of the three consecutive slices of the shuffled order.
    train_rows = ridx[0:ntr]
    val_rows = ridx[ntr:ntr + nval]
    test_rows = ridx[ntr + nval:]

    # Prepend the bias column and separate features from the target.
    Xtrain = np.hstack([np.ones([ntr, 1]), white[train_rows, 0:-1]])
    ytrain = white[train_rows, -1]
    Xval = np.hstack([np.ones([nval, 1]), white[val_rows, 0:-1]])
    yval = white[val_rows, -1]
    Xtest = np.hstack([np.ones([ntest, 1]), white[test_rows, 0:-1]])
    ytest = white[test_rows, -1]

    if non_invertible == True:
        n_rows, n_cols = Xtrain.shape
        # Same stream as np.random.seed(4): the row index is drawn first.
        zero_rng = np.random.RandomState(4)
        random_row = zero_rng.randint(n_rows)
        random_col = zero_rng.randint(n_cols)
        Xtrain[:, random_col] = 0
        Xtrain[random_row, :] = 0
    return Xtrain, ytrain, Xval, yval, Xtest, ytest

In [27]:
def mean_absolute_error(w, X, y):
    """Return the mean absolute error of the linear predictions ``X.dot(w)``.

    Parameters
    ----------
    w : weight vector applied to the rows of ``X``.
    X : feature matrix providing ``.dot``.
    y : target values compared against the predictions.
    """
    residuals = X.dot(w) - y
    return abs(residuals).mean()

def linear_regression_noreg(X, y):
    """Fit ordinary least squares weights via the normal equations.

    Solves ``(X^T X) w = X^T y`` for ``w``.  The linear system is solved
    directly instead of forming the explicit inverse of ``X^T X``, which is
    both cheaper and numerically more accurate.

    Parameters
    ----------
    X : np.ndarray
        Feature matrix, one sample per row.
    y : np.ndarray
        Target values, one per row of ``X``.

    Returns
    -------
    np.ndarray
        The weight vector ``w``.

    Raises
    ------
    AssertionError
        If ``X`` or ``y`` is not a NumPy array.
    numpy.linalg.LinAlgError
        If ``X^T X`` is singular.
    """
    assert isinstance(X, np.ndarray) and isinstance(y, np.ndarray)
    gram = X.T.dot(X)
    return alg.solve(gram, X.T.dot(y))

def linear_regression_invertible(X, y):
    """Fit least squares weights, nudging ``X^T X`` until it is invertible.

    While the smallest absolute eigenvalue of the Gram matrix is below
    ``1e-5``, ``0.1 * I`` is added to it.  The weights are then obtained by
    solving ``gram @ w = X^T y``.

    Parameters
    ----------
    X : np.ndarray
        Feature matrix, one sample per row.
    y : np.ndarray
        Target values, one per row of ``X``.

    Returns
    -------
    np.ndarray
        The weight vector ``w``.
    """
    gram = X.T.dot(X)
    ridge_step = 0.1 * np.identity(gram.shape[0])
    # X^T X is symmetric, so the symmetric eigen-solver applies and always
    # yields real eigenvalues.
    while np.abs(alg.eigvalsh(gram)).min() < 0.00001:
        # Out-of-place addition: an in-place "+=" fails when X (and hence the
        # Gram matrix) has an integer dtype, because the float step cannot be
        # cast back into the integer array.
        gram = gram + ridge_step
    # Solve the system directly rather than forming an explicit inverse.
    return alg.solve(gram, X.T.dot(y))

def regularized_linear_regression(X, y, lambd):
    """Fit ridge-regularised least squares weights.

    Solves ``(X^T X + lambd * I) w = X^T y`` for ``w``.

    Parameters
    ----------
    X : np.ndarray
        Feature matrix, one sample per row.
    y : np.ndarray
        Target values, one per row of ``X``.
    lambd : float
        Regularisation strength added to the diagonal of ``X^T X``.

    Returns
    -------
    np.ndarray
        The weight vector ``w``.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the regularised matrix is singular.
    """
    gram = X.T.dot(X)
    # Out-of-place addition: an in-place "+=" fails when X (and hence the Gram
    # matrix) has an integer dtype, because the float penalty cannot be cast
    # back into the integer array.
    penalised = gram + lambd * np.identity(gram.shape[0])
    # Solve the system directly rather than forming an explicit inverse.
    return alg.solve(penalised, X.T.dot(y))

def tune_lambda(Xtrain, ytrain, Xval, yval):
    """Pick the ridge strength with the lowest validation MAE.

    Candidates are the powers of ten ``10**k`` for ``k`` in ``-19 .. 19``.
    For each candidate a model is fitted on the training split with
    ``regularized_linear_regression`` and scored on the validation split with
    ``mean_absolute_error``.  On ties the smallest candidate wins.

    Parameters
    ----------
    Xtrain, ytrain : training features and targets.
    Xval, yval : validation features and targets.

    Returns
    -------
    The candidate with the smallest validation MAE, or ``None`` if no
    candidate produced an error that compares below infinity (for example
    when every error is NaN).
    """
    best_lambda = None
    # Start from +inf so that the first finite error is always accepted; a
    # finite sentinel would silently return None on large-error problems.
    best_abs_error = float("inf")
    for k in range(-19, 20):
        lamb = 10**k
        weight = regularized_linear_regression(Xtrain, ytrain, lamb)
        abs_error = mean_absolute_error(weight, Xval, yval)
        if abs_error < best_abs_error:
            best_lambda = lamb
            best_abs_error = abs_error
    return best_lambda

def mapping_data(X, power):
    """Stack the element-wise powers ``X, X**2, ..., X**power`` side by side.

    Each power is obtained by multiplying the previous one by ``X``.  For
    ``power`` of 1 or less only ``X`` itself is returned (as a new array).
    """
    blocks = [X]
    running = X
    for _ in range(power - 1):
        running = running * X
        blocks.append(running)
    return np.concatenate(blocks, axis=1)


In [32]:
filename = 'winequality-white.csv'
_SECTION_BREAK = '*********************************************************************************************************************'


def _report_mae(w, Xtrain, ytrain, Xval, yval, Xtest, ytest):
    """Print the MAE of ``w`` on the train, val and test splits; return the test MAE."""
    split_mae = None
    for label, X_split, y_split in (("train", Xtrain, ytrain),
                                    ("val", Xval, yval),
                                    ("test", Xtest, ytest)):
        split_mae = mean_absolute_error(w, X_split, y_split)
        print("MAE on %s is ----> %.5f" % (label, split_mae))
    return split_mae


# 1) Plain least squares on the unmodified data.
Xtrain, ytrain, Xval, yval, Xtest, ytest = data_processing_linear_regression(filename, False, False, 0)
w = linear_regression_noreg(Xtrain, ytrain)
print("Dimensionality of the model parameter is ", w.shape, ".", sep="")
print("Model parameter is ", np.array_str(w))
mae = _report_mae(w, Xtrain, ytrain, Xval, yval, Xtest, ytest)

print(_SECTION_BREAK)
# 2) Training matrix made singular; solved with the eigenvalue-based fix-up.
Xtrain, ytrain, Xval, yval, Xtest, ytest = data_processing_linear_regression(filename, True, False, 0)
w = linear_regression_invertible(Xtrain, ytrain)
print("\nDimensionality of the model parameter is ", w.shape, ".", sep="")
print("Model parameter is ", np.array_str(w))
mae = _report_mae(w, Xtrain, ytrain, Xval, yval, Xtest, ytest)

print(_SECTION_BREAK)
# 3) Same singular training matrix; solved with a fixed ridge strength of 0.1.
Xtrain, ytrain, Xval, yval, Xtest, ytest = data_processing_linear_regression(filename, True, False, 0)
w = regularized_linear_regression(Xtrain, ytrain, 0.1)
print("\nDimensionality of the model parameter is ", w.shape, ".", sep="")
print("Model parameter is ", np.array_str(w))
mae = _report_mae(w, Xtrain, ytrain, Xval, yval, Xtest, ytest)

print(_SECTION_BREAK)
# 4) Unmodified data; ridge strength selected on the validation split.
Xtrain, ytrain, Xval, yval, Xtest, ytest = data_processing_linear_regression(filename, False, False, 0)
bestlambd = tune_lambda(Xtrain, ytrain, Xval, yval)
print("\nBest Lambda ====>  " + str(bestlambd))
w = regularized_linear_regression(Xtrain, ytrain, bestlambd)
print("Dimensionality of the model parameter is ", len(w), ".", sep="")
print("Model parameter is ", np.array_str(w))
mae = _report_mae(w, Xtrain, ytrain, Xval, yval, Xtest, ytest)

print(_SECTION_BREAK)
# 5) Features expanded up to the 2nd power; ridge strength selected on validation.
power = 2
Xtrain, ytrain, Xval, yval, Xtest, ytest = data_processing_linear_regression(filename, False, True, power)
bestlambd = tune_lambda(Xtrain, ytrain, Xval, yval)
print("\nBest Lambda ====>  ", bestlambd, sep="")
w = regularized_linear_regression(Xtrain, ytrain, bestlambd)
print("Dimensionality of the model parameter is ", len(w), ".", sep="")
print("Model parameter is ", np.array_str(w))
mae = _report_mae(w, Xtrain, ytrain, Xval, yval, Xtest, ytest)

print(_SECTION_BREAK)
# 6) Sweep the feature-expansion power from 2 to 19, tuning the ridge strength each time.
power = 20
for i in range(2, power):
    Xtrain, ytrain, Xval, yval, Xtest, ytest = data_processing_linear_regression(filename, False, True, i)
    bestlambd = tune_lambda(Xtrain, ytrain, Xval, yval)
    print('\n\nBest lambda is ' + str(bestlambd))
    w = regularized_linear_regression(Xtrain, ytrain, bestlambd)
    print('when power = ' + str(i))
    mae = _report_mae(w, Xtrain, ytrain, Xval, yval, Xtest, ytest)


Dimensionality of the model parameter is (12,).
Model parameter is  [ 2.03721116e+02  1.09955585e-01 -1.93164831e+00 -4.90845229e-02
  1.02194195e-01 -5.45232536e-02  4.00189587e-03  1.52537226e-04
 -2.04673020e+02  9.04608185e-01  6.41578531e-01  1.32320100e-01]
MAE on train is ----> 0.58020
MAE on val is ----> 0.59410
MAE on test is ----> 0.56079
*********************************************************************************************************************

Dimensionality of the model parameter is (12,).
Model parameter is  [ 1.72490981 -0.04898016 -2.05742338 -0.12068529  0.02838304 -0.81308859
  0.00425276  0.          0.00675103  0.20653332  0.35756603  0.37653797]
MAE on train is ----> 0.58509
MAE on val is ----> 0.59939
MAE on test is ----> 0.55777
*********************************************************************************************************************

Dimensionality of the model parameter is (12,).
Model parameter is  [ 1.72490981 -0.04898016 -2.05742338 -0.1