# Linear Regression Extensions - :


In [31]:
import numpy as np
import sklearn
from sklearn.datasets import load_boston
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso
from sklearn.model_selection import KFold

In [32]:
boston_data = load_boston()
X = boston_data.data
y = boston_data.target
y = y.reshape(X.shape[0], 1)


#normalize data
X = X / (np.linalg.norm(X,axis = 0))

In [33]:
def coefficient_linear_regression(X,Y):
    w = np.dot(np.linalg.pinv(X),Y)
    return w

# Implementing Lasso Regression  using coordinate descent algorithm :

In [34]:
# return w(j) for each w :

def coefficient(c,alpha):
    if c < (-alpha):
        wj = (c + alpha)
    elif c > alpha :
        wj = (c - alpha)
    else:
        wj = 0
    return wj


def lasso_coefficient(X,y,w,alpha,num_of_iterations):
    training_rows,features = X.shape
    for i in range(num_of_iterations):
        for j in range(features):
            x_j = X[:,j].reshape(-1,1)
            predicted_y = np.dot(X,w)
            c_j = x_j.T @ (y - predicted_y  + w[j]*x_j)
            w[j]=coefficient(c_j,alpha)
            if np.isinf(w[j]):
                w[j]=0
            else:
                w[j]= float(w[j])
    return w    
 


In [35]:
def train(X,y):
    m,n = X.shape
    k=5
    num_of_iterations=500
    alpha = np.logspace(0,4,300)/10
    lasso_costs=[]
    for al in alpha:
        train_error,test_error = cross_validation(X,y,k,al,num_of_iterations)
        lasso_costs = np.append(lasso_costs,test_error)
    
    min_arg = np.argmin(np.unique(X,return_counts=True)[1])
    min_test_error = np.argmin(lasso_costs)
    index=np.unravel_index(min_arg,lasso_costs.shape)
    #print("index with minimum value:",index)
    
    #best alpha =
    min_error_alpha = alpha[index]
    print("best alpha is:",min_error_alpha)
    return min_error_alpha
    

def predict(X,w):
    predicted=np.dot(X,w)
    return predicted


def evaluate_accuracy(y,predicted_y):
    test_error=np.sum((predicted_y - y)**2) / (y.shape[0])
    return test_error


def cross_validation(X,y,k,alpha,num_of_iterations):
    kf = KFold(n_splits=k, random_state=21, shuffle=True)
    total_training_error=0
    total_test_error = 0
    for i,j in kf.split(X):
        training_error=0
        test_error =0
        X_train, X_test = X[i],X[j]
        Y_train, Y_test = y[i],y[j]
        
        Y_train_mean = np.mean(Y_train)
        Y_train = Y_train - Y_train_mean
        Y_test = Y_test - Y_train_mean
        
        m,n = X_train.shape
        w = np.ones((n,1))
        w = lasso_coefficient(X_train,Y_train,w,alpha,num_of_iterations) 
        training_error = evaluate_accuracy(Y_train,predict(X_train,w))
        test_error = evaluate_accuracy(Y_test,predict(X_test,w))
        
        total_training_error +=training_error
        total_test_error += test_error
    
        
    average_training_error = total_training_error/k
    average_test_error = total_test_error/k
    return average_training_error,average_test_error
    
    

In [36]:
#lasso regularization on linear regression 

optimal_alpha = train(X,y)
train_error,test_error = cross_validation(X,y,5,optimal_alpha,500)
X_train,X_test,Y_train,Y_test = train_test_split(X,y,test_size = 0.3, random_state =31)
m,n = X_train.shape
w = np.ones((n,1))
#w = np.dot(np.linalg.pinv(X_train),Y_train)
w_lasso =lasso_coefficient(X_train,Y_train,w,optimal_alpha,500) 
predicted_y = predict(X_test,w_lasso)
print("Mean squared error:",evaluate_accuracy(Y_test,predicted_y)) 
print("average train error is:",train_error)
print("average test error:",test_error)


best alpha is: 0.10312831606721848
Mean squared error: 51.368725995993216
average train error is: 42.96452063067896
average test error: 44.90887397941972


In [37]:
# using simple linear regression : 

boston_data = load_boston()
X_linear = boston_data.data
y_linear = boston_data.target
y_linear = y_linear.reshape(X.shape[0], 1)
X_train_linear,X_test_linear,Y_train_linear,Y_test_linear = train_test_split(X_linear,y_linear,test_size = 0.3, random_state = 31)
w_linear = coefficient_linear_regression(X_train_linear,Y_train_linear)
predicted_linear = predict(X_test_linear,w_linear)
print("linear regression mean squared error:",evaluate_accuracy(Y_test_linear,predicted_linear))
print("mean square error for linear regression on dataset1 is:",metrics.mean_squared_error(Y_test_linear,predicted_linear))

linear regression mean squared error: 24.03313968286438
mean square error for linear regression on dataset1 is: 24.03313968286438


In [38]:
#using sklearn lasso regularization
lasso = Lasso(alpha=0.1)
lasso.fit(X_train_linear,Y_train_linear)
train_score=lasso.score(X_train_linear,Y_train_linear)
test_score=lasso.score(X_test_linear,Y_test_linear)
coeff_used = np.sum(lasso.coef_!=0)
predict_l=lasso.predict(X_test_linear)
print("mean square error is:",metrics.mean_squared_error(Y_test_linear,predict_l))
print("training score:", train_score) 
print("test score: ", test_score)
print("number of features used: ", coeff_used)

mean square error is: 21.349196618370883
training score: 0.7131258331595984
test score:  0.7404464175157098
number of features used:  12


# Dataset 2 : Diabetes dataset

In [39]:
diabetes_data = load_diabetes()
X_new = diabetes_data.data
y_new = diabetes_data.target
y_new = y_new.reshape(X_new.shape[0], 1)




#normalize data
X_new = X_new / (np.linalg.norm(X_new,axis = 0))



In [40]:
#running lasso implementation
#print("X_new:",X_new.shape)
#print("Y_new:",y_new.shape)
optimal_alpha = train(X_new,y_new)
train_error_new,test_error_new = cross_validation(X_new,y_new,5,optimal_alpha,500)
X_train_new,X_test_new,Y_train_new,Y_test_new = train_test_split(X_new,y_new,test_size = 0.3, random_state =31)
m,n = X_train_new.shape
w_new = np.ones((n,1))
w_lasso_new =lasso_coefficient(X_train_new,Y_train_new,w_new,optimal_alpha,500) 
#print(w_lasso)
predicted_y_new = predict(X_test_new,w_lasso_new)
print("Mean squared error:",evaluate_accuracy(Y_test_new,predicted_y_new)) 
print("average train error is:",train_error)
print("average test error:",test_error)
print("mean square error using sklearn:",metrics.mean_squared_error(Y_test_new,predicted_y_new))

best alpha is: 0.1
Mean squared error: 27821.895483517885
average train error is: 42.96452063067896
average test error: 44.90887397941972
mean square error using sklearn: 27821.895483517885


In [41]:
#running linear regression
diabetes_data = load_diabetes()
X_new_linear = diabetes_data.data
y_new_linear = diabetes_data.target
y_new_linear = y_new_linear.reshape(X_new_linear.shape[0], 1)

#splitting dataset 
X_train_new_linear,X_test_new_linear,Y_train_new_linear,Y_test_new_linear = train_test_split(X_new_linear,y_new_linear,test_size = 0.3, random_state = 31)
w_linear_new = coefficient_linear_regression(X_train_new_linear,Y_train_new_linear)
predicted_linear_new = predict(X_test_new_linear,w_linear_new)
print("linear regression mean squared error:",evaluate_accuracy(Y_test_new,predicted_linear_new))
print("mean square error is:",metrics.mean_squared_error(Y_test_new,predicted_linear_new))




linear regression mean squared error: 28722.37086585841
mean square error is: 28722.37086585841


In [42]:
#running sklearn lasso
lasso = Lasso(alpha=0.1)
lasso.fit(X_train_new_linear,Y_train_new_linear)
train_score=lasso.score(X_train_new_linear,Y_train_new_linear)
test_score=lasso.score(X_test_new_linear,Y_test_new_linear)
coeff_used = np.sum(lasso.coef_!=0)
predict=lasso.predict(X_test_new_linear)
print("training score:", train_score) 
print("test score: ", test_score)
print("number of features used: ", coeff_used)
print("mean square error is:",metrics.mean_squared_error(Y_test_new_linear,predict))


training score: 0.47413308018390576
test score:  0.5665238529385686
number of features used:  7
mean square error is: 2890.233246630248
