In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import scale 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [2]:
# Read the housing data and keep only the rows without missing values.
# The path is relative, so the kernel's working directory must be the folder
# that contains house_data.csv.
df = (
    pd.read_csv('house_data.csv')
    .dropna()
)

# Summary statistics of the cleaned frame; shown as the cell output.
df.describe()

FileNotFoundError: [Errno 2] No such file or directory: 'house_data.csv'

In [None]:
# Response (dependent) variable: the `price` column.
y = df['price']

# Feature matrix: every other column except the response and the
# latitude/longitude coordinates, cast to float64.
X = df.drop(columns=['price', 'lat', 'long']).astype('float64')

# Column names, non-null counts and dtypes of the feature matrix.
X.info()

In [None]:
# Ridge() exposes the penalty strength (lambda in the usual notation) as `alpha`.
# Build a grid of 100 alpha values, log-spaced from 0.5 * 10**10 down to
# 0.5 * 10**-2, so the fits range from a very heavily penalised model
# (essentially intercept only) to one that is close to least squares.
alphas = 0.5 * np.logspace(10, -2, 100)
alphas

In [None]:
# Fit one ridge model per alpha on the full data set and collect the
# coefficient vectors. `coefs` ends up as a list with one entry per alpha
# (100 of them), each holding one coefficient per column of X, so
# np.shape(coefs) is (number of alphas, number of predictors).
# normalize=True asks the estimator to rescale the predictors before fitting
# so that they are penalised on a common scale.
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in
# 1.2 -- confirm the installed version, or move to a StandardScaler pipeline.
coefs = []
ridge = Ridge(normalize=True)

for a in alphas:
    # set_params() and fit() both return the estimator, so they can be chained.
    coefs.append(ridge.set_params(alpha=a).fit(X, y).coef_)

np.shape(coefs)

In [None]:
# Ridge coefficient paths: one line per predictor, alpha on a log-scaled x axis.
# The weights are expected to shrink (in l2 norm) as alpha grows.
ax = plt.gca()
ax.plot(alphas, coefs)
ax.set_xscale('log')
ax.axis('tight')
ax.set_xlabel('alpha')
ax.set_ylabel('weights')

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, test_size=0.2
)

In [None]:
# Ridge regression with alpha = 1: fit on the training split, then score the
# held-out test split.
ridge2 = Ridge(alpha=1, normalize=True).fit(X_train, y_train)
pred2 = ridge2.predict(X_test)

print(pd.Series(data=ridge2.coef_, index=X.columns))    # fitted coefficients
print(mean_squared_error(y_true=y_test, y_pred=pred2))  # test MSE
print(r2_score(y_true=y_test, y_pred=pred2))            # test R-squared

In [None]:
# Same experiment with a huge penalty, alpha = 10**10, to compare against the
# alpha = 1 fit. With a penalty this large the coefficients are expected to be
# shrunk towards zero.
ridge3 = Ridge(alpha=10**10, normalize=True).fit(X_train, y_train)
pred3 = ridge3.predict(X_test)

print(pd.Series(data=ridge3.coef_, index=X.columns))    # fitted coefficients
print(mean_squared_error(y_true=y_test, y_pred=pred3))  # test MSE
print(r2_score(y_true=y_test, y_pred=pred3))            # test R-squared

In [None]:
# Baseline for comparison: alpha = 0 removes the penalty, so this fit is
# ordinary least squares. Compare its test MSE / R-squared with the penalised
# fits to see whether ridge regression brings any benefit.
# Note: this rebinds `ridge2`, replacing the alpha = 1 model fitted earlier.
ridge2 = Ridge(alpha=0, normalize=True).fit(X_train, y_train)
pred = ridge2.predict(X_test)

print(pd.Series(data=ridge2.coef_, index=X.columns))   # fitted coefficients
print(mean_squared_error(y_true=y_test, y_pred=pred))  # test MSE
print(r2_score(y_true=y_test, y_pred=pred))            # test R-squared

In [None]:
# Choose alpha by cross-validation instead of picking it by hand. RidgeCV
# scores every candidate in `alphas`; by default it uses generalized
# cross-validation (an efficient form of leave-one-out), and the `cv`
# argument selects a different scheme.
ridgecv = RidgeCV(
    alphas=alphas,
    scoring='neg_mean_squared_error',
    normalize=True,
)
ridgecv.fit(X_train, y_train)

# The alpha with the best cross-validated score.
ridgecv.alpha_

In [None]:
# Refit a plain Ridge model at the alpha selected by cross-validation
# (ridgecv.alpha_) and score it on the held-out test split.
ridge4 = Ridge(alpha = ridgecv.alpha_, normalize = True)
ridge4.fit(X_train, y_train)
pred4 = ridge4.predict(X_test)           # predict once, reuse for both metrics

# Only the last expression of a cell is displayed, so a bare
# mean_squared_error(...) call placed above r2_score(...) is silently
# discarded. Print both metrics explicitly, as the earlier ridge cells do.
print(mean_squared_error(y_test, pred4))          # Test MSE
print(r2_score(y_test, pred4))                    # Test r-squared

In [None]:
# Refit the cross-validated ridge model on the complete data set (fit()
# returns the estimator itself) and list the coefficient of every feature.
pd.Series(ridge4.fit(X, y).coef_, index=X.columns)

In [None]:
# Lasso coefficient paths, built the same way as the ridge paths: one fit per
# alpha, collecting the coefficient vector each time. max_iter is raised to
# 10000 to give the iterative solver more room to converge.
# Note: this rebinds `coefs`, replacing the ridge coefficients collected earlier.
lasso = Lasso(max_iter = 10000, normalize = True)
coefs = []

for a in alphas:
    lasso.set_params(alpha=a)
    # Predictors are standardised with scale() for this path plot.
    lasso.fit(scale(X_train), y_train)
    coefs.append(lasso.coef_)

# coefs[i] was fitted with alphas[i], so the x values must be `alphas` itself.
# Plotting `alphas*2` shifted every curve away from the alpha it belongs to
# while the axis was still labelled 'alpha'.
fig, ax = plt.subplots()   # fresh axes instead of whatever figure is current
ax.plot(alphas, coefs)
ax.set_xscale('log')
ax.axis('tight')
ax.set_xlabel('alpha')
ax.set_ylabel('weights');

In [None]:
# In the lasso coefficient plot some coefficients become exactly zero,
# depending on the tuning parameter. Here alpha is chosen by 10-fold
# cross-validation on the training split (alphas=None lets LassoCV build its
# own grid), the model is refitted at that alpha, and the test error is computed.
lassocv = LassoCV(alphas = None, cv = 10, max_iter = 100000, normalize = True)
lassocv.fit(X_train, y_train)

# Reuse the `lasso` estimator created for the coefficient paths, now at the
# cross-validated alpha and fitted on the training split.
lasso.set_params(alpha=lassocv.alpha_)
lasso.fit(X_train, y_train)
lasso_pred = lasso.predict(X_test)       # predict once, reuse for both metrics

# Only the last expression of a cell is displayed, so a bare
# mean_squared_error(...) call placed above r2_score(...) is silently
# discarded. Print both metrics explicitly.
print(mean_squared_error(y_test, lasso_pred))     # Test MSE
print(r2_score(y_test, lasso_pred))               # Test r-squared


In [None]:
# Lasso coefficients of the most recent fit, labelled by feature name.
lasso_coefs = pd.Series(data=lasso.coef_, index=X.columns)
print(lasso_coefs)

In [None]:
# Penalty strength selected by the 10-fold cross-validation in LassoCV.
lassocv.alpha_