In [1]:
import pandas as pd

from sklearn.datasets import fetch_california_housing
# Load the California housing dataset through scikit-learn's fetcher.
california = fetch_california_housing()

# Assemble a single frame: one column per dataset feature, plus the
# regression target stored under the name 'Value'.
df = pd.DataFrame(
    data=california.data,
    columns=california.feature_names,
).assign(Value=california.target)

# Separate the target column from the predictor columns for modelling.
target = df['Value']
features = df.drop(columns=['Value'])

In [11]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# Hold out a test partition; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    random_state=3000,
)

# Fit a Ridge (L2-penalised) linear model with penalty strength alpha=100.
model = Ridge(alpha=100)
model.fit(X_train, y_train)

# Score the fit on both partitions so over-/under-fitting is visible.
train_r2 = r2_score(y_train, model.predict(X_train))
test_r2 = r2_score(y_test, model.predict(X_test))

print('R-squared value for training set: ', train_r2)
print('R-squared value for testing set: ', test_r2)

R-squared value for training set:  0.6090669500850712
R-squared value for testing set:  0.5970353837773708


In [12]:
# Pair every feature name with its fitted coefficient by position and print
# them with the name right-aligned in a 10-character column.
for name, coef in zip(california.feature_names, model.coef_):
    print(f'{name : >10}: {coef}')

    MedInc: 0.43384961131176747
  HouseAge: 0.00978641287563271
  AveRooms: -0.09734535567862976
 AveBedrms: 0.5884811557761856
Population: -6.216730356535916e-06
  AveOccup: -0.0038505220101157467
  Latitude: -0.4172931846834424
 Longitude: -0.42620059872132976


In [3]:
'''
LASSO REGRESSION
- An alternative to Ridge for regularizing linear regression
- This model restricts coefficients to reduce model complexity

L2 Regularization
- In L2 regularization the penalty is applied to the sum of the squared coefficients (Ridge uses this)

L1 Regularization
- This type of regularization penalizes the sum of the absolute values of the coefficients (Lasso uses this)

Higher Alpha Values
- More regularization and more restricted coefficients
- Some coefficients will be exactly 0
- Simpler model
- Easier to interpret
'''

'\nLASSO REGRESSION\n- An alternative to Ridge for regularizing linear regression\n- This model restricts coefficients to reduce model complexity\n\nL2 Regularization\n- In L2 regularization the penalty is applied to the sum of the squared coefficients (Ridge uses this)\n\nL1 Regularization\n- This type of regularization penalizes the sum of the absolute values of the coefficients (Lasso uses this)\n\nHigher Alpha Values\n- More regularization and more restricted coefficients\n- Some coefficients will be exactly 0\n- Simpler model\n- Easier to interpret\n'

In [4]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# Hold out a test partition; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    random_state=3000,
)

# Fit a Lasso (L1-penalised) linear model with a small penalty, alpha=0.001.
model = Lasso(alpha=0.001)
model.fit(X_train, y_train)

# Score the fit on both partitions.
train_r2 = r2_score(y_train, model.predict(X_train))
test_r2 = r2_score(y_test, model.predict(X_test))

print('R-squared value for training set: ', train_r2)
print('R-squared value for testing set: ', test_r2)

R-squared value for training set:  0.6094736555118498
R-squared value for testing set:  0.596072604394458


In [5]:
# Display the raw coefficient array of the most recently fitted `model`
# (the Lasso fit from the preceding cell when cells are run in order).
model.coef_

array([ 4.42860957e-01,  9.65794952e-03, -1.14393659e-01,  6.84914631e-01,
       -6.61544631e-06, -3.86301445e-03, -4.18713354e-01, -4.28877681e-01])

In [6]:
# Walk the feature names and the fitted coefficients in lockstep and print
# each pair, with the name right-aligned in a 10-character column.
for name, coef in zip(california.feature_names, model.coef_):
    print(f'{name : >10}: {coef}')

    MedInc: 0.44286095735147735
  HouseAge: 0.009657949516225154
  AveRooms: -0.11439365870528742
 AveBedrms: 0.6849146313016123
Population: -6.61544631118968e-06
  AveOccup: -0.003863014448215848
  Latitude: -0.4187133539756105
 Longitude: -0.4288776805653922


In [7]:
# Show the dataset's feature names in order; the coefficient printouts in this
# notebook match model.coef_[i] to these names by position.
california.feature_names

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

In [8]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import numpy as np

# Hold out a test partition; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    random_state=3000,
)

# Fit a Lasso (L1-penalised) linear model with a small penalty, alpha=0.001.
model = Lasso(alpha=0.001)
model.fit(X_train, y_train)

# Score the fit on both partitions.
train_r2 = r2_score(y_train, model.predict(X_train))
test_r2 = r2_score(y_test, model.predict(X_test))

# Count the coefficients that were not driven to exactly zero by the penalty.
n_features_used = np.count_nonzero(model.coef_)

print('R-squared value for training set: ', train_r2)
print('R-squared value for testing set: ', test_r2)
print('Number of features used:', n_features_used)

R-squared value for training set:  0.6094736555118498
R-squared value for testing set:  0.596072604394458
Number of features used: 8
