In [1]:
from sklearn.datasets import fetch_california_housing
import pandas as pd

In [2]:
california_housing = fetch_california_housing(as_frame=True)
california_housing

{'data':        MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
 0      8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
 1      8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
 2      7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
 3      5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
 4      3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   
 ...       ...       ...       ...        ...         ...       ...       ...   
 20635  1.5603      25.0  5.045455   1.133333       845.0  2.560606     39.48   
 20636  2.5568      18.0  6.114035   1.315789       356.0  3.122807     39.49   
 20637  1.7000      17.0  5.205543   1.120092      1007.0  2.325635     39.43   
 20638  1.8672      18.0  5.329513   1.171920       741.0  2.123209     39.43   
 20639  2.3886      16.0  5.254717   1.162264      1387.0  2.616981     39.37   
 
        Longitude 

In [5]:
california_housing.data

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [6]:
X = california_housing.data
X.shape

(20640, 8)

In [7]:
y = california_housing.target
y

0        4.526
1        3.585
2        3.521
3        3.413
4        3.422
         ...  
20635    0.781
20636    0.771
20637    0.923
20638    0.847
20639    0.894
Name: MedHouseVal, Length: 20640, dtype: float64

In [8]:
pd.concat([X,y], axis=1).corr()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
MedInc,1.0,-0.119034,0.326895,-0.06204,0.004834,0.018766,-0.079809,-0.015176,0.688075
HouseAge,-0.119034,1.0,-0.153277,-0.077747,-0.296244,0.013191,0.011173,-0.108197,0.105623
AveRooms,0.326895,-0.153277,1.0,0.847621,-0.072213,-0.004852,0.106389,-0.02754,0.151948
AveBedrms,-0.06204,-0.077747,0.847621,1.0,-0.066197,-0.006181,0.069721,0.013344,-0.046701
Population,0.004834,-0.296244,-0.072213,-0.066197,1.0,0.069863,-0.108785,0.099773,-0.02465
AveOccup,0.018766,0.013191,-0.004852,-0.006181,0.069863,1.0,0.002366,0.002476,-0.023737
Latitude,-0.079809,0.011173,0.106389,0.069721,-0.108785,0.002366,1.0,-0.924664,-0.14416
Longitude,-0.015176,-0.108197,-0.02754,0.013344,0.099773,0.002476,-0.924664,1.0,-0.045967
MedHouseVal,0.688075,0.105623,0.151948,-0.046701,-0.02465,-0.023737,-0.14416,-0.045967,1.0


In [9]:
california_housing.feature_names

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

In [10]:
print(california_housing.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

:Number of Instances: 20640

:Number of Attributes: 8 numeric, predictive attributes and the target

:Attribute Information:
    - MedInc        median income in block group
    - HouseAge      median house age in block group
    - AveRooms      average number of rooms per household
    - AveBedrms     average number of bedrooms per household
    - Population    block group population
    - AveOccup      average number of household members
    - Latitude      block group latitude
    - Longitude     block group longitude

:Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived from the 1990 U.S. census, using one row per ce

In [11]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

# Additional Step: Scaling the data  :STANDARD SCALER

In [12]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [13]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,r2_score
lr = LinearRegression()
lr.fit(X_train,y_train)

y_pred_train = lr.predict(X_train)
print(f"Train Score: {r2_score(y_train,y_pred_train)}")

y_pred_test = lr.predict(X_test)
print(f"Test Score: {r2_score(y_test,y_pred_test)}")

Train Score: 0.6088968118672871
Test Score: 0.5951292134599004


In [16]:
# Ridge Regression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# MODEL
ridge = Ridge()
# MODEL TRAIN
ridge.fit(X_train,y_train)
# MODEL pREDICTION
y_train_pred  = ridge.predict(X_train)
print(f"Ridge Score on train: {r2_score(y_train,y_train_pred)})")

y_pred_test = ridge.predict(X_test)
print(f"Ridge Score on train: {r2_score(y_test,y_pred_test)})")

Ridge Score on train: 0.6088967440330115)
Ridge Score on train: 0.5951233971995414)


# Grid Search 

In [17]:
from sklearn.model_selection import GridSearchCV
# list of hyperparameters
param_grid = {'alpha':[.0001,0.001,0.01,0.1,1,10,100,1000]}

# model rige with gridsearch
ridge_cv = GridSearchCV(ridge,param_grid=param_grid, cv=2)

# model train
ridge_cv.fit(X_train,y_train)

# model predict on training data
y_pred_train =  ridge_cv.predict(X_train)
# model error on training data
print(mean_absolute_error(y_train,y_pred_train))

# model error on training data
print(r2_score(y_train,y_pred_train))


# model best hyper parameter on training data
print(ridge_cv.best_estimator_)

0.530829245146849
0.6088901332969191
Ridge(alpha=10)


In [18]:
y_pred_test =  ridge_cv.predict(X_test)
# model error on training data
print(mean_absolute_error(y_test,y_pred_test))
# model error on training data
print(mean_squared_error(y_test,y_pred_test))
# model performance oin train data 
print(r2_score(y_test,y_pred_test))
# model best hyper parameter on training data
print(ridge_cv.best_estimator_)

0.5353631944726613
0.5280170060381147
0.5950649787804927
Ridge(alpha=10)


In [19]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
lasso = Lasso()
lasso.fit(X_train,y_train)
y_train_pred  = lasso.predict(X_train)
print(mean_absolute_error(y_train,y_train_pred))
print(mean_squared_error(y_train,y_train_pred))
print(r2_score(y_train, y_train_pred))

0.9137765045246339
1.3383715004003633
0.0


In [20]:
y_pred_test =  lasso.predict(X_test)
# model error on training data
print(mean_absolute_error(y_test,y_pred_test))
# model error on training data
print(mean_squared_error(y_test,y_pred_test))
# model performance oin train data 
print(r2_score(y_test,y_pred_test))
# model best hyper parameter on training data


0.9071315345587282
1.3043431479307854
-0.00029774466534027155


# Applying GirdSearch and Corss Validation for best alpha value

In [21]:
param_grid = {'alpha':[0.000000000001,.0001,0.01,0.1,1,10,100,1000]
             }

In [22]:
from sklearn.model_selection import GridSearchCV

In [23]:
lasso_cv = GridSearchCV(lasso,param_grid=param_grid, cv=3)

In [24]:
lasso_cv.fit(X_train,y_train)

In [25]:
y_pred_train =  lasso_cv.predict(X_train)

In [26]:
mean_absolute_error(y_train,y_pred_train)

0.5308848550062945

In [27]:
mean_squared_error(y_train,y_pred_train)

0.5234419276433152

In [28]:
r2_score(y_train,y_pred_train)

0.6088963882698252

In [29]:
lasso_cv.best_estimator_

In [30]:
y_pred_test =  lasso_cv.predict(X_test)

In [31]:
r2_score(y_test,y_pred_test)

0.5950955967919298

In [32]:
good_lasso = Lasso(alpha=0.0001)

In [33]:
good_lasso.fit(X_train,y_train)

In [34]:
r2_score(y_test,y_pred_test)

0.5950955967919298