# Load the dataset

In [2]:
from sklearn.datasets import fetch_california_housing

In [4]:
# Fetch the California housing dataset with default arguments. The returned
# bundle exposes the feature matrix ('data'), the target vector ('target'),
# the column names ('feature_names' / 'target_names') and a text description
# ('DESCR'); 'frame' is None because no DataFrame was requested.
housing = fetch_california_housing()
# Display the raw bundle to inspect its contents.
housing

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

# Convert into a DataFrame for better readability

In [6]:
import pandas as pd

In [10]:
# Keep the raw feature matrix and target vector under x / y.
x, y = housing.data, housing.target

# Assemble a labelled table: one named column per feature, plus the target
# appended as the last column.
df = pd.DataFrame(data=x, columns=housing.feature_names).assign(MedHouseVal=y)
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [17]:
# Summarise column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MedInc       20640 non-null  float64
 1   HouseAge     20640 non-null  float64
 2   AveRooms     20640 non-null  float64
 3   AveBedrms    20640 non-null  float64
 4   Population   20640 non-null  float64
 5   AveOccup     20640 non-null  float64
 6   Latitude     20640 non-null  float64
 7   Longitude    20640 non-null  float64
 8   MedHouseVal  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


# Split into training and test sets

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=123,
)

# Normalize the features using StandardScaler

In [18]:
from sklearn.preprocessing import StandardScaler

In [19]:
# Learn the scaling statistics from the training split only, so that no
# information from the test split leaks into the preprocessing step.
scaler = StandardScaler().fit(x_train)

# Apply the same fitted transformation to both splits.
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Train a Linear Regression Model


In [20]:
from sklearn.linear_model import LinearRegression

In [21]:
# Fit an ordinary least-squares model on the standardized training features.
# fit() returns the estimator itself, so construction and fitting can be chained.
lr_model = LinearRegression().fit(X=x_train_scaled, y=y_train)
lr_model

# Review model coefficients to understand which features most affect the prediction

In [22]:
# Fitted weights of the linear model. Because the model was trained on
# standardized features, the magnitudes are comparable across features.
coefficients = lr_model.coef_

In [29]:
# Create a DataFrame to display feature names and their corresponding coefficients.
# The feature columns are selected by name (the same names, in the same order,
# as the matrix the model was trained on) rather than with a hard-coded
# positional slice, so the name/coefficient pairing stays correct even if
# columns are later added to or reordered in `df`.
X = df[housing.feature_names]
feature_importance = pd.DataFrame({'Feature': X.columns, 'Coefficient': coefficients})
feature_importance

Unnamed: 0,Feature,Coefficient
0,MedInc,0.83601
1,HouseAge,0.115221
2,AveRooms,-0.281901
3,AveBedrms,0.31829
4,Population,-0.007404
5,AveOccup,-0.041683
6,Latitude,-0.89015
7,Longitude,-0.855549


In [31]:
# Rank the features by coefficient magnitude: the sign only gives the direction
# of the effect, so order by absolute value, most influential first.
abs_coef = feature_importance['Coefficient'].abs()
feature_importance = (
    feature_importance
    .assign(**{'Absolute Coefficient': abs_coef})
    .sort_values(by='Absolute Coefficient', ascending=False)
)
feature_importance

Unnamed: 0,Feature,Coefficient,Absolute Coefficient
6,Latitude,-0.89015,0.89015
7,Longitude,-0.855549,0.855549
0,MedInc,0.83601,0.83601
3,AveBedrms,0.31829,0.31829
2,AveRooms,-0.281901,0.281901
1,HouseAge,0.115221,0.115221
5,AveOccup,-0.041683,0.041683
4,Population,-0.007404,0.007404


# Make Predictions & Evaluate the Model

### Make predictions on the test set

In [32]:
# Predict on the held-out split, using the test features scaled with the
# statistics learned from the training data.
y_pred = lr_model.predict(X=x_test_scaled)

### Evaluate the model performance

In [35]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np


In [34]:
# MAE: the mean of the absolute differences between predicted and actual values.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mae

0.5255457157103737

In [36]:
# RMSE: the square root of the mean squared difference between predicted and
# actual values. Squaring before averaging gives more weight to larger errors.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
rmse

np.float64(0.7197380534040614)

In [38]:
# R² (optional): the proportion of the variance in the target that the model
# explains from the input features.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
r2

0.6104546894797875