In [3]:
from sklearn.datasets import fetch_california_housing
import pandas as pd

# Load the housing dataset
housing = fetch_california_housing()

In [4]:
# creating data frame
X = pd.DataFrame(housing.data, columns=housing.feature_names) 

# Target: median housing prices
y = pd.Series(housing.target, name='med_house_value')

In [None]:
# First 5 rows of the feature dataset
print(X.head())

# Print the feature names
print("\nFeature names:")
print(y.head())

# Check for missing values
print("\nMissing values in the dataset:")
print(X.isnull().sum())


   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  
0    -122.23  
1    -122.22  
2    -122.24  
3    -122.25  
4    -122.25  

Feature names:
0    4.526
1    3.585
2    3.521
3    3.413
4    3.422
Name: med_house_value, dtype: float64

Missing values in the dataset:
MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
dtype: int64
             MedInc      HouseAge      AveRooms     AveBedrms    Population  \
count  20640.000000  20640.000000  20640.000000  20640.000000  20640.000000   
mean       3.

In [14]:
# Generate summary statistics
print(X.describe())
print(y.describe())

             MedInc      HouseAge      AveRooms     AveBedrms    Population  \
count  20640.000000  20640.000000  20640.000000  20640.000000  20640.000000   
mean       3.870671     28.639486      5.429000      1.096675   1425.476744   
std        1.899822     12.585558      2.474173      0.473911   1132.462122   
min        0.499900      1.000000      0.846154      0.333333      3.000000   
25%        2.563400     18.000000      4.440716      1.006079    787.000000   
50%        3.534800     29.000000      5.229129      1.048780   1166.000000   
75%        4.743250     37.000000      6.052381      1.099526   1725.000000   
max       15.000100     52.000000    141.909091     34.066667  35682.000000   

           AveOccup      Latitude     Longitude  
count  20640.000000  20640.000000  20640.000000  
mean       3.070655     35.631861   -119.569704  
std       10.386050      2.135952      2.003532  
min        0.692308     32.540000   -124.350000  
25%        2.429741     33.930000   -1

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
# Split the raw data (80% training, 20% testing)
X_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Initialize and train the linear regression model on unscaled data
lin_reg = LinearRegression()
lin_reg.fit(X_train_raw, y_train)

# Make predictions on the test set
y_pred = lin_reg.predict(X_test_raw)

In [20]:
from sklearn.metrics import mean_squared_error, root_mean_squared_error, r2_score

# Evaluate model performance
mse = mean_squared_error(y_test, y_pred)
rmse  = root_mean_squared_error(y_test, y_pred)
r2  = r2_score(y_test, y_pred)

# print values as floats w/ 2 decimal places
print(f"Mean Squared Error: {mse:.2f}")
print(f"Root Squared Error: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")


Mean Squared Error: 2.17
Root Squared Error: 1.47
R² Score: -0.59


What does the R² score tell us about model performance?

Which features seem to have the strongest impact on predictions based on the model’s coefficients?

How well do the predicted values match the actual values?

In [None]:
from sklearn.preprocessing import StandardScaler
