In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import seaborn as sns
from sklearn import datasets
from sklearn.datasets import fetch_california_housing
import pandas as pd

In [2]:
housing = fetch_california_housing()

In [3]:
housing.target_names

['MedHouseVal']

In [4]:
print(housing.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

In [5]:
df = pd.DataFrame(housing.data,columns=housing.feature_names)

In [7]:
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [8]:
df.shape

(20640, 8)

In [9]:
X = df.loc[:,[col for col in df.columns if col!=housing.target_names[0]]]
y = housing.target

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=123,test_size=0.2)

In [11]:
X_train.shape

(16512, 8)

In [12]:
X_test.shape

(4128, 8)

In [13]:
y_train.shape

(16512,)

In [14]:
y_test.shape

(4128,)

In [15]:
from statsmodels.api import OLS,GLM
import statsmodels.api as sm

In [16]:
model = OLS(y_train,sm.add_constant(X_train)).fit()


In [17]:
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.605
Model:                            OLS   Adj. R-squared:                  0.605
Method:                 Least Squares   F-statistic:                     3160.
Date:                Fri, 12 Jan 2024   Prob (F-statistic):               0.00
Time:                        00:27:29   Log-Likelihood:                -18126.
No. Observations:               16512   AIC:                         3.627e+04
Df Residuals:                   16503   BIC:                         3.634e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        -36.2556      0.740    -49.012      0.0

In [19]:
y_train_pred = model.predict(sm.add_constant(X_train))

In [20]:
import pickle

with open('linear_regression_model.pkl', 'wb') as file:
    pickle.dump(model, file)

In [21]:
with open('linear_regression_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

In [23]:
loaded_model.predict(sm.add_constant(X_train))

9950     2.279931
3547     2.798978
4448     1.521155
6984     2.468780
4432     1.738717
           ...   
7763     2.122214
15377    2.052562
17730    2.745928
15725    2.584802
19966    0.949276
Length: 16512, dtype: float64

In [24]:
X_train.shape

(16512, 8)

In [25]:
X_test.shape

(4128, 8)

In [26]:
X_test = sm.add_constant(X_test)

In [28]:
y_test_pred = loaded_model.predict(X_test)

In [29]:
from sklearn.metrics import mean_squared_error

In [124]:
mean_squared_error()

(16512, 9)