## Imports

In [8]:
import pickle
import joblib
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

## Read the Data

In [None]:
# Load the CSV, keep every row but leave off the final column (selected by
# position, not by name), then discard any row that still has a missing value.
df = (
    pd.read_csv('housing.csv')
    .iloc[:, :-1]
    .dropna()
)
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0
...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0


In [10]:
# Separate the regression target from the predictor columns.
# .copy() gives y its own data so it is independent of df.
y = df['median_house_value'].copy()
X = df.drop(columns=['median_house_value'])

# Quick sanity check: X and y must have the same number of rows.
print(X.shape, y.shape)

(20433, 8) (20433,)


## Train the Model

In [11]:
# Fit an ordinary least-squares regression on the full feature matrix.
# fit() returns the estimator itself, so it can be chained onto the constructor;
# ending the cell with `model` displays the same estimator repr.
model = LinearRegression().fit(X, y)
model

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [12]:
# Predict on the same feature matrix the model was fitted on (in-sample
# predictions). These values are the baseline the reloaded models are compared to.
y_pred = model.predict(X)
y_pred

array([411270.2074602 , 415943.95527547, 380534.16332107, ...,
        24989.18869283,  37961.51486097,  55555.76850636], shape=(20433,))

In [13]:
# R^2 of the original model's predictions against the true targets.
# The predictions were made on the rows used for fitting, so this is an
# in-sample score. Keywords make the (true, predicted) order explicit.
r2_score(y_true=y, y_pred=y_pred)

0.6369116857335633

In [14]:
# Show the fitted estimator's repr (its constructor parameters) before it is serialised.
model

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


## Pickle

In [15]:
# Serialise the fitted estimator to disk with the standard-library pickle module.
# The file is opened in binary write mode because pickle produces bytes.
with open('model.pkl', mode='wb') as model_file:
    pickle.dump(model, model_file)

In [16]:
# Read the estimator back from 'model.pkl' (binary read mode).
# Only unpickle files you trust: loading a pickle can execute arbitrary code.
with open('model.pkl', mode='rb') as model_file:
    pickle_model = pickle.load(model_file)

In [17]:
# Show the unpickled estimator; its parameters should read the same as the original model's.
pickle_model

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [18]:
# Predict with the unpickled estimator on the same X; the values are expected
# to match y_pred from the original in-memory model.
y_pred_pickle = pickle_model.predict(X)
y_pred_pickle

array([411270.2074602 , 415943.95527547, 380534.16332107, ...,
        24989.18869283,  37961.51486097,  55555.76850636], shape=(20433,))

In [19]:
# R^2 of the unpickled model's predictions against the true targets.
# It should equal the score of the original model if the round trip was lossless.
r2_score(y_true=y, y_pred=y_pred_pickle)

0.6369116857335633

## Joblib

In [20]:
# Persist the same fitted estimator with joblib instead of pickle.
# dump() returns the list of file names it wrote, which is what the cell displays.
joblib.dump(model, 'model.joblib')

['model.joblib']

In [21]:
# Load the estimator back from the joblib file and display it.
# NOTE: joblib.load relies on pickle under the hood, so only load files you trust.
joblib_model = joblib.load('model.joblib')
joblib_model

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [22]:
# Predict with the joblib-restored estimator on the same X; the values are
# expected to match y_pred from the original in-memory model.
y_pred_joblib = joblib_model.predict(X)
y_pred_joblib

array([411270.2074602 , 415943.95527547, 380534.16332107, ...,
        24989.18869283,  37961.51486097,  55555.76850636], shape=(20433,))

In [23]:
# R^2 of the joblib-restored model's predictions against the true targets.
# It should equal the score of the original model if the round trip was lossless.
r2_score(y_true=y, y_pred=y_pred_joblib)

0.6369116857335633