In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

%matplotlib inline

In [None]:
# Load the California housing dataset (features in ds.data, target in ds.target).
ds = datasets.fetch_california_housing()
X = ds.data
y = ds.target

# Drop very cheap or very expensive homes: the target saturates at both ends
# (values <= 0.15 or >= 5), so those rows do not carry a real price signal.
ind = (y > 0.15) & (y < 5)
X = X[ind, :]
y = y[ind]

# Log-transform the target so that its distribution is closer to Gaussian.
# Safe here because the filter above guarantees y > 0.15 > 0.
y = np.log(y)

# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# mean/variance of the test rows influence the preprocessing (data leakage).
# Same random_state and sample count as before, so the same rows land in
# train and test.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=0
)

# Learn the scaling statistics from the training rows only, then apply that
# same transform to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept for backward compatibility with any code that expects the full scaled
# matrix; note that it now uses the train-fitted statistics.
X_scaled = scaler.transform(X)

In [None]:
# Show the human-readable description that ships with the dataset object.
dataset_description = ds.DESCR
print(dataset_description)

In [None]:
# Steps 1 & 2 - build an ordinary least-squares regressor and train it on the
# training split. fit() returns the estimator itself, so construction and
# training can be chained while `model` still names the fitted object.
model = linear_model.LinearRegression().fit(X_train, y_train)

# Step 3 - predict the target for the held-out test rows.
y_pred = model.predict(X_test)

In [None]:
# Summarise the fitted model and its accuracy on the held-out test set.
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
test_r2 = r2_score(y_test, y_pred)

# (label, value) pairs, printed in the same order and format as before.
report = [
    ('Model coeffs:\n', model.coef_),
    ('Model intercept:\n', model.intercept_),
    ('Test RMSE:\n', test_rmse),
    ('R2:\n', test_r2),
]
for label, value in report:
    print(label, value)

In [None]:
# Plot actual vs. predicted targets for the test set. Points on the blue
# diagonal are perfect predictions; distance from it is the prediction error.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(y_test, y_pred, c='red')

# Endpoints of the identity line: the overall max and min across both arrays.
# NumPy reductions are used instead of the Python builtins, which would
# iterate over every array element at interpreter speed.
p1 = max(np.max(y_pred), np.max(y_test))
p2 = min(np.min(y_pred), np.min(y_test))
ax.plot([p1, p2], [p1, p2], 'b-')

ax.set_xlabel('Actual', fontsize=15)
ax.set_ylabel('Predictions', fontsize=15)
ax.set_title('Actual vs. predicted (test set)', fontsize=15)
# Equal axis scaling keeps the identity line at 45 degrees.
ax.axis('equal')
plt.show()