In [1]:
!pip install numpy scikit-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import numpy as np

## Load Dataset

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

    :The target variable is the median house value for California districts, expressed in hundreds of thousands of dollars ($100,000).

In [3]:
from sklearn.datasets import fetch_california_housing

(x,y) = fetch_california_housing(return_X_y = True)

print(x.shape)
print(y.shape)

(20640, 8)
(20640,)


In [4]:
y[0:10]

array([4.526, 3.585, 3.521, 3.413, 3.422, 2.697, 2.992, 2.414, 2.267,
       2.611])

## Split Dataset

In [5]:
from sklearn.model_selection import train_test_split

# we set aside 20% of the data for testing, and use the remaining 80% for training
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

## Linear Regresssion

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

model = LinearRegression()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

testing_mse = mean_squared_error(y_test, y_pred)
print(testing_mse)

0.5291307210017056


In [7]:
y_test[0:10]

array([0.834, 0.949, 1.565, 1.681, 3.229, 2.659, 2.111, 1.139, 3.509,
       0.683])

In [8]:
y_pred[0:10]

array([1.85733601, 1.41005433, 2.34000609, 2.71307003, 2.32259047,
       2.46126821, 1.59725078, 1.02374913, 3.21888182, 1.20937552])

In [9]:
model.score(x_test,y_test)

0.6165556649743994

## Polynomial Regresssion

Polynomial of Degree Three

In [10]:
from sklearn.preprocessing import PolynomialFeatures

x_train_poly = PolynomialFeatures(degree = 2).fit_transform(x_train)
model = LinearRegression()
model.fit(x_train_poly, y_train)

x_test_poly = PolynomialFeatures(degree = 2).fit_transform(x_test)
y_pred = model.predict(x_test_poly)

testing_mse = mean_squared_error(y_test, y_pred)
print(testing_mse)

0.5999932365984371


In [11]:
x_test_poly.shape

(4128, 45)

In [12]:
model.score(x_test_poly,y_test)

0.5652038362244252

## Regularization Ridge

In [13]:
from sklearn.linear_model import Ridge

In [14]:
model = Ridge(alpha = 0.01)
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

testing_mse = mean_squared_error(y_test, y_pred)
print(testing_mse)

0.5291307283275838


In [15]:
model.score(x_test,y_test)

0.6165556596655666

## Regularization ElasticNet

In [16]:
from sklearn.linear_model import ElasticNet

In [17]:
model = ElasticNet(alpha = 0.01)
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

testing_mse = mean_squared_error(y_test, y_pred)
print(testing_mse)

0.5316799690145549


In [18]:
model.score(x_test,y_test)

0.6147083053895092