In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [10]:
# get the dataset
# function to download the data and extract them
import os
import tarfile
import urllib.request

PROJECT_ROOT_DIR = "/home/vikas/machine Learning/ML Alogirthms Scratch"
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join(PROJECT_ROOT_DIR,"datasets","housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()

In [11]:
fetch_housing_data()

In [12]:
import pandas as pd

def load_housing_data(housing_path = HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

In [13]:
housing_data = load_housing_data()

In [14]:
housing_data.head(2)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY


In [16]:
# train set and test set
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(housing_data, test_size=0.2, random_state=42)

In [89]:
# let's implement of linear regression without Regularization
class LinearRegression1:
    
    def __init__(self):
        self.w = [] # random values
        print("LinearRegression")
        
    def fit(self, X, y):
        w_0 = np.ones(len(X)).reshape(len(X), 1) # to add intercept/bias in the equation
        X = np.concatenate((w_0, X), axis=1)
        XXT_inv = np.linalg.inv(np.dot(X.T, X))
        XXT_inv_XT = np.dot(XXT_inv, X.T)
        self.w = np.dot(XXT_inv_XT, y)
        self.w = np.array(self.w)
        
    def predict(self, X):
        w_0 = np.ones(len(X)).reshape(len(X), 1)
        X = np.concatenate((w_0, X), axis=1)
        return np.dot(X, self.w)
    
    def __str__(self):
        print(self.w)

In [90]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16512 entries, 14196 to 15795
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           16512 non-null  float64
 1   latitude            16512 non-null  float64
 2   housing_median_age  16512 non-null  float64
 3   total_rooms         16512 non-null  float64
 4   total_bedrooms      16512 non-null  float64
 5   population          16512 non-null  float64
 6   households          16512 non-null  float64
 7   median_income       16512 non-null  float64
 8   median_house_value  16512 non-null  float64
 9   ocean_proximity     16512 non-null  object 
dtypes: float64(9), object(1)
memory usage: 2.0+ MB


In [91]:
y = train_set[["median_house_value"]]
X = train_set.drop("median_house_value", axis=1)
X = X.drop("ocean_proximity", axis=1)

In [92]:
X.head(1)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
14196,-117.03,32.71,33.0,3126.0,627.0,2300.0,623.0,3.2596


In [93]:
y.shape

(16512, 1)

In [94]:
lin_reg_cus = LinearRegression1()

LinearRegression


In [95]:
lin_reg_cus.fit(X,y)

In [96]:
final_predict = lin_reg_cus.predict(X)

In [97]:
lin_reg_cus.w.shape

(9, 1)

In [98]:
from sklearn.metrics import mean_squared_error
lin_mse_cus = mean_squared_error(y, final_predict)
print(lin_mse_cus)

4811134397.884197


In [99]:
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X, y)
final_predict = lin_reg.predict(X)
lin_mse = mean_squared_error(y, final_predict)
print(lin_mse)

4811134397.884195


In [122]:
# implement linear Regression with Regularizer
class LinearRegression_Reg:
    
    def __init__(self, alpha=1):
        self.alpha = alpha
        self.w = []
        print("LinearRegression with Regularization")
        
    def fit(self, X, y):
        w_0 = np.ones(len(X)).reshape(len(X), 1) # to add intercept/bias in the equation
        X = np.concatenate((w_0, X), axis=1)
        XXT = (np.dot(X.T, X))
        alphaI = self.alpha * np.eye(len(X[0]), len(X[0]))
        alphaI[0,0] = 0
        print(alphaI)
        temp = XXT + alphaI
        XXT_alphaI = np.linalg.inv(temp)
        XXTalphaI_inv_XT = np.dot(XXT_alphaI, X.T)
        self.w = np.dot(XXTalphaI_inv_XT, y)
        self.w = np.array(self.w)
        
    def predict(self, X):
        w_0 = np.ones(len(X)).reshape(len(X), 1)
        X = np.concatenate((w_0, X), axis=1)
        return np.dot(X, self.w)
    

In [123]:
lin_reg_reg_cus = LinearRegression_Reg(1)
lin_reg_reg_cus.fit(X,y)

LinearRegression with Regularization
[[0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1.]]


In [124]:
final_predict = lin_reg_reg_cus.predict(X)

In [125]:
lin_mse_cus = mean_squared_error(y, final_predict)
print(lin_mse_cus)

4811134444.917023


In [110]:
from sklearn.linear_model import Ridge
ridge_reg = Ridge(alpha=1, solver="cholesky")
ridge_reg.fit(X,y)

Ridge(alpha=0, solver='cholesky')

In [111]:
predictions = ridge_reg.predict(X)
ridge_mse = mean_squared_error(y, predictions)
print(ridge_mse)

4811134397.884197
