In [59]:
import numpy as np
import pandas as pd
from warnings import warn

def linear_regression(x  :np.array, y : np.array, learning_rate=0.000001, epsilon=0.9) -> np.array:
    """Linear regression algorithm utilizing gradient decent

    Args:
        x (numpy.array): Array containing x values from dataset
        y (numpy.array): Array containing y valiues from dataset
        learning_rate (float, optional): learning rate value represented by alpha. Defaults to 0.0000001.
        epsilon (float, optional): error threshold to check for convergence. Defaults to 0.9.

    Returns:
         np.array: weights for linear regression
    """
    
    #insert column for y intercept
    regr = np.c_[x, np.ones(len(x))]
    
    #weights
    weights = np.ones(regr.shape[1])
    
    #gradient descent
    norm = 1
    while(norm > epsilon):
        # calculate partial derivitive
        prediction_y = regr @ weights.T
        # print(norm)
        part_deriv = regr.T @ (y - prediction_y)

        # calculate normal

        norm = np.sum(np.sqrt(np.square(part_deriv)))
        # adjust weights based on gradient

        weights = weights.T + (learning_rate * part_deriv)
        if (np.isnan(norm)):
          warn('diverged')

          
    return weights
def mean_squared_error(y, predection_y) -> float:
    """Get mean squared error

    Args:
        y (np.array): Values for y
        predection_y (np.array): Predicted values for y

    Returns:
        float: mean squared error
    """
    
    #sum of (y_n - pred_y_n) divided by num of y
    error = np.sum(np.square(y - predection_y))/float(len(y))
    
    return error
    
def predict_y(x:np.array, weights:np.array) -> np.array:
    """Predict y value

    Args:
        x (np.array): independant variables
        weights (np.array): linear regression weights

    Returns:
        np.array: predicted values
    """
  # y =       m      *    x          +     b
    y = weights[:-1] @ np.array(x).T + weights[-1]
    
    return y

In [60]:
# get data from file
data = pd.read_csv("Real estate.csv")
# drop first column which is not usefull 
data = data.drop("No",axis=1)
# show data
data

Unnamed: 0,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,2012.917,32.0,84.87882,10,24.98298,121.54024,37.9
1,2012.917,19.5,306.59470,9,24.98034,121.53951,42.2
2,2013.583,13.3,561.98450,5,24.98746,121.54391,47.3
3,2013.500,13.3,561.98450,5,24.98746,121.54391,54.8
4,2012.833,5.0,390.56840,5,24.97937,121.54245,43.1
...,...,...,...,...,...,...,...
409,2013.000,13.7,4082.01500,0,24.94155,121.50381,15.4
410,2012.667,5.6,90.45606,9,24.97433,121.54310,50.0
411,2013.250,18.8,390.96960,7,24.97923,121.53986,40.6
412,2013.000,8.1,104.81010,5,24.96674,121.54067,52.5


In [61]:
#convert dataframe to numpy array
data = data.to_numpy(dtype=np.float64)

# add all but last column to array - independent variables (x)
x = data[:, :-1]

# add last column to array - dependent variable (y)
y = data[:, -1]



In [63]:
weights = linear_regression(x, y, learning_rate=0.00001, epsilon=0.99)
weights

  norm = np.sum(np.sqrt(np.square(part_deriv)))
  part_deriv = regr.T @ (y - prediction_y)
  prediction_y = regr @ weights.T
  warn('diverged')


KeyboardInterrupt: 