1) From Scratch

In [5]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import r2_score

class LinearRegression:
    def __init__(self, X, Y):
        ones = np.ones((X.shape[0], 1))
        X = np.append(ones, X, axis=1)
        self.X = X
        self.Y = Y
        self.m = X.shape[0]
        self.n = X.shape[1]
        self.theta = np.random.randn(X.shape[1])

    def computeCostFunction(self):
        h = np.matmul(self.X, self.theta)
        self.J = (1 / (2 * self.m)) * np.sum((h - self.Y) ** 2)
        return self.J

    def performGradientDescent(self, num_of_iter, alpha):
        self.Cost_history = []
        self.theta_history = []
        for x in range(num_of_iter):
            h = np.matmul(self.X, self.theta)
            J = self.computeCostFunction()
            self.Cost_history.append(J)
            self.theta_history.append(self.theta)
            temp = h - self.Y
            self.theta = self.theta - (alpha / self.m) * (self.X.T.dot(temp))
        return self.theta, self.Cost_history, self.theta_history

    def predict(self, X_test, Y_test):
        ones = np.ones((X_test.shape[0], 1))
        X_test = np.append(ones, X_test, axis=1)
        self.Y_pred = np.matmul(X_test, self.theta)
        self.error_percentage = (abs(self.Y_pred - Y_test) / Y_test) * 100
        return self.Y_pred, self.error_percentage

    def predictUsingNormalEquation(self, X_test, Y_test):
        ones = np.ones((X_test.shape[0], 1))
        X_test = np.append(ones, X_test, axis=1)
        inv = np.linalg.inv(np.matmul(self.X.T, self.X))
        self.w = np.matmul(np.matmul(inv, self.X.T), self.Y)
        y_pred = np.matmul(X_test, self.w)
        return y_pred, (abs(Y_test - y_pred) / Y_test) * 100

# Load the diabetes dataset
data = pd.read_csv('/content/drive/MyDrive/machine_learning/datasets/diabetes.csv')

# Extract features (X) and target variable (Y)
X = data.iloc[:, :-1].values
Y = data.iloc[:, -1].values

# Create a LinearRegression object
model = LinearRegression(X, Y)

# Perform gradient descent
num_of_iter = 1000
alpha = 0.01
theta_final, cost_history, theta_history = model.performGradientDescent(num_of_iter, alpha)

# Split the data into training and testing sets
split_index = int(0.8 * len(X))
X_train, X_test = X[:split_index], X[split_index:]
Y_train, Y_test = Y[:split_index], Y[split_index:]

# Use the trained model to predict on the test set
Y_pred, error_percentage = model.predict(X_test, Y_test)

print("True Values:", Y_test)
print("Predicted Values:", Y_pred)


True Values: [1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 1 1 0 0
 0 0 0 0 1 0 0 0 1 0 1 1 1 1 0 1 1 0 0 0 0 0 0 0 1 1 0 1 0 0 1 0 1 0 0 0 0
 0 1 0 1 0 1 0 1 1 0 0 0 0 1 1 0 0 0 1 0 1 1 0 0 1 0 0 1 1 0 0 1 0 0 1 0 0
 0 0 0 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 1 0 0 1 0 1 1 1 0 0 1 1 1 0 1 0 1 0 1
 0 0 0 0 1 0]
Predicted Values: [nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan]


  self.J = (1 / (2 * self.m)) * np.sum((h - self.Y) ** 2)
  h = np.matmul(self.X, self.theta)
  h = np.matmul(self.X, self.theta)


2) Using SCIKIT

In [8]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

data = pd.read_csv('/content/drive/MyDrive/machine_learning/datasets/diabetes.csv')

X = data.iloc[:, :-1].values
Y = data.iloc[:, -1].values

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

model = LinearRegression()

model.fit(X_train, Y_train)

Y_pred = model.predict(X_test)

print("True Values:", Y_test)
print("Predicted Values:", Y_pred)

r2_accuracy = r2_score(Y_test, Y_pred)
print("Accuracy:", r2_accuracy)


True Values: [0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 1 0 0 1 0 1 1 1 1 0 1 1
 1 0 1 0 0 0 1 0 1 1 0 0 0 0 1 1 1 0 0 0 0 0 1 1 0 0 1 0 0 0 1 0 1 0 0 0 1
 0 0 0 0 0 0 1 1 0 0 0 1 1 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 0 1 1 1 0
 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 1 1 1 0 1 1 0 1 1 0 1 1 1 0 0 0 0 0 0 0 1 0
 0 1 0 0 1 0]
Predicted Values: [ 0.33550028  0.23809869  0.1510522   0.2401365   0.48142376  0.45257375
 -0.17450469  0.60662287  0.52417796  0.70476953  0.32360466  0.85290601
  0.38466612  0.36056948  0.09946712  0.41539557  0.17869123  0.07782301
  0.80730861  0.51299477  0.28090594  0.08303057  0.5099157   0.11381771
  0.51325022  0.82528549  0.17892718 -0.0594202   0.28338572  0.16407949
  0.83851225  0.80737515  0.68154389  0.7649502   0.56140297  0.62123131
  1.06134554  0.30990775  0.51752336  0.63691482  0.07075333  0.57757007
  0.55015462  0.37541745 -0.07644182  0.50119208  0.59600162  0.27464761
  0.42477995  0.9941898   0.00969584  0.61763578  0.73395288  0.3109097