# **1. Implement Linear Regression or MLR.**

I have implemented MLR here.

In [29]:
# Importing necessary libraries.
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [30]:
# Defining Own Multi-Linear-Regression class.
class MultiLinearRegression:
    def __init__(self, learning_rate=0.0001, epoch=100):
        self.__learning_rate = learning_rate
        self.__epoch = epoch

    def __update_parameter(self, X, y, predictions):
        errors = predictions - y
        d_theta = (2 / self.m) * np.dot(X.T, errors)
        d_bias = (2 / self.m) * np.sum(errors)

        self.__theta -= self.__learning_rate * d_theta
        self.__bias -= self.__learning_rate * d_bias

    def __cost_function(self, y, predictions):
        errors = predictions - y
        return (1 / self.m) * np.dot(errors.T, errors)

    def fit(self, X, y):
        self.m, self.n = X.shape

        self.__theta = np.random.randn(self.n) * 0.01
        self.__bias = 0
        self.__costs = []

        for i in range(self.__epoch):
            predictions = np.dot(X, self.__theta) + self.__bias
            self.__costs.append(self.__cost_function(y, predictions))

            self.__update_parameter(X, y, predictions)
        return self

    def predict(self, X):
        return np.dot(X, self.__theta) + self.__bias

In [17]:
# Reading the csv file into dataframe DF. Then printing the first five rows.
DF = pd.read_csv('/content/Student_Marks.csv')
DF.head()

Unnamed: 0,number_courses,time_study,Marks
0,3,4.508,19.202
1,4,0.096,7.734
2,4,3.133,13.811
3,6,7.909,53.018
4,8,7.811,55.299


In [18]:
# Checking the DF for null values and datatypes.
DF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   number_courses  100 non-null    int64  
 1   time_study      100 non-null    float64
 2   Marks           100 non-null    float64
dtypes: float64(2), int64(1)
memory usage: 2.5 KB


In [19]:
# Printing the min, max, mean, median(50%), Q1(25%), Q3(75%), standard deviation of each feature.
DF.describe()

Unnamed: 0,number_courses,time_study,Marks
count,100.0,100.0,100.0
mean,5.29,4.07714,24.41769
std,1.799523,2.372914,14.326199
min,3.0,0.096,5.609
25%,4.0,2.0585,12.633
50%,5.0,4.022,20.0595
75%,7.0,6.17925,36.67625
max,8.0,7.957,55.299


In [20]:
# Printing the pearson correlation coefficient between each feature.
DF.corr()

Unnamed: 0,number_courses,time_study,Marks
number_courses,1.0,0.204844,0.417335
time_study,0.204844,1.0,0.942254
Marks,0.417335,0.942254,1.0


In [23]:
# Scaling the datas using MinMaxScalar. Then spliting the data into training and testing set.
scalar = MinMaxScaler()
scaled_DF = scalar.fit_transform(DF)
DF = pd.DataFrame(scaled_DF, columns=DF.columns)

X = DF.iloc[:, 0:2]
y = DF.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
# using my own defined MLR class
mlr = MultiLinearRegression()

mlr.fit(X_train, y_train)
y_pred_1 = mlr.predict(X_test)

r2 = r2_score(y_test, y_pred_1)
print(r2)

0.8710244626114814


In [28]:
# using sklearn Linear-Regression library
lr = LinearRegression()

lr.fit(X_train, y_train)
y_pred_2 = lr.predict(X_test)

r2 = r2_score(y_test, y_pred_2)
print(r2)

0.9459936100591213
