<a href="https://colab.research.google.com/github/surendarmanoj/data-science/blob/main/Multiple_linear_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import random
import pandas as pd
import numpy as np
from sklearn.datasets import make_regression
import plotly.express as px
import plotly.graph_objects as go
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [2]:
def generate_student_dataset(num_students=100, random_seed=42):

    random.seed(random_seed)
    ids = []
    attendance_list = []
    hours_studied_list = []
    past_gpa_list = []
    sleep_hours_list = []
    final_exam_scores = []

    for i in range(num_students):
        student_id = i + 1
        attendance = round(random.uniform(60, 100), 1)
        hours_studied = random.randint(5, 20)
        past_gpa = round(random.uniform(2.0, 4.0), 2)
        sleep_hours = round(random.uniform(5.0, 9.0), 1)
        noise = random.gauss(0, 5)
        score = (
            20
            + 0.5 * attendance
            + 1.5 * hours_studied
            + 4.0 * past_gpa
            + 0.4 * sleep_hours
            + noise
        )
        score = max(0, min(100, round(score, 1)))  # clamp to 0–100

        # Append to lists
        ids.append(student_id)
        attendance_list.append(attendance)
        hours_studied_list.append(hours_studied)
        past_gpa_list.append(past_gpa)
        sleep_hours_list.append(sleep_hours)
        final_exam_scores.append(score)

    # Create DataFrame
    df = pd.DataFrame({
        "ID": ids,
        "Attendance": attendance_list,
        "Hours_Studied": hours_studied_list,
        "Past_GPA": past_gpa_list,
        "Sleep_Hours": sleep_hours_list,
        "Final_Exam_Score": final_exam_scores
    })

    return df


    # Generate dataset with default parameters
df_students = generate_student_dataset(num_students=100, random_seed=42)
print(df_students)


     ID  Attendance  Hours_Studied  Past_GPA  Sleep_Hours  Final_Exam_Score
0     1        85.6              5      3.48          6.0              88.1
1     2        89.6              7      3.18          5.1              91.8
2     3        63.7             12      3.01          5.1              86.2
3     4        81.8             12      2.90          6.1              99.8
4     5        94.8             10      3.40          6.4             100.0
..  ...         ...            ...       ...          ...               ...
95   96        81.7             18      2.12          8.3             100.0
96   97        75.6              5      3.71          8.1              83.0
97   98        74.1             17      3.71          8.8             100.0
98   99        76.8             12      2.98          6.1              85.8
99  100        86.8             17      3.45          8.4             100.0

[100 rows x 6 columns]


In [3]:
df_students.head()

Unnamed: 0,ID,Attendance,Hours_Studied,Past_GPA,Sleep_Hours,Final_Exam_Score
0,1,85.6,5,3.48,6.0,88.1
1,2,89.6,7,3.18,5.1,91.8
2,3,63.7,12,3.01,5.1,86.2
3,4,81.8,12,2.9,6.1,99.8
4,5,94.8,10,3.4,6.4,100.0


In [4]:
fig = px.scatter_3d(df_students, x = 'Attendance',y = 'Hours_Studied', z = 'Final_Exam_Score')
fig.show()

In [5]:
X = df_students.iloc[:,1:3]
y = df_students.iloc[:,-1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=3)


In [6]:
lr = LinearRegression()

In [7]:
lr.fit(X_train, y_train)

In [8]:
y_pred = lr.predict(X_test)

In [9]:
print("MAE", mean_absolute_error(y_test, y_pred))
print("MSE", mean_squared_error(y_test, y_pred))
print("R2 Score", r2_score(y_test, y_pred))

MAE 4.1540755546227235
MSE 23.111913922324916
R2 Score 0.6313736467170104


In [10]:
x = np.linspace(-5, 5, 10)
y = np.linspace(-5, 5, 10)
xGrid, yGrid = np.meshgrid(x, y)
final = np.vstack((xGrid.ravel(), yGrid.ravel())).T
z_final = lr.predict(final).reshape(10, 10)
z = z_final


X does not have valid feature names, but LinearRegression was fitted with feature names



In [11]:
fig = px.scatter_3d(df_students, x = 'Attendance', y='Hours_Studied', z='Final_Exam_Score')

fig.add_trace(go.Surface(x = x, y = y, z =z ))

fig.show()

In [12]:
lr.coef_

array([0.31791152, 1.07406713])

In [13]:
lr.intercept_

52.771661054915576