# Linear Regression

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.metrics import mean_squared_error, mean_absolute_error, root_mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

## Simple Linear Regression

$y = b_0 + b_1 X$

$b_0$ - intercept

$b_1$ - slope coefficient

In [None]:
# Toy housing sample: areas[i] is paired position-wise with prices[i].
areas = [40, 60, 50, 80, 200, 150, 75, 45, 140, 100, 120, 165, 135]
prices = [100000, 120000, 125000, 150000, 300000, 220000, 105000, 110000, 200000, 100000, 165000, 240000, 230000]

# X - area (predictor), y - price (response)
X = np.asarray(areas)
y = np.asarray(prices)

In [None]:
# Scatter of the raw observations to eyeball the area/price relationship.
fig, ax = plt.subplots()
ax.scatter(X, y)
ax.set_xlabel("Area")
ax.set_ylabel("Price")
plt.show()

In [None]:
# 2x2 correlation-coefficient matrix of area vs. price, drawn as an annotated heatmap.
corr_matrix = np.corrcoef(X, y)
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="crest")
plt.show()

$b_1 = \frac{\operatorname{Cov}(X, Y)}{\operatorname{Var}(X)}$

$b_1 = \frac{\sum_{i=1}^{n} \left(x_i - \bar{x}\right)\left(y_i - \bar{y}\right)}{\sum_{i=1}^{n} \left(x_i - \bar{x}\right)^2}$

In [None]:
# Least-squares slope: sum of cross-deviations divided by the sum of squared
# X-deviations (the ratio Cov(X, y) / Var(X) from the formula above the cell).
# Deviations are computed once and reduced with NumPy dot products rather than
# Python's built-in sum(), which iterates over the array element by element.
x_dev = X - X.mean()
y_dev = y - y.mean()
b1 = (x_dev @ y_dev) / (x_dev @ x_dev)
b1

In [None]:
# Intercept: chosen so the fitted line passes through the point of means.
x_bar, y_bar = X.mean(), y.mean()
b0 = y_bar - b1 * x_bar
b0

In [None]:
# Fitted prices for the observed areas, evaluated on the estimated line.
y_pred = b1 * X + b0
y_pred

In [None]:
# Observed points with the fitted regression line drawn on top.
fig, ax = plt.subplots()
ax.scatter(X, y)
ax.plot(X, y_pred, c="r", label="Linear regression")
ax.set_xlabel("Area")
ax.set_ylabel("Price")
ax.legend()
plt.show()

In [None]:
# Evaluate the fitted line on a regular grid of areas: 50, 100, ..., 250.
# Note that 250 lies beyond the largest observed area (200), i.e. extrapolation.
X_test = np.arange(50, 251, 50)
y_pred = b1 * X_test + b0
y_pred

In [None]:
# Original sample together with the predictions for the new areas.
fig, ax = plt.subplots()
ax.scatter(X, y)
ax.scatter(X_test, y_pred, label="Predicted values")
ax.set_xlabel("Area")
ax.set_ylabel("Price")
ax.legend()
plt.show()

## Multiple Linear Regression

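$y = b_0 + b_1 X_1 + b_2 X_2 + \dots + b_n X_n = b^\top X$, where $b = (b_0, b_1, \dots, b_n)^\top$ and $X = (1, X_1, \dots, X_n)^\top$ (a constant 1 is prepended to carry the intercept).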

In [None]:
# Load scikit-learn's diabetes dataset and show its feature names.
diabetes = load_diabetes()
diabetes["feature_names"]

In [None]:
# Use only the first three feature columns as predictors.
feature_idx = [0, 1, 2]
X = diabetes.data[:, feature_idx]
y = diabetes.target
(X.shape, y.shape)

In [None]:
# Correlation heatmap of the three selected features plus the target.
labels = list(diabetes.feature_names[:3]) + ["target"]
# column_stack appends the 1-D target as a final column next to the features.
df = pd.DataFrame(np.column_stack((X, y)), columns=labels)
corr = df.corr()
sns.heatmap(corr, annot=True, fmt=".2f", cmap="crest")
plt.show()

$b = \left( X^\top X \right)^{-1} X^\top y$

In [None]:
# Prepend a column of ones so the first coefficient of b acts as the intercept b_0.
# Guarded so that re-running this cell does not stack a second column of ones,
# which would make X^T X singular and break the normal-equation solve.
if not np.all(X[:, 0] == 1):
    X = np.hstack((np.ones((X.shape[0], 1)), X))
X

In [None]:
# Normal equation: solve (X^T X) b = X^T y for the coefficient vector b.
# np.linalg.solve is used instead of forming the explicit inverse, which costs
# more and loses more floating-point accuracy than solving the system directly.
b = np.linalg.solve(X.T @ X, X.T @ y)
b

In [None]:
# In-sample predictions: design matrix times coefficients; show the first ten.
y_pred = X.dot(b)
y_pred[:10]

$MAE = \frac{1}{n} \sum |y_i - \hat{y}_i|$

In [None]:
# Mean absolute error of the in-sample predictions.
mae = mean_absolute_error(y, y_pred)
print(f"MAE: {mae:.6f}")

$MSE = {\frac{1}{n} \sum (y_i - \hat{y}_i)^2}$

In [None]:
# Mean squared error of the in-sample predictions.
mse = mean_squared_error(y, y_pred)
print(f"MSE: {mse:.6f}")

$RMSE = \sqrt{\frac{1}{n} \sum (y_i - \hat{y}_i)^2}$

In [None]:
# Root mean squared error of the in-sample predictions.
rmse = root_mean_squared_error(y, y_pred)
print(f"RMSE: {rmse:.6f}")

$R^2 = 1 - \frac{SS_{res}}{SS_{tot}}$

$SS_{res} = \sum (y_i - \hat{y}_i)^2$

$SS_{tot} = \sum (y_i - \bar{y})^2$

In [None]:
# Coefficient of determination of the in-sample predictions.
r2 = r2_score(y, y_pred)
print(f"R^2: {r2:.6f}")

# Student Performance (Multiple Linear Regression)

[Student Performance Dataset on Kaggle](https://www.kaggle.com/datasets/nikhil7280/student-performance-multiple-linear-regression)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

In [None]:
# Read the student-performance CSV (path is relative to the notebook) and preview it.
csv_path = "../data/Student_Performance.csv"
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Keep only numeric columns, then plot their pairwise correlations.
df_num = df.select_dtypes(include="number")
corr = df_num.corr()
sns.heatmap(corr, annot=True, fmt=".2f", cmap="crest")
plt.show()

In [None]:
# Histograms of four numeric columns on a 2x2 grid.
# Each entry is (column name, number of bins); the column name doubles as the panel title.
panels = [
    ("Previous Scores", 30),
    ("Performance Index", 30),
    ("Hours Studied", 9),
    ("Sleep Hours", 9),
]

plt.figure(figsize=(12, 10))

for position, (column, n_bins) in enumerate(panels, start=1):
    plt.subplot(2, 2, position)
    plt.title(column)
    plt.hist(df_num[column], bins=n_bins)

plt.show()

In [None]:
# Single-feature setup: "Previous Scores" as predictor, "Performance Index" as target.
X = df_num["Previous Scores"].to_numpy(copy=True)
y = df_num["Performance Index"].to_numpy(copy=True)

# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# scikit-learn expects a 2-D feature matrix, so the 1-D predictor becomes a single column.
X_train_2d = X_train.reshape(-1, 1)
model = LinearRegression().fit(X_train_2d, y_train)
(model.intercept_, model.coef_)

In [None]:
# Predictions on the training rows (the predictor is again passed as a single column).
y_pred = model.predict(X_train[:, np.newaxis])
y_pred

In [None]:
# Training points and the fitted line.
# X_train holds "Previous Scores" and y_train holds "Performance Index", so the
# axis labels must follow that order (they were swapped before).
plt.scatter(X_train, y_train, label="Train data")
plt.plot(X_train, y_pred, c="r", label="Linear Regression")
plt.xlabel("Previous Scores")
plt.ylabel("Performance Index")
plt.legend()
plt.show()

$x' = \frac{x - \mu}{\sigma}$

In [None]:
# Split row indices first so the scaler never sees the held-out rows.
# Fitting StandardScaler on the whole table before splitting leaks the test
# set's mean/variance into the training data. test_size and random_state are
# unchanged from the original split.
row_idx = np.arange(len(df_num))
train_idx, test_idx = train_test_split(row_idx, test_size=0.2, random_state=42)

scaler = StandardScaler()
scaler.fit(df_num.iloc[train_idx])   # statistics come from training rows only
data_std = scaler.transform(df_num)  # the same transform is applied to every row

# The last numeric column is used as the target; all preceding columns are features.
X = data_std[:, :-1]
y = data_std[:, -1]

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

In [None]:
# Refit the existing estimator on the standardized multi-feature training split.
model = model.fit(X_train, y_train)
intercept, coefficients = model.intercept_, model.coef_
(intercept, coefficients)

In [None]:
# Predict on the held-out test split; these predictions feed the metrics below.
y_pred = model.predict(X_test)

In [None]:
# Report each test-set metric as "<label>: <value>" with six decimals.
metrics = [
    ("MAE", mean_absolute_error),
    ("MSE", mean_squared_error),
    ("RMSE", root_mean_squared_error),
    ("R^2", r2_score),
]
for label, metric in metrics:
    print(f"{label}: {metric(y_test, y_pred):.6f}")