# Linear Regression

# 1. Import Libraries

In [None]:
#import statsmodels.api as sm
#pip install statsmodels

import statsmodels.api as sm
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 2. Import Data

In [None]:
# Read the Excel workbook "Example.xlsx" (path relative to the working directory)
# into a DataFrame; the bare name on the last line makes the notebook display it.
df = pd.read_excel("Example.xlsx")
df

**Simple Linear Regression**
- Target - College_GPA
- Predictors - HSGPA
  
**Multiple Linear Regression**
- Target - College_GPA
- Predictors - All except Applicant

# 3. Data Cleaning

### 3.1 Basic Overview

In [None]:
# Preview the first rows of the raw data.
df.head()

### 3.2 Datatypes

In [None]:
# Print each column's name, dtype and non-null count.
df.info()

### 3.3 Missing values (Nulls)

In [None]:
# Keep only the rows that have no missing value in any column.
new_df = df.dropna(axis=0, how="any")
new_df

### 3.4 Outliers

In [None]:
# Applicant is excluded from the predictors, so remove that column.
new_df = new_df.drop(columns=["Applicant"])
new_df

In [None]:
# Summary statistics of the cleaned data, used here to look for outliers.
new_df.describe()

In [None]:
# Box plot of HSGPA to inspect its distribution and spot outliers.
sns.boxplot(new_df["HSGPA"])
plt.show()

In [None]:
# Box plot of College_GPA (the target) to inspect its distribution and spot outliers.
sns.boxplot(new_df["College_GPA"])
plt.show()

In [None]:
# Lower cut-offs used to trim outliers (presumably read off the box plots
# above; revisit them if the data changes).
HSGPA_MIN = 2.85
COLLEGE_GPA_MIN = 1.9

# Keep rows strictly above both cut-offs. The explicit .copy() makes final_df
# an independent DataFrame, so later column assignments do not raise
# SettingWithCopyWarning or write into a temporary view of new_df.
keep_rows = (new_df["HSGPA"] > HSGPA_MIN) & (new_df["College_GPA"] > COLLEGE_GPA_MIN)
final_df = new_df[keep_rows].copy()
final_df

In [None]:
# Re-draw the College_GPA box plot on the filtered data to check the effect of the cut-offs.
sns.boxplot(final_df["College_GPA"])
plt.show()

In [None]:
# Summary statistics after the outlier filter.
final_df.describe()

### 3.5 Duplicates

In [None]:
# drop_duplicates() returns a new DataFrame instead of modifying in place, so
# the result must be assigned back for the duplicate rows to actually be removed.
final_df = final_df.drop_duplicates()
final_df

# 4. EDA

Correlation & Multicollinearity

**Create heatmap**
- plt.figure(figsize=(10, 8))
- sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", center=0)
- plt.title("Correlation Heatmap")
- plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encoder instance plus the names of all object-dtype (text) columns.
le = LabelEncoder()
cat_features = final_df.select_dtypes(include=["object"]).columns
cat_features

In [None]:
from sklearn.preprocessing import LabelEncoder

# Work on an independent copy so the column assignments below never write
# into a filtered view of an earlier DataFrame (SettingWithCopyWarning).
final_df = final_df.copy()

# Names of all object-dtype (text) columns; these get integer codes.
cat_features = final_df.select_dtypes(include="object").columns

# Fit one encoder per column and keep it, so every column's integer codes can
# be mapped back to the original labels later with inverse_transform().
# (Re-fitting a single shared encoder would only remember the last column.)
label_encoders = {}
for col in cat_features:
    le = LabelEncoder()
    final_df[col] = le.fit_transform(final_df[col].astype(str))
    label_encoders[col] = le

final_df

In [None]:
# Remove the Gender, Enrolled and Admitted columns before the correlation analysis.
final_df = final_df.drop(columns=["Gender", "Enrolled", "Admitted"])
final_df

In [None]:
# Pairwise Pearson correlation between all remaining columns.
corr = final_df.corr(method="pearson")

In [None]:
# Annotated heatmap of the correlation matrix; the diverging palette is
# centred on zero so positive and negative correlations are easy to tell apart.
heatmap_options = {"annot": True, "fmt": ".2f", "cmap": "coolwarm", "center": 0}
sns.heatmap(corr, **heatmap_options)
plt.title("Correlation Heatmap")
plt.show()

##### Correlation --> which predictors are correlated with target?
- HSGPA has the strongest 0.47, weak positive
- SAT/ACT has 0.31, weak positive
- Using these two will give us the best model

##### Multicollinearity --> which predictors are correlated among themselves?
- Edu_Parent1 and Edu_Parent2 are moderately positively correlated with each other, so keep only one of them
- Asian and White are negatively correlated with each other, so keep only one of them
- Within each pair, Edu_Parent1 and White have the higher correlation (presumably with the target), so those two are kept

In [None]:
# Drop one column from each correlated pair of predictors.
final_df = final_df.drop(columns=["Asian", "Edu_Parent2"])
final_df

# 5. Splitting the data

In [None]:
# Row filtering left gaps in the index. reset_index() returns a new frame, so
# assign it back; drop=True discards the old labels instead of inserting them
# as an "index" column, which would otherwise end up among the predictors.
final_df = final_df.reset_index(drop=True)
final_df

In [None]:
from sklearn.model_selection import train_test_split

# Predictors are every column except the target; the target is kept as a
# one-column DataFrame.
target_col = "College_GPA"
X = final_df.drop(columns=target_col)
y = final_df[[target_col]]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the number of rows in each part.
for part in (X_train, X_test):
    print(len(part))

# 6. Training the Linear Model

- X_reg = sm.add_constant(X_train) # adding a constant
- reg = sm.OLS(y_train, X_reg).fit()
- pred = reg.predict(X_reg)
- reg.summary()

y = b0 * 1 + b1 * x

In [None]:
# Display the held-out predictors.
X_test

In [None]:
# Prepend a constant column to the training predictors so the fitted model
# has an intercept term (the b0 * 1 part of the equation).
X_reg = sm.add_constant(X_train, prepend=True)
X_reg

In [None]:
# Fit ordinary least squares on the training design matrix and show the report.
ols_model = sm.OLS(endog=y_train, exog=X_reg)
reg = ols_model.fit()
reg.summary()

**MLR**
- R-squared is 0.301: better than SLR but still not great. About 30% of the variation in College GPA is explained, 8 percentage points more than SLR.
- Prob (F-statistic) is less than 0.05, so the model is significant.
- All the variables have a p-value less than 0.05, so all of them are significant.

**Interpretations (Simple Linear)**
- The R-squared value is 0.22, which indicates that 22% of the variation in college GPA is explained by HSGPA. Not great, as it is not greater than 0.7.
- Prob (F-statistic) is very small, definitely less than 0.05, so the model is significant (useful). Precisely, at least one of the independent variables is able to predict "y".
- HSGPA has a p-value of 0.000, which is less than 0.05, so it is significant in predicting "y".
- The HSGPA coefficient is 0.65. If HSGPA goes up by 0.01, then College GPA goes up by 0.0065.

# 7. Making Predictions

In [None]:
# Build the test design matrix under its own name so X_test stays untouched
# and this cell can be re-run safely. has_constant="add" forces the intercept
# column even if some test column happens to be constant; the default "skip"
# would silently leave it out and predict() would fail on a shape mismatch.
X_test_reg = sm.add_constant(X_test, has_constant="add")

# Use exactly the columns (and column order) the model was trained on.
X_test_reg = X_test_reg[reg.model.exog_names]

# Predictions of the fitted OLS model for the held-out rows.
pred = reg.predict(X_test_reg)

pred

# 8. Evaluating the Model
- from sklearn.metrics import mean_squared_error
- mse = mean_squared_error(y_train, pred)
- rmse = np.sqrt(mean_squared_error(y_train, pred))

- MSE = Sum((actual value - prediction)^2) / number of data points
- RMSE = Sqrt(MSE)

In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error

# Training error: score the fitted model on the rows it was trained on.
pred_train = reg.predict(X_reg)
mse_train = mean_squared_error(y_train, pred_train)

# RMSE is the square root of the MSE.
rmse_train = np.sqrt(mse_train)
rmse_train

In [None]:
import numpy as np

# Testing error: compare the held-out targets with the predictions made for them.
mse = mean_squared_error(y_test, pred)

# RMSE is the square root of the MSE.
rmse_test = np.sqrt(mse)
rmse_test

# 9. Visualizing the Model

- plt.scatter(y_train, pred)
- plt.plot([y_train.min(), y_train.max()], [y_train.min(), y_train.max()], 'r--')
- plt.xlabel('Actual Values')
- plt.ylabel('Predicted Values')
- plt.title('Actual vs Predicted Values')
- plt.show()

In [None]:
# Predicted vs. actual values on the test set.
plt.scatter(y_test, pred)

# Dashed red diagonal from the smallest to the largest actual value: points on
# this line are perfect predictions.
diagonal = [y_test.min(), y_test.max()]
plt.plot(diagonal, diagonal, 'r--')

plt.title('Actual vs Predicted Values')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.show()

# 10. Reflection

- The R-squared value is 0.22, which indicates that 22% of the variation in college GPA is explained by HSGPA. Not great, as it is not greater than 0.7.
- Prob (F-statistic) is very small, definitely less than 0.05, so the model is significant (useful). Precisely, at least one of the independent variables is able to predict "y".
- HSGPA has a p-value of 0.000, which is less than 0.05, so it is significant in predicting "y".
- The HSGPA coefficient is 0.65. If HSGPA goes up by 0.01, then College GPA goes up by 0.0065.
- Similar performance on the testing and training sets, which is good.
- The visualization shows that the model often predicts higher than the actual values.

# 11. Advanced Models

**https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html**

- from sklearn.preprocessing import PolynomialFeatures
- from sklearn.linear_model import SGDRegressor
- from sklearn.pipeline import make_pipeline
- from sklearn.metrics import mean_squared_error
- from sklearn.preprocessing import StandardScaler, MinMaxScaler

##### ridge = make_pipeline(StandardScaler(), SGDRegressor(penalty="l2"))
##### lasso = make_pipeline(StandardScaler(), SGDRegressor(penalty="l1"))
##### elasticnet = make_pipeline(StandardScaler(), SGDRegressor(penalty="elasticnet"))
##### poly_features = PolynomialFeatures(degree=2, include_bias=False)

In [None]:
# Re-run the same 80/20 split (same seed as before) to get fresh train and
# test sets for the scikit-learn models below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Display the training predictors.
X_train

In [None]:
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Standardize the predictors, then fit a linear model by stochastic gradient
# descent with an L2 (ridge) penalty. random_state pins SGD's data shuffling
# so the reported metrics are repeatable from run to run.
ridge = make_pipeline(StandardScaler(), SGDRegressor(penalty="l2", random_state=42))

# y_train is a one-column DataFrame; scikit-learn expects a 1-D target and
# emits a DataConversionWarning otherwise, so flatten it first.
ridge.fit(X_train, y_train.to_numpy().ravel())
ridge_pred = ridge.predict(X_test)

ridge_pred

**Print errors**
print(f"Model: Ridge Regression | MSE: {mse:.4f} | RMSE: {rmse:.4f} | R²: {r2:.4f}")


In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Test-set scores for the OLS predictions, for comparison with the SGD models.
mse = mean_squared_error(y_test, pred)
rmse, r2 = np.sqrt(mse), r2_score(y_test, pred)

print(f"Model: Linear Regression | MSE: {mse:.4f} | RMSE: {rmse:.4f} | R²: {r2:.4f}")

In [None]:

# Test-set scores for the ridge (L2-penalized SGD) predictions.
mse = mean_squared_error(y_test, ridge_pred)
rmse, r2 = np.sqrt(mse), r2_score(y_test, ridge_pred)

print(f"Model: Ridge Regression | MSE: {mse:.4f} | RMSE: {rmse:.4f} | R²: {r2:.4f}")


In [None]:
# Standardize the predictors, then fit SGD with an L1 (lasso) penalty.
# random_state pins SGD's data shuffling so the metrics are repeatable.
lasso = make_pipeline(StandardScaler(), SGDRegressor(penalty="l1", random_state=42))

# Flatten the one-column target DataFrame: scikit-learn expects a 1-D y and
# emits a DataConversionWarning otherwise.
lasso.fit(X_train, y_train.to_numpy().ravel())
lasso_pred = lasso.predict(X_test)

# Test-set scores.
mse = mean_squared_error(y_test, lasso_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, lasso_pred)

print(f"Model: Lasso Regression | MSE: {mse:.4f} | RMSE: {rmse:.4f} | R²: {r2:.4f}")

In [None]:
# Standardize the predictors, then fit SGD with an elastic-net penalty (a mix
# of L1 and L2). random_state pins SGD's data shuffling so the metrics are
# repeatable.
enet = make_pipeline(StandardScaler(), SGDRegressor(penalty="elasticnet", random_state=42))

# Flatten the one-column target DataFrame: scikit-learn expects a 1-D y and
# emits a DataConversionWarning otherwise.
enet.fit(X_train, y_train.to_numpy().ravel())
enet_pred = enet.predict(X_test)

# Test-set scores.
mse = mean_squared_error(y_test, enet_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, enet_pred)

print(f"Model: ElasticNet Regression | MSE: {mse:.4f} | RMSE: {rmse:.4f} | R²: {r2:.4f}")

In [None]:
from sklearn.model_selection import GridSearchCV

# Pipeline under search: standardize, then SGD regression. random_state fixes
# SGD's data shuffling so every candidate is compared on equal footing and the
# search result is repeatable.
sgd_pipeline = make_pipeline(StandardScaler(), SGDRegressor(random_state=42))

# Keys use the "<step name>__<parameter>" form; make_pipeline names the step
# after the lower-cased class name, hence the "sgdregressor__" prefix.
parameters = {
    'sgdregressor__loss': ['squared_error', 'huber', 'epsilon_insensitive'],
    'sgdregressor__penalty': ['l2', 'l1', 'elasticnet', None],
    'sgdregressor__alpha': [0.0001, 0.001, 0.01],
    'sgdregressor__learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
    'sgdregressor__eta0': [0.01, 0.001, 0.1],  # initial learning rate
}

# Exhaustive cross-validated search over the grid; n_jobs=-1 uses all CPU cores.
clf = GridSearchCV(
    estimator=sgd_pipeline,
    param_grid=parameters,
    n_jobs=-1,
)

# Flatten the one-column target DataFrame: scikit-learn expects a 1-D y and
# would otherwise emit a DataConversionWarning for every single fit.
clf.fit(X_train, y_train.to_numpy().ravel())

In [None]:
# Hyperparameters are hard-coded here (presumably copied from an earlier grid
# search result; re-check them whenever the search is re-run).
# random_state pins SGD's data shuffling so the metrics are repeatable.
best = make_pipeline(
    StandardScaler(),
    SGDRegressor(
        alpha=0.01,
        eta0=0.001,
        loss='epsilon_insensitive',
        penalty='l1',
        random_state=42,
    ),
)

# Flatten the one-column target DataFrame: scikit-learn expects a 1-D y and
# emits a DataConversionWarning otherwise.
best.fit(X_train, y_train.to_numpy().ravel())
best_pred = best.predict(X_test)

# Test-set scores.
mse = mean_squared_error(y_test, best_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, best_pred)

print(f"Model: Best Regression | MSE: {mse:.4f} | RMSE: {rmse:.4f} | R²: {r2:.4f}")

- y = b0 + b1*x1 + b2*x2
- y = b0 + b1*x1 + b2*x2 + b3*x1^2 + b4*x2^2 + b5*x1*x2 (degree 2: the original x1, x2 plus their squares and their product)
 

In [None]:
# Display the training predictors (to count the original feature columns).
X_train

For degree 2: 1 (bias) + 5 (original features) + 5 (squares of the original 5) + 10 (pairwise products: 12, 13, 14, 15, 23, 24, 25, 34, 35, 45) = 21 columns

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the predictors into all polynomial terms up to degree 3.
poly_features = PolynomialFeatures(degree=3)

# Fit the expansion on the training data only, then reuse that fitted
# transformer on the test data. Test data must only ever be transformed,
# never fitted on.
X_train_poly = poly_features.fit_transform(X_train)
X_test_poly = poly_features.transform(X_test)

# Number of generated feature columns.
X_train_poly.shape[1]

In [None]:
# Same hard-coded SGD settings as the "best" model, but trained on the
# polynomial feature expansion. random_state pins SGD's data shuffling so the
# metrics are repeatable.
poly = make_pipeline(
    StandardScaler(),
    SGDRegressor(
        alpha=0.01,
        eta0=0.001,
        loss='epsilon_insensitive',
        penalty='l1',
        random_state=42,
    ),
)

# Flatten the one-column target DataFrame: scikit-learn expects a 1-D y and
# emits a DataConversionWarning otherwise.
poly.fit(X_train_poly, y_train.to_numpy().ravel())
poly_pred = poly.predict(X_test_poly)

# Test-set scores.
mse = mean_squared_error(y_test, poly_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, poly_pred)

print(f"Model: Polynomial Regression | MSE: {mse:.4f} | RMSE: {rmse:.4f} | R²: {r2:.4f}")