# Multiple regression

- a regression model with more than one explanatory variable


# One explanatory variable

- One independent variable

### When the independent variable is numeric

- 1 intercept coefficient
- 1 slope coefficient
- Visualize with `regplot`

In [1]:
# from statsmodels.formula.api import ols
# model = ols("y_var ~ x_var_numeric", data=df).fit()
# print(model.params)

In [3]:
# import matplotlib.pyplot as plt
# import seaborn as sns
# sns.regplot(x="x_var_numeric",y="y_var",data=df,ci=None)
# plt.show()

### When the independent variable is categorical


- Add `+ 0` when modeling with `ols()`
- 1 intercept coefficient for each category
- Visualize with `boxplot`


In [4]:
# from statsmodels.formula.api import ols
# model = ols("y_var ~ x_var_categorical + 0", data=df).fit()
# print(model.params)

In [5]:
# sns.boxplot(x="x_var_categorical",
#             y="y_var",
#             data=df,
#             showmeans=True)

# Multiple explanatory variables


- 1 categorical independent variable
- 1 numerical independent variable
- 1 slope coefficient
- 1 intercept coefficient for each category
- Visualize with `scatterplot` and `axline` for each category 
- Since every category shares the same slope, the fitted lines are parallel (hence "parallel slopes regression")

In [2]:
# from statsmodels.formula.api import ols
# model = ols("y_var ~ x_var_numeric + x_var_categorical + 0", data=df).fit()
# print(model.params)

In [6]:
# coeffs = model.params
# print(coeffs)

# cat1, cat2, cat3, cat4, slope = coeffs

# sns.scatterplot(x="x_var_numeric",
#                 y="y_var",
#                 hue="x_var_categorical",
#                 data=df)
# plt.axline(xy1=(0, cat1), slope=slope, color="blue")
# plt.axline(xy1=(0, cat2), slope=slope, color="green")
# plt.axline(xy1=(0, cat3), slope=slope, color="red")
# plt.axline(xy1=(0, cat4), slope=slope, color="orange")

# Predicting parallel slopes

1. Create possible combinations with cartesian product
2. Make a dataframe with the combinations
3. Use the created dataframe for predictions

In [7]:
# import pandas as pd
# import numpy as np

# from itertools import product

# col_numeric = np.arange(5, 61, 5)
# col_categorical = df["x_var_categorical"].unique()

# combinations = product(col_numeric, col_categorical)

# prediction_data = pd.DataFrame(combinations,
#                                 columns=['x_var_numeric','x_var_categorical'])  # names must match the model formula

# print(prediction_data)

# prediction_data = prediction_data.assign(
# predicted_col = model.predict(prediction_data)
# )

# Visualizing the predictions


1. Create parallel lines for each category
2. Create scatterplot for original dataset
3. Create scatterplot for prediction dataset

In [8]:
# plt.axline(xy1=(0, cat1), slope=slope, color="blue")
# plt.axline(xy1=(0, cat2), slope=slope, color="green")
# plt.axline(xy1=(0, cat3), slope=slope, color="red")
# plt.axline(xy1=(0, cat4), slope=slope, color="orange")

# sns.scatterplot(x="x_var_numeric",
# y="y_var",
# hue="x_var_categorical",
# data=df)
# sns.scatterplot(x="x_var_numeric",
# y="y_var",
# color="black",
# data=prediction_data)

# Manually calculating predictions for simple linear regression

1. Extract model parameters
2. Create the predicted column with formula : `intercept + slope * explanatory_data`

In [9]:
# coeffs = model.params
# print(coeffs)
# intercept, slope = coeffs
# explanatory_data = pd.DataFrame({"to_be_predicted": np.arange(5, 61, 5)})
# prediction_data = explanatory_data.assign(
# theoretical_prediction = intercept + slope * explanatory_data["to_be_predicted"]
# )
# print(prediction_data)

# Manually calculating predictions for multiple linear regression


1. Extract model parameters and intercepts
2. Choose specific condition with `np.select()` [This acts like when ... then]
3. Create the predicted column with formula : `intercept_of_choice + slope * explanatory_data`


In [10]:
# coeffs = model.params
# print(coeffs)

# cat1, cat2, cat3, slope = coeffs

# conditions = [
# explanatory_data["class"] == "cat1",
# explanatory_data["class"] == "cat2",
# explanatory_data["class"] == "cat3"
# ]
# choices = [cat1, cat2, cat3]
# intercept = np.select(conditions, choices)
# print(intercept)

# prediction_data = explanatory_data.assign(
# intercept = np.select(conditions, choices),
# theoretical_prediction = intercept + slope * explanatory_data["to_be_predicted"])
# print(prediction_data)



# Model performance metrics

- Coefficient of determination (R-squared): 
    - how well the linear regression line fits the observed values.
    - Larger is better
    - extract : `print(model.rsquared)`
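    - Adding more explanatory variables never decreases this value (and usually increases it), even when they add no real predictive power, so a high R-squared can hide overfitting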
    - Adjusted coefficient of determination penalizes more explanatory variables.
        - Penalty is noticeable when R-squared is small
        - extract: `print(model.rsquared_adj)`
- Residual standard error (RSE): 
    - the typical size of the residuals.
    - Smaller is better
    - RSE: `print(np.sqrt(model.mse_resid))`
    - MSE: `print(model.mse_resid)`