In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm

from statsmodels.regression.quantile_regression import QuantReg
import statsmodels.formula.api as smf

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.metrics import confusion_matrix

%matplotlib inline

# Loading Data + Splitting

In [2]:
# Load the dataset used throughout this example:
# the diabetes dataset from the UCI Machine Learning Repository.
DIABETES_CSV_PATH = "142a/Datasets/diabetes_dataset.csv"
diabetes_data = pd.read_csv(DIABETES_CSV_PATH)

#This function will split the dataset into training and testing sets
#It will also add a constant to the training and testing sets for the intercept in the regression model
#This function also returns the four pieces in the order X_train, X_test, y_train, y_test
def split(dataset, target_col, test_size=0.2, random_state=4):
    """Split ``dataset`` into train/test design matrices and target vectors.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Full dataset, containing both the predictors and the target column.
    target_col : str
        Name of the column we are trying to predict.
    test_size : float, default 0.2
        Fraction of the rows held out for testing (passed to ``train_test_split``).
    random_state : int, default 4
        Seed for the shuffle, so that the split is reproducible.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``. Both ``X_*`` frames hold every
        column except ``target_col`` plus a constant column for the intercept.
    """
    data_train, data_test = train_test_split(
        dataset, test_size=test_size, random_state=random_state, shuffle=True
    )

    def _design_matrix(part):
        # X = all columns except the one we are trying to predict.
        # has_constant="add" forces the intercept column to be added. With the
        # default ("skip") statsmodels returns the frame unchanged whenever one
        # of its columns happens to be constant, which would leave the train and
        # test matrices with different columns.
        return sm.add_constant(part.drop(columns=[target_col]), has_constant="add")

    X_train = _design_matrix(data_train)
    X_test = _design_matrix(data_test)
    # y = the column we are trying to predict
    y_train = data_train[target_col]
    y_test = data_test[target_col]
    return X_train, X_test, y_train, y_test

# Unpack the four pieces returned by split(); "Outcome" is the column we want to predict.
# The last name must be y_test (it was misspelled y_testsplit), otherwise y_test is
# undefined when the notebook is run from a fresh kernel.
X_train, X_test, y_train, y_test = split(diabetes_data, "Outcome")
X_train

Unnamed: 0,const,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
596,1.0,0,67,76,0,0,45.3,0.194,46
90,1.0,1,80,55,0,0,19.1,0.258,21
734,1.0,2,105,75,0,0,23.3,0.560,53
694,1.0,2,90,60,0,0,23.5,0.191,25
517,1.0,7,125,86,0,0,37.6,0.304,51
...,...,...,...,...,...,...,...,...,...
360,1.0,5,189,64,33,325,31.2,0.583,29
709,1.0,2,93,64,32,160,38.0,0.674,23
439,1.0,6,107,88,0,0,36.8,0.727,31
174,1.0,2,75,64,24,55,29.7,0.370,33


# VIFs

### Multicollinearity
- Occurs when two or more predictors are highly correlated
- Makes the estimated coefficients $\hat{\beta} = (\hat{\beta_0}, \hat{\beta_1}, \ldots , \hat{\beta_p})$ very sensitive to noise in the training data
  - Thus can produce very inaccurate estimates which hurts interpretability and possibly predictive performance
- Tell-tale signs:
  - Some of the estimated coefficients have the “wrong” sign
  - Some of the coefficients are not significantly different from zero
- Multicollinearity can usually be fixed by deleting one or more independent variables


Higher values of VIF imply that the variable is collinear with other features in the dataset. We can use these values to decide which variables to delete from our model!

### Variance Inflation Factor (VIF)
Even before we start training or testing a model, we can measure the collinearity among the predictors.

- Consider regressing each predictor variable $X_j$ on all of the others:
  $$
  X_j = \alpha_0 + \alpha_1 X_1 + \ldots + \alpha_{j-1}X_{j-1} + \alpha_{j+1}X_{j+1} + \ldots + \alpha_p X_p
  $$
- If the $R^2$ for the above (call it $R^2_j$) is equal to 1, then there exists a perfect linear relationship between $X_j$ 
  and all other independent variables according to the training data
- So, define:
  $$
  \text{VIF}_j = \frac{1}{1 - R^2_j}
  $$

In [3]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
#Look at VIFs for each feature of model
def vifs(X_train):
    """Compute the variance inflation factor (VIF) of every non-constant feature.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Design matrix whose first column is assumed to be the constant
        (intercept) column; that column is not reported in the result.

    Returns
    -------
    pandas.DataFrame
        One row per remaining column, with the columns ``"feature"`` (column
        name) and ``"VIF"`` (its variance inflation factor).
    """
    design = X_train.values
    # Start at 1 to skip the constant column, and go up to the real number of
    # columns instead of a hard-coded count, so every feature gets a VIF.
    feature_positions = range(1, X_train.shape[1])
    vif_data = pd.DataFrame({
        "feature": [X_train.columns[pos] for pos in feature_positions],
        # The full matrix (constant included) is passed; only the index varies.
        "VIF": [variance_inflation_factor(design, pos) for pos in feature_positions],
    })
    return vif_data


# Creating a Base Model
Choose a model and then fit it to the data.
The fit() function in both StatsModels and SKLearn is used to train the model based on the data we are passing into it.

#### Regression Models:
If predicting a continuous response variable. Examples: Housing prices, weather.

Ex: Linear Regression

#### Classification Models:
If predicting a yes/no outcome. Examples: identifying spam emails, spotting cancer cells.

Ex: Logistic Regression, K-Nearest Neighbors

## Linear Regression

Linear Regression can be performed with a few different types of regression models. These include Ordinary Least Squares, Least Absolute Deviations, and Ridge.

### Ordinary Least Squares

#### Using StatsModels
Use StatsModels for explainability & evaluation of our model.

In [None]:
# Ordinary least squares with StatsModels: describe the model first, then fit it.
# X_train already carries the constant column, so the fit includes an intercept.
ols_spec = sm.OLS(endog=y_train, exog=X_train)
model_sm_ols = ols_spec.fit()
print(model_sm_ols.summary())

                            OLS Regression Results                            
Dep. Variable:                Outcome   R-squared:                       0.292
Model:                            OLS   Adj. R-squared:                  0.283
Method:                 Least Squares   F-statistic:                     31.20
Date:                Tue, 01 Apr 2025   Prob (F-statistic):           5.10e-41
Time:                        14:56:39   Log-Likelihood:                -311.38
No. Observations:                 614   AIC:                             640.8
Df Residuals:                     605   BIC:                             680.5
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

#### Using SciKitLearn
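Use SciKitLearn when the focus is on prediction and on its surrounding tooling (metrics, model selection, other estimators). For plain OLS it fits the same coefficients as StatsModels, so do not expect a different accuracy.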

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report

# Create the estimator, then train it; fit() hands back the fitted estimator itself.
ols_estimator = LinearRegression()
model_skl_ols = ols_estimator.fit(X_train, y_train)


There is no built-in way to analyze your model in SKLearn, but I found a function from StackOverflow that allows you to look at the model the way you would see using StatsModels or R.

Link: [How to get a regression summary in scikit-learn like R does](https://stackoverflow.com/questions/26319259/how-to-get-a-regression-summary-in-scikit-learn-like-r-does)

## Logistic Regression

- 1. Classification modeling: logistic regression (`statsmodels`) and Linear Discriminant Analysis (`sklearn.discriminant_analysis`).

- 2. Model evaluation: confusion_matrix, accuracy_score, and the ROC curve (`sklearn.metrics`).

#### Using StatsModels
General pattern:

```python
model_sm_logreg = smf.logit(formula='target_col ~ col1 + col2 + col3 + ...',
                            data=data_train).fit()
```

In [14]:
# smf.logit needs the response and the predictors together in one DataFrame.
# Build that frame as a new object: `X_train_sm = X_train` would only alias
# X_train, so adding "Outcome" would also put the target into the X_train that
# the other models are fitted on (target leakage on a full re-run).
X_train_sm = X_train.assign(Outcome=y_train)
model_sm_logreg = smf.logit(
    formula='Outcome ~ Pregnancies + Glucose + BloodPressure + SkinThickness + Insulin + BMI + DiabetesPedigreeFunction + Age',
    data=X_train_sm,
).fit()
print(model_sm_logreg.summary())

Optimization terminated successfully.
         Current function value: 0.477877
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                Outcome   No. Observations:                  614
Model:                          Logit   Df Residuals:                      605
Method:                           MLE   Df Model:                            8
Date:                Tue, 01 Apr 2025   Pseudo R-squ.:                  0.2632
Time:                        21:27:21   Log-Likelihood:                -293.42
converged:                       True   LL-Null:                       -398.21
Covariance Type:            nonrobust   LLR p-value:                 6.089e-41
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
Intercept                   -8.1735      0.799    -10.227      0.000      -9.740

#### Using SciKitLearn

In [12]:
from sklearn.linear_model import LogisticRegression

# Create the classifier (up to 1000 solver iterations), then train it on the training set.
logreg_estimator = LogisticRegression(max_iter=1000)
model_skl_logreg = logreg_estimator.fit(X_train, y_train)

# Using Model for Predictions

Note that so far we have only used our y_train and X_train.
After training our model on our data, we can now start using it to make predictions on our test set!