In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from my_ml_package.data.synthetic import variance_sample_for_decision_tree

## Overfitting and Underfitting
* Question: BIG or SMALL?
  |                          | Training Loss/Accuracy/Error                 | Test Loss/Accuracy/Error                     |
  |------------------------------------|-----------------------------------|-----------------------------------|
  |Overfitting    |  Small   | Big  |
  |Underfitting   |  Big   |  Big |
* Generalization
  * Generalization Error or Test Error
    <!-- \operatorname{Err}_{\mathcal{D}_\text{tr}}= -->
    $$\mathrm{E}[\operatorname{Err}(Y_\text{test}, f(X_\text{test})) \mid Y_\text{train}, X_\text{train}]$$ 
    
  <!-- * Expected Generalization Error
    $$\operatorname{Err}=\mathrm{E}[L(Y_\text{test}, \hat{f}(X_\text{test}))]=\mathrm{E}\left[\operatorname{Err}_{\text{train}}\right]$$  -->
  * Generalization Gap >0
    $$ \mathrm{E}[\operatorname{Err}(Y_\text{test}, f(X_\text{test}))] - \operatorname{Err}(Y_\text{train}, f(X_\text{train}))$$
    $$ \operatorname{Acc}(Y_\text{train}, f(X_\text{train})) - \mathrm{E}[ \operatorname{Acc}(Y_\text{test}, f(X_\text{test}))] $$





In [10]:
# week 6
iris = load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

logistic_model_iris = LogisticRegression(max_iter=200)
logistic_model_iris.fit(X_train, y_train)
y_pred_test = logistic_model_iris.predict(X_test)
y_pred_train = logistic_model_iris.predict(X_train)
accuracy_test = accuracy_score(y_test, y_pred_test)
accuracy_train = accuracy_score(y_train, y_pred_train)
print("Accuracy test: ", accuracy_test)
print("Accuracy train: ", accuracy_train)
print("Generalization gap: ", accuracy_train - accuracy_test)



Accuracy test:  1.0
Accuracy train:  0.975
Generalization gap:  -0.025000000000000022


In [12]:
# week 8
df = pd.read_csv('data/Data_for_UCI_named.csv')
X = df.drop('stabf', axis=1).to_numpy()
y = df['stabf'].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)
y_pred = dt_model.predict(X_test)
print("Accuracy test: ", accuracy_test)
print("Accuracy train: ", accuracy_train)
print("Generalization gap: ", accuracy_train - accuracy_test)


Accuracy test:  1.0
Accuracy train:  0.975
Generalization gap:  -0.025000000000000022


**Is the 100% test accuracy due to the easy examples from splitting?**
* Looking for multiple test sets for expectation?
* Looking for the expected ability of the ML model (Do not fix training data but do fix the task)  
    $$\mathrm{E}[\operatorname{Err}(Y_\text{test}, f(X_\text{test})) \mid Y_\text{train}, X_\text{train}]$$ 
    $$\Rightarrow \mathrm{E}[\operatorname{Err}(Y_\text{test}, f(X_\text{test}))$$
 

In [32]:
cross_val_score(dt_model, X, y, cv=5, scoring='accuracy')

array([0.9995, 1.    , 0.9995, 1.    , 1.    ])

**Where does the error come from?**
$$\mathrm{E}[\operatorname{Err}(Y_\text{test}, f(X_\text{test}))=\mathrm{E}\left[(y-\hat{y})^2\right]$$


In [13]:
X_train, y_train = variance_sample_for_decision_tree(sample_size = 40) # we need some variabiliy in the data for bootstrap to work
X_test, y_test = variance_sample_for_decision_tree(sample_size = 2000)

treeclf = DecisionTreeClassifier()
treeclf.fit(X_train, y_train)
y_train_pred = treeclf.predict(X_train)
y_pred = treeclf.predict(X_test)
print('The accuracy on training data is {:.2f}'.format(accuracy_score(y_train, treeclf.predict(X_train))))
print('The accuracy on test data is {:.2f}'.format(accuracy_score(y_test, y_pred)))
print('The test error rate is {:.2f}'.format(1 - accuracy_score(y_test, y_pred)))


The accuracy on training data is 1.00
The accuracy on test data is 0.61
The test error rate is 0.39


## Bias & Variance
* Question: What does the term "variance" in the bias-variance tradeoff refer to?
  1.  refer to the variation across different test samples given a fixed model
  2.  refer to the variability of the learning algorithm across different training subsets. 


* Question: What does the term "bias" in the bias-variance tradeoff refer to?
  1.  a model's inability to capture the true relationship
  2.  a real-world problem (which may be complex) is approximated by a simplified model.

* Formal Definition under Regression:
  $$
  \mathrm{E}\left[(y-\hat{y})^2\right]= (f^{*}(x)+\epsilon-\hat{y})^2 = (\mathrm{E}[\hat{y}]-f^{*}(x))^2+\mathrm{E}\left[(\hat{y}-\mathrm{E}[\hat{y}])^2\right]+\sigma^2
  $$
 
  * $ \operatorname{Bias}(\hat{y})=\mathrm{E}[\hat{y}]-f^{*}(x)$
  * $ \operatorname{Var}(\hat{y})=\mathrm{E}\left[(\hat{y}-\mathrm{E}[\hat{y}])^2\right]$
  <!-- * $\sigma^2$: the variance of the noise -->
  * To make it useful, personally, I will assume $f^{*}(x)=y$ 


* High Variance
  * How to measure **Variance** of the learning algorithm?
    * Test Error / Accuracy
    * Generalization Gap between Training Data and Test Data
      <!-- * can suggest overfitting (and thus high variance), it isn't a direct measure of variance itself -->
    * Deviations across the different validation sets via 5-fold CV
    * Variability of the test performance across different bootstrapped training samples
  * Overfitting: a model learns the training data too well, including its noise and specific details that don't generalize to unseen data.
  
* Overfitting v.s. Variance
  * **Overfitting and variance are not the same.** Overfitting is a condition, while variance is a property of the model.
  * Overfitting is often a symptom of high variance, meaning a model that is too complex becomes overly sensitive to the training data specifics and doesn't generalize well to new data. 
  <!-- * Reducing overfitting typically involves reducing variance by techniques like regularization, cross-validation, pruning, or using ensemble methods. -->
  | Setting | Model (1)                          | Training Data (2)                  | Test Data (3)                      |
    |---------|------------------------------------|-----------------------------------|-----------------------------------|
    | 1       | <span style="color:red;">Fix</span>        | <span style="color:red;">Fix</span>         | <span style="color:green;">Vary</span>      |
    | 2       | <span style="color:red;">Fixed</span>   | <span style="color:green;">Vary</span>      | <span style="color:green;">Vary</span>      |

<!-- * The prediction error (light red curves) $\operatorname{Err}_{\mathcal{T}}$ for 100 simulated training sets each of size 50. The lasso was used to produce the sequence of fits.
    <center><img src="pics/bias_variance.png" width="500"></center> -->
<!-- * Actually, the gap between training sample and test sample is commonly larger than the performance gaps between test samples. -->



* High Bias
  *  Concrete Example: A linear model trying to fit a nonlinear dataset would typically have high bias because it cannot capture the nonlinear relationships.
*  Underfitting v.s. Bias
   *  Underfitting happens when a model is too simple to learn the patterns in the training data properly, resulting in poor performance both on training and test data.


In [15]:
# Prepare 5-Fold Cross Validation
kf = KFold(n_splits=30, shuffle=True, random_state=42)
predictions = []

# Train models and collect predictions
print("Train size:", len(X_train))
for train_index, val_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]
    
    model = DecisionTreeClassifier(criterion="entropy")
    model = RandomForestClassifier(n_estimators=200, criterion="entropy", max_features=None, max_depth=2).fit(X_train, y_train)
    model.fit(X_train_fold, y_train_fold)
    
    # Collect the predicted probabilities for each test instance
    pred = model.predict(X_test)
    predictions.append(pred)


# Convert to a numpy array for easier processing
predictions = np.array(predictions)
# Calculate Variance
variance = np.mean(np.var(predictions, axis=0))
print(f"Variance: {variance}")
# Overfit -> High Vairance
# Decdision  Tree -> 0.109
# Random Forest -> 0.0421

# # Calculate Bias 
# mean_predictions = np.mean(predictions, axis=0) # Expected prediction E[y_hat]
# assert mean_predictions.shape == (len(X_test),)
# bias = np.mean((mean_predictions - y_test) ** 2)
# print(f"Bias: {bias}")



Train size: 40
Variance: 0.04216888888888889
