In [None]:
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import pandas as pd

# Underfitting and overfitting



## Decision tree classifier

In [None]:
# Load the Titanic passenger table (expects titanic.csv in the working directory).
titanic = pd.read_csv('titanic.csv')

titanic = titanic.drop(columns=['Name']) # drop the column 'Name' (not used as a feature)
is_F = (titanic['Sex']=='female') # boolean Series: True for female passengers
titanic['Sex'] = is_F.astype(int) # 1 = female, 0 = male

# Fix the seed of the split: without it every run draws a different 80/20
# partition, so the scores discussed below would not be reproducible.
train = titanic.sample(frac=0.8, random_state=1234) # 80% rows for training
test = titanic.drop(index=train.index) # remaining rows for testing

# Separate the target ('Survived') from the predictors.
y_train = train['Survived']
X_train = train.drop(columns=['Survived'])
print(X_train.shape, y_train.shape)

y_test = test['Survived']
X_test = test.drop(columns=['Survived'])
print(X_test.shape, y_test.shape)

### Let's fit two trees: one with `max_depth=2` and the other with `max_depth=20`. 

In [None]:
from sklearn import tree

# A shallow tree (depth 2) ...
T2 = tree.DecisionTreeClassifier(max_depth=2).fit(X_train, y_train)

# ... and a deep one (depth 20), both trained on the same data.
T20 = tree.DecisionTreeClassifier(max_depth=20)
T20.fit(X_train, y_train)

In [None]:
# Draw the depth-2 tree with colour-filled nodes.
fig, ax = plt.subplots(1, figsize = (10, 10))
# Pass the column names as a plain list: some scikit-learn releases validate
# `feature_names` as a list and reject a pandas Index.
p = tree.plot_tree(T2,
                   filled=True,
                   feature_names=list(X_train.columns))

In [None]:
# Draw the depth-20 tree with colour-filled nodes.
fig, ax = plt.subplots(1, figsize = (10, 10))
# Pass the column names as a plain list: some scikit-learn releases validate
# `feature_names` as a list and reject a pandas Index.
p = tree.plot_tree(T20,
                   filled=True,
                   feature_names=list(X_train.columns))

### Evaluate depth 2 tree and depth 20 tree

We see that the training accuracy of the depth-2 tree is lower than that of the depth-20 tree, while the test accuracy of the depth-2 tree is higher.

In [None]:
# Accuracy of the shallow tree on the data it saw and on held-out data.
print('max_depth 2')
print(f'Train score: {T2.score(X_train, y_train)}')
print(f'Test score: {T2.score(X_test, y_test)}')
print()
# Same two numbers for the deep tree.
print('max_depth 20')
print(f'Train score: {T20.score(X_train, y_train)}')
print(f'Test score: {T20.score(X_test, y_test)}')

### Investigate how the tree's `max_depth` affects training and test performance by varying the depth from 1 to 30.

As the depth of the tree increases from 1 to 30, training accuracy (in blue) consistently improves and eventually reaches 100%, indicating that deeper trees can perfectly fit the training data. In contrast, test accuracy (in orange) initially increases slightly but then fluctuates and generally declines as depth increases. This figure illustrates overfitting: deeper trees capture noise in the training data, resulting in reduced generalization performance on unseen data.

In [None]:
# Fit one tree per depth and record accuracy on the training and test sets.
depths = range(1, 31)
train_scores = []
test_scores = []

for depth in depths:
    T = tree.DecisionTreeClassifier(max_depth=depth, criterion='gini')
    T.fit(X_train, y_train)
    train_scores.append(T.score(X_train, y_train))
    test_scores.append(T.score(X_test, y_test))

# Draw both curves on the axes created here (passed explicitly rather than
# relying on matplotlib's implicit "current axes").
fig, ax = plt.subplots(1)
sns.scatterplot(x=depths, y=train_scores, label='train', ax=ax)
sns.scatterplot(x=depths, y=test_scores, label='test', ax=ax)
ax.set_xlabel('Depth of tree')
ax.set_ylabel('Accuracy')
ax.set_title('Decision tree accuracy vs. max_depth');  # trailing ';' hides the Text repr


## Polynomial regression

Let's simulate the data points for this example. We assume the following linear relationship: $Y=X+1 + 0.2\epsilon$, where $\epsilon \sim \mathcal{N}(0,1)$. 

In [None]:
# Seed NumPy's global generator so every run simulates the same data set.
np.random.seed(1234)

# Ground truth is the straight line Y = a*X + b with slope 1 and intercept 1.
a, b = 1, 1

n_points = 100

# Draw the inputs first, then the noise (the order fixes the random stream).
X = np.random.rand(n_points)
noise = 0.2*np.random.randn(n_points) # Gaussian noise scaled by 0.2
Y = a*X + b + noise

In [None]:
fig, ax = plt.subplots(1)

# Draw the noise-free line from the simulation parameters a and b instead of
# hard-coding its endpoints, so the plot stays correct if they are changed.
x_line = np.array([0, 1])
ax.plot(x_line, a*x_line + b, color = "black", label = "true model")
ax.scatter(X, Y, label = "data")
ax.set(xlabel='X', ylabel='Y')
ax.legend();  # trailing ';' hides the Legend repr

### Fit the model 

First, split the data into training and test sets.

In [None]:
# Put the simulated points in a table, target column first.
df = pd.DataFrame({'Y': Y, 'X': X})

# Random 80% of the rows train the model; whatever was not drawn is the test set.
train = df.sample(frac=0.8)
test = df[~df.index.isin(train.index)]
print(train.shape, test.shape)

In [None]:
# Target is column 'Y'; every other column is a predictor.
X_train, y_train = train.drop(columns=['Y']), train['Y']
print(X_train.shape, y_train.shape)

X_test, y_test = test.drop(columns=['Y']), test['Y']
print(X_test.shape, y_test.shape)

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

def PolynomialRegression(degree=2, **kwargs):
    """Build a pipeline that expands inputs to polynomial features, then fits a linear model.

    Parameters
    ----------
    degree : int, default 2
        Passed to ``PolynomialFeatures``.
    **kwargs
        Forwarded unchanged to ``LinearRegression``.

    Returns
    -------
    The (unfitted) pipeline produced by ``make_pipeline``.
    """
    expansion = PolynomialFeatures(degree)
    regressor = LinearRegression(**kwargs)
    return make_pipeline(expansion, regressor)

#### Fit two polynomial regression models to the training data: one using a degree-1 polynomial (simple linear model) and the other using a degree-20 polynomial (more complex model).

In [None]:
# Degree 1: an ordinary straight-line fit.
model1 = PolynomialRegression(1).fit(X_train, y_train)

# Degree 20: a far more flexible curve fitted to the same training data.
model2 = PolynomialRegression(20)
model2.fit(X_train, y_train)

In [None]:
# Dense grid of X values on which both fitted curves are evaluated.
grid = np.linspace(0.01, 1, 1000)
prediction_inputs = pd.DataFrame({'X': grid})

fig, ax = plt.subplots(1)

# One curve per fitted model, drawn in the same order as they were fitted.
for fitted, colour, name in [(model1, "red", "linear"), (model2, "green", "degree 20")]:
    ax.plot(prediction_inputs, fitted.predict(prediction_inputs), color=colour, label=name)

# Overlay the observations: stars for training points, default markers for test points.
ax.scatter(X_train, y_train, marker='*', label="train data")
ax.scatter(X_test, y_test, label="test data")

ax.set(xlabel='X', ylabel='Y')
plt.legend()

Model 1 fits a simple straight line to the data. It doesn't perfectly capture the training data, but it generalizes reasonably well to test data.

Model 2 fits a very complex curve, so it does better on the training set than the simple linear model. But it overfits — it captures the noise in the training data rather than the true underlying pattern. Therefore, its predictions on the test data are worse than just predicting the mean, as we can see from the negative R-squared. 

In [None]:
# R^2 of the straight-line model on training and held-out data.
print('model 1 with degree 1 (linear)')
print(f'Train score: {model1.score(X_train, y_train)}')
print(f'Test score: {model1.score(X_test, y_test)}')
print()
# Same two numbers for the degree-20 model.
print('model 2 with degree 20')
print(f'Train score: {model2.score(X_train, y_train)}')
print(f'Test score: {model2.score(X_test, y_test)}')


In [None]:
# Fit one polynomial model per degree and record R^2 on training and test sets.
degrees = range(1, 31)
train_scores = []
test_scores = []

for degree in degrees:
    lr = PolynomialRegression(degree).fit(X_train, y_train)
    train_scores.append(lr.score(X_train, y_train))
    test_scores.append(lr.score(X_test, y_test))

# Draw both curves on the axes created here (passed explicitly rather than
# relying on matplotlib's implicit "current axes").
fig, ax = plt.subplots(1)
sns.scatterplot(x=degrees, y=train_scores, label='train', ax=ax)
sns.scatterplot(x=degrees, y=test_scores, label='test', ax=ax)
ax.set_xlabel('Degree of polynomial regression')
ax.set_ylabel('$R^2$ score')
ax.set_title('Polynomial regression $R^2$ vs. degree');  # trailing ';' hides the Text repr

## (Overly) simple example from lecture slides

The code below does some model fitting for the example described in class (for Rick Marks' section).  This is not enough data to be confident about any kind of model!  It is intentionally simple so as to illustrate potential issues of fitting models.

In [None]:
# Rebuild the table from the lecture slides: 13 consecutive days, Monday to the
# Saturday of the following week.
week = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
days = week + week[:-1]
values = [190, 189.4, 188.8, 188.2, 187.5, 187, 187.8, 187.2, 186.7, 186.2, 185.8, 185.4, 185]
# The first day has no previous value, hence no change (NaN).
changes = [float('nan'), -0.6, -0.6, -0.6, -0.7, -0.5, 0.8, -0.6, -0.5, -0.5, -0.4, -0.4, -0.4]

# reset_index() exposes the row number as an 'index' column usable as a feature.
df = pd.DataFrame({'Day': days, 'Value': values, 'Value Change': changes}).reset_index()
df

In [None]:
# Value against the sample index -- this reproduces the picture on the slide.
sns.scatterplot(data=df, x='index', y='Value')
plt.ylim(175, 200)

In [None]:
# Day-to-day change in the value against the sample index.
# Note the potential outlier.
sns.scatterplot(data=df, x='index', y='Value Change')


In [None]:
# Distribution of the value change, grouped by day of the week.
sns.boxplot(data=df, x='Day', y='Value Change')


## Linear regression

### Predict Value based on index

What if we try a simple linear regression model to predict Value, using the index? 

In [None]:
from sklearn.linear_model import LinearRegression

# The sample index is the only predictor (every other column is excluded);
# the target is Value.
X1 = df[['index']]
y1 = df['Value']

LM1 = LinearRegression(fit_intercept=True).fit(X1, y1)

# Score of the fit on the data it was trained on.
LM1.score(X1, y1)


In [None]:
# Fitted line over the observed values.
fig, ax = plt.subplots(1)
fitted_values = LM1.predict(X1)
ax.plot(df['index'], fitted_values, color="red", label="linear regression")
ax.scatter(X1, y1, marker='*', label="train data")
plt.ylim(175, 200)
plt.legend()

In [None]:
# Extrapolate one step past the last observed index (0..12).
next_sample = pd.DataFrame({'index': [13]})
LM1.predict(next_sample)

### Predict Value Change based on index

What if instead we used simple linear regression to predict the change in value?

In [None]:
# Remove rows containing NaN. Only the first row has one: it has no previous
# day, so its value change is undefined.
df_clean = df.dropna(axis=0, how='any').copy()
df_clean

In [None]:
from sklearn.linear_model import LinearRegression

# Same single predictor (the sample index), but the target is now the value change.
X2 = df_clean[['index']]
y2 = df_clean['Value Change']

LM2 = LinearRegression(fit_intercept=True).fit(X2, y2)

# As you see below, not a very good fit score (mostly due to value change for index 6).
LM2.score(X2, y2)

In [None]:
# Fitted line over the observed day-to-day changes.
fig, ax = plt.subplots(1)
fitted_changes = LM2.predict(X2)
ax.plot(df_clean['index'], fitted_changes, color="red", label="linear regression")
ax.scatter(X2, y2, marker='*', label="train data")
plt.legend()

In [None]:
# Predicted value change for the next index after the observed data.
next_sample = pd.DataFrame({'index': [13]})
LM2.predict(next_sample)

## Decision tree

Now let's try a decision tree on this data instead.

### Predict Value Change direction based on index

In [None]:
# Work on a copy so df_clean itself is left untouched.
df_clean_tree = df_clean.copy()

# Turn the numeric change into a categorical label based on its sign.
# Every row starts as 'no change'; negative and positive changes are then relabelled.
change = df_clean_tree['Value Change']
df_clean_tree['Value Change direction'] = 'no change'
df_clean_tree.loc[change > 0.0, 'Value Change direction'] = 'increase'
df_clean_tree.loc[change < 0.0, 'Value Change direction'] = 'decrease'

df_clean_tree


In [None]:
from sklearn import tree

# Predictor: whatever remains after removing the listed columns (here the
# sample index). Target: the direction label.
Xt1 = df_clean_tree.drop(columns=['Value Change','Value','Day','Value Change direction'])
yt1 = df_clean_tree['Value Change direction']

T1 = tree.DecisionTreeClassifier(max_depth=2)
T1.fit(Xt1, yt1)

fig, ax = plt.subplots(1, figsize = (8, 8))
# Pass the column names as a plain list: some scikit-learn releases validate
# `feature_names` as a list and reject a pandas Index.
p = tree.plot_tree(T1,
                   filled=True,
                   feature_names=list(Xt1.columns))


In [None]:

# Predicted direction for the next index after the observed data.
next_sample = pd.DataFrame({'index': [13]})
T1.predict(next_sample)

### Predict Value Change direction based on Day of the week

In [None]:
# Encode the weekday as an integer (Mon=0 ... Sun=6) so the decision tree can use it.
# Caveat: integer codes impose an ordering on the days.
day_codes = {day: code for code, day in enumerate(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])}
df_clean_tree['Day numeric'] = df_clean_tree['Day'].map(day_codes)
df_clean_tree


In [None]:
from sklearn import tree

# Predictor: whatever remains after removing the listed columns (here the
# numeric weekday). Target: the direction label.
Xt2 = df_clean_tree.drop(columns=['Value Change','Value','Day','Value Change direction','index'])
yt2 = df_clean_tree['Value Change direction']

# NOTE: this rebinds T2, which earlier in the notebook held the depth-2 Titanic tree.
T2 = tree.DecisionTreeClassifier(max_depth=2)
T2.fit(Xt2, yt2)

fig, ax = plt.subplots(1, figsize = (8, 8))
# Pass the column names as a plain list: some scikit-learn releases validate
# `feature_names` as a list and reject a pandas Index.
p = tree.plot_tree(T2,
                   filled=True,
                   feature_names=list(Xt2.columns))

In [None]:
# Predicted direction for day code 6 (Sunday in the encoding above).
sunday = pd.DataFrame({'Day numeric': [6]})
T2.predict(sunday)

# Predicting the temperament of ROUSes using k-NN classification
Using other data we have in the table, we want to predict the temperament of ROUSes.

In [None]:
# Read the ROUSes table (expects ROUSes.csv in the working directory),
# report its size and preview the first rows.
rouses = pd.read_csv('ROUSes.csv')
print(rouses.shape)
rouses.head(5)

### Exploratory analysis
First, let's look at a scatterplot with the temperament represented as color and symbols to get a general idea of the data.

In [None]:
# Age vs. Length, with temperament encoded by both colour and marker shape.
sns.scatterplot(
    data=rouses,
    x='Age',
    y='Length',
    hue='Temperament',
    style='Temperament',
)

As you can see, there are some clusters of the same temperament, which means the samples have the same temperament as their neighbors, so k-NN should work well for those.  But there are also definitely some samples that are more "alone", so k-NN won't be as good at predicting those.

In [None]:
# Summary statistics of the numeric columns.
summary = rouses.describe()
summary

### Normalize columns
The next cell normalizes the columns so the neighbor distance calculations will be scaled equivalently.  You can try skipping this cell to see the performance without scaling.

In [None]:
# First, try skipping this cell and see results without scaling
# Min-max scale each numeric column: (value - min) / (max - min).
for col in ('Age', 'Length', 'Weight'):
    lo, hi = rouses[col].min(), rouses[col].max()
    rouses[col] = (rouses[col] - lo) / (hi - lo)

rouses.head()

Okay, as usual let's follow the train and test process:

In [None]:
# Seeded 80/20 split: sampled rows train the model, the rest are held out.
train = rouses.sample(frac=0.8, random_state=1234)
test = rouses[~rouses.index.isin(train.index)]
print(train.shape, test.shape)

In [None]:
# Target is 'Temperament'; every remaining column is a predictor.
X_train, y_train = train.drop(columns=['Temperament']), train['Temperament']
print(X_train.shape, y_train.shape)

X_test, y_test = test.drop(columns=['Temperament']), test['Temperament']
print(X_test.shape, y_test.shape)

Notice that this is a very small number of train and test samples, so our results are going to be highly dependent on how the data is split.  Let's try k-NN classification:

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Classify each sample from its 5 nearest training neighbours.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

print(f'Train score: {knn.score(X_train, y_train)}')
print(f'Test score: {knn.score(X_test, y_test)}')


In [None]:
# Show the held-out samples, then predicted and true labels for comparison.
print(X_test)
predicted = knn.predict(X_test)
print(f'Prediction: {predicted}')
print(f'Actual: {list(y_test)}')

Well, predicting 4 out of 6 isn't great, but it is actually better than expected considering the Train score.  Predicting temperament is a pretty tough challenge!  Try different values for k (`n_neighbors`) to see what changes.

# Predicting the weight of ROUSes using k-NN
Using other data we have in the table, we want to predict the weight of ROUSes.

In [None]:
# Reload the table from disk, discarding the scaled columns from the previous section.
rouses = pd.read_csv('ROUSes.csv')
print(rouses.shape)
rouses.head(5)

In [None]:
# Weight against Length for all samples.
sns.scatterplot(
    data=rouses,
    x='Length',
    y='Weight',
)

In our previous linear regression work, we were able to use `Age` to predict `Weight` quite well because the correlation was close to linear.  Let's try using `Length` instead, and see how well we can predict despite the relationship being less linear.

In [None]:
# Drop the columns 'Temperament' and 'Age'.
# errors='ignore' keeps this cell idempotent: `rouses` is rebound to the reduced
# frame, so re-running the cell would otherwise raise KeyError because the
# columns are already gone.
rouses = rouses.drop(columns=['Temperament','Age'], errors='ignore')
rouses.head()

Train and test!

In [None]:
# Seeded 80/20 split: sampled rows train the model, the rest are held out.
train = rouses.sample(frac=0.8, random_state=4321)
test = rouses[~rouses.index.isin(train.index)]
print(train.shape, test.shape)

The next thing to do is to separate out the target data `Weight` from the predictor data (everything else; in this case just `Length` is left).

In [None]:
# Target is 'Weight'; every remaining column is a predictor.
X_train, y_train = train.drop(columns=['Weight']), train['Weight']
print(X_train.shape, y_train.shape)

X_test, y_test = test.drop(columns=['Weight']), test['Weight']
print(X_test.shape, y_test.shape)

Okay, let's try using weighted K-NN for regression:

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# 5-nearest-neighbour regression with weights='distance'.
knn = KNeighborsRegressor(n_neighbors=5, weights='distance').fit(X_train, y_train)

print(f'Train score: {knn.score(X_train, y_train)}')
print(f'Test score: {knn.score(X_test, y_test)}')


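For regression, the "score" is the $R^2$ value: a value near 1 is what we are hoping for, 0 means the model does no better than always predicting the mean, and the score can even be negative for a model that does worse than that.  So our model is doing a very good job at predicting the data!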

To visualize, we can plug the test values in and have their outputs predicted:

In [None]:
# Predicted weights for the held-out lengths.
predictions = knn.predict(X_test)

# Training observations first, then the test-set predictions on the same axes.
fig, ax = plt.subplots(1)
sns.scatterplot(data=train, x='Length', y='Weight', label="training data", ax=ax)
ax.scatter(test['Length'], predictions, label="test")
ax.set(xlabel='Length', ylabel='Weight')
plt.legend()

Indeed, it looks like these predictions follow the trend of the data! You can try different values for k to see how the results change. 