# How to Wear a Suit Like Tan France

![Tan France wearing a suit](tan-france-suit.png "A well-fitted suit")

# A Brief Introduction to Underfitting and Overfitting Your Model

In [47]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import statistics as stats
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures

In [20]:
# Simulate 20 (units, revenue) pairs: revenue equals units plus N(0, 1) noise.
# The generator is seeded (1985) so every run reproduces the same points.
units = np.arange(20)
noise = np.random.default_rng(1985).normal(0, 1, 20)
xs = units.reshape(-1, 1)  # one column per feature, as the regressors below expect
ys = xs[:, 0] + noise

# Ordinary least-squares line through the points; keep the intercept and
# slope so the fitted line can be evaluated by hand later.
reg = linear_model.LinearRegression()
reg.fit(xs, ys)
b0, b1 = reg.intercept_, reg.coef_

# Quadratic fit: expand x into degree-2 polynomial features, then reuse the
# same linear solver on the expanded design matrix.
poly2 = PolynomialFeatures(degree=2)
xs2 = poly2.fit_transform(xs)
polyreg = linear_model.LinearRegression()
polyreg.fit(xs2, ys)
print(polyreg.coef_)
print(polyreg.predict(xs2))
# produce line
def lineq(x, intc, beta):
    """Evaluate the straight line ``intc + beta * x``.

    x: point (or array of points) at which to evaluate the line.
    intc: intercept term.
    beta: slope term.
    Returns the value of the line at ``x``.
    """
    slope_term = beta * x
    return intc + slope_term

# Fitted-line values at the two ends of the x range (0 and 19 units).
y0, y1 = (lineq(x_end, b0, b1) for x_end in (0, 19))

[ 0.          1.042416   -0.00213159]
[-0.34324265  0.69704176  1.73306297  2.764821    3.79231584  4.81554749
  5.83451595  6.84922123  7.85966332  8.86584221  9.86775792 10.86541045
 11.85879978 12.84792592 13.83278888 14.81338865 15.78972523 16.76179862
 17.72960883 18.69315584]


In [34]:
# Scatter of the raw points only, with no fitted line yet.
axis_labels = {'x': 'Sales (Units)', 'y': 'Revenue (Thousand USD)'}
fig = px.scatter(x=xs[:, 0], y=ys, labels=axis_labels)
fig.show()

In [73]:
# Scatter plus a horizontal line at the mean of ys, drawn across x = 0..19.
axis_labels = {'x': 'Sales (Units)', 'y': 'Revenue (Thousand USD)'}
fig = px.scatter(x=xs[:, 0], y=ys, labels=axis_labels)

mean_revenue = ys.mean()
mean_line = go.Scatter(
    x=[0, 19],
    y=[mean_revenue, mean_revenue],
    mode='lines',
    showlegend=False,
    line=go.scatter.Line(color="blue"),
)
fig.add_trace(mean_line)
fig.show()

# Sample variance of ys, computed two ways as a cross-check.
# 1) Library routine (sample variance, i.e. n - 1 in the denominator).
print(stats.variance(ys))

# 2) By hand: sum of squared deviations from the mean, divided by n - 1.
#    The denominator is derived from the data instead of a hard-coded 19,
#    and the mean is stored once rather than computed and thrown away.
ys_mean = stats.mean(ys)
sum(np.square(ys - ys_mean)) / (len(ys) - 1)

36.31022733596737


36.310227335967376

In [22]:
# Same scatter, now with plotly's built-in OLS trendline overlaid.
axis_labels = {'x': 'Sales (Units)', 'y': 'Revenue (Thousand USD)'}
fig = px.scatter(
    x=xs[:, 0],
    y=ys,
    trendline='ols',
    labels=axis_labels,
)
fig.show()

## $R^2 = 0.967$

## Can we do better?

In [42]:
# Join every observation with a line: a "model" that passes through each
# point exactly.
# The y-axis unit wording matches the earlier scatter plots of this data
# ('Thousand USD'), so the figures can be compared without a label change.
axis_labels = {'x': 'Sales (Units)', 'y': 'Revenue (Thousand USD)'}
fig = px.scatter(x=xs[:, 0], y=ys, labels=axis_labels)
fig.add_trace(go.Scatter(x=xs[:, 0], y=ys, mode='lines', showlegend=False,
                         line=go.scatter.Line(color="blue")))
fig.show()

## The Perfect Fit!

$f: \{0, 1, \dots, 19\} \rightarrow \mathbb{R}$ by

- 0 $\rightarrow$  0.6120
- 1 $\rightarrow$ -0.3243
- 2 $\rightarrow$ 0.9824
- ...
- 18 $\rightarrow$ 18.0297
- 19 $\rightarrow$ 20.6768

## Compare to Linear Model:

$f: \mathbb{N} \cup \{0\} \rightarrow \mathbb{R}$ by 

$\text{Revenue} = f(\text{Units}) = -0.22 + 1.00 \times \text{Units}$


## Should You Choose The Perfect Fit??

### __It Depends__: 

- __NO__, if the details of your data largely reflect noise (random variation) 
- __YES__, if they reflect signal (important features of the universe)

If NO, the model is __overfit__, mistaking noise for signal. 

In [None]:
# Data-generating process behind the plots.
# True "model": Revenue = 0 + 1 * Units, plus standard-normal randomness.
rng = np.random.default_rng(1985)  # fixed seed, so the same points every run
xs = np.arange(20).reshape(-1, 1)  # 0 to 19 units, as a single column
ys = xs[:, 0] + rng.normal(0, 1, 20)  # 0 to 19 Revenue + randomness

In [31]:
# Scatter of the simulated data with the OLS trendline.
scatter_kwargs = dict(
    x=xs[:, 0],
    y=ys,
    trendline='ols',
    labels={'x': 'Sales (Units)', 'y': 'Revenue (Thousand USD)'},
)
fig = px.scatter(**scatter_kwargs)
fig.show()

In [32]:
# Scatter of the simulated data with a line joining every observation.
# The y-axis unit wording matches the trendline plot of the same data
# ('Thousand USD'), so the two figures carry identical axis labels.
axis_labels = {'x': 'Sales (Units)', 'y': 'Revenue (Thousand USD)'}
fig = px.scatter(x=xs[:, 0], y=ys, labels=axis_labels)
fig.add_trace(go.Scatter(x=xs[:, 0], y=ys, mode='lines', showlegend=False))
fig.show()


__Overfitting__: When a model mistakes noise for signal OR fits too much noise

### Suit Analogy

__Goal__: to tailor a suit that shows what humans look like *in general*.

This suit shows the details of one particular human, not of humans in general. 

![An overfit suit](overfit-suit.jpg "Overfitting: a suit that follows one body too closely")

__Underfitting__: When a model fits too little signal

### Suit Analogy

__Goal__: to tailor a suit that shows what humans look like *in general*.

This suit fails to show sufficient features of human form. 

![An underfit suit](underfit-suit.png "Underfitting: a suit that hides the human form")

### Good Fit: Suit Tailored With Impeccable Style
![Tan France wearing a suit](tan-france-suit.png "A well-fitted suit")

In [29]:
# A perfect y = x relationship over 0..79 with one wild value injected at
# index 20.
x = list(range(80))
y = list(x)
y[20] = 250

fig = px.scatter(x=x, y=y)
fig.show()

In [30]:
# Same points, with an extra trace built from every observation, spike included.
fig = px.scatter(x=x, y=y)
through_all_points = go.Scatter(x=x, y=y)
fig.add_trace(through_all_points)
fig.show()

## Should You Choose The Perfect Fit??

__Ask Yourself__: What would new data look like? Will a sale of 2 units lead to *exactly* 0.9824 thousand dollars in Revenue?

__If Yes__: Great!

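__If No__: Consider a simpler model, such as the linear fit, that captures the overall trend without memorizing the noise.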


In [9]:
# Inject one bad record (oops!) and compare the OLS trendline with the
# connect-the-dots line.
# The bad value goes into a copy: writing into `ys` itself would leave the
# corrupted point behind for any cell that is re-run afterwards.
ys_outlier = ys.copy()
ys_outlier[10] = 35

fig = px.scatter(x=xs[:, 0], y=ys_outlier,
                 trendline='ols',
                 labels={'x': 'Sales', 'y': 'Revenue'})
fig.add_trace(go.Scatter(x=xs[:, 0], y=ys_outlier, mode='lines', showlegend=False))
fig.show()

In [None]:
# Draw 20 pairs from a bivariate normal with zero means, unit variances and
# covariance 0.98; the seed (1986) keeps the sample reproducible.
xy = np.random.default_rng(seed=1986).multivariate_normal([0, 0], [[1, .98], [.98, 1]], 20)

# Regress column 1 on column 0. Both are passed as 2-D column slices, so
# intercept_ and coef_[0] come back as length-1 arrays.
# (The bare `reg.coef_` / `reg.intercept_` statements were dropped: mid-cell
# expressions are not displayed, so they did nothing.)
reg = linear_model.LinearRegression()
reg.fit(xy[:, 0:1], xy[:, 1:2])

# Fitted-line endpoints at x = -4 and x = 4; the trailing [0] unwraps the
# length-1 array into a plain scalar for plotting.
y1 = lineq(-4, reg.intercept_, reg.coef_[0])[0]
y2 = lineq(4, reg.intercept_, reg.coef_[0])[0]



In [None]:
# Simulated pairs as markers, with the fitted regression line drawn from
# x = -4 to x = 4.
fig = px.scatter()

points = go.Scatter(x=xy[:, 0], y=xy[:, 1], mode='markers', showlegend=False)
fitted_line = go.Scatter(
    x=[-4, 4],
    y=[y1, y2],
    mode="lines",
    line=go.scatter.Line(color="blue"),
    showlegend=False,
)

fig.add_trace(points)
fig.add_trace(fitted_line)
fig.show()

In [None]:
# Wrap the simulated (x, y) pairs in a DataFrame; no column names are given,
# so pandas assigns its default labels.
# NOTE(review): `df` is not referenced by any cell visible here — confirm it is still needed.
df = pd.DataFrame(xy)

In [None]:
# Simulated pairs with plotly's OLS trendline, plus one hand-placed point
# at (0, 5).
fig = px.scatter(x=xy[:, 0], y=xy[:, 1],
                 trendline='ols',
                 labels={'x': 'Sales', 'y': 'Revenue'})
# A Scatter trace's x and y must be array-like; bare scalars (x=0, y=5) are
# rejected by plotly's validators, so the single point is wrapped in lists.
fig.add_scatter(x=[0], y=[5])
fig.show()

![Baggy clothes](baggy_clothes.png)

# The end