# Section 20: Extensions to Linear Models

## Review: Interpreting Multiple Linear Regression Outputs

In [None]:
import pandas as pd
import numpy as np

import statsmodels.api as sm
from statsmodels.formula.api import ols

# Load the raw auto-mpg data; the target and every predictor below come from it.
data = pd.read_csv('auto-mpg.csv')

# Acceleration is used as-is; displacement, horsepower and weight are log-transformed.
acc = data['acceleration']
logdisp = np.log(data['displacement'])
loghorse = np.log(data['horsepower'])
logweight = np.log(data['weight'])

# Rescale each predictor (note that the method differs per column):
#   acc    -> min-max scaling
#   disp   -> standardization (np.var defaults to ddof=0)
#   horse  -> mean normalization (centered, divided by the range)
#   weight -> standardization (np.var defaults to ddof=0)
scaled_acc = (acc - min(acc)) / (max(acc) - min(acc))
scaled_disp = (logdisp - np.mean(logdisp)) / np.sqrt(np.var(logdisp))
scaled_horse = (loghorse - np.mean(loghorse)) / (max(loghorse) - min(loghorse))
scaled_weight = (logweight - np.mean(logweight)) / np.sqrt(np.var(logweight))

# Collect the scaled predictors, then add the target and the untransformed columns.
data_fin = pd.DataFrame({
    'acc': scaled_acc,
    'disp': scaled_disp,
    'horse': scaled_horse,
    'weight': scaled_weight,
})
mpg = data['mpg']
data_fin = pd.concat([mpg, data_fin, data['cylinders'], data['model year'], data['origin']], axis=1)

# y is kept as a one-column DataFrame (double brackets); X is everything except the target.
y = data_fin[['mpg']]
X = data_fin.drop(['mpg'], axis=1)

In [None]:
# Regress mpg on four columns of the raw (unscaled) `data` frame via the formula API.
predictors = ['acceleration', 'weight', 'horsepower', 'displacement']
formula = 'mpg ~ ' + '+'.join(predictors)
model = ols(formula=formula, data=data).fit()
model.summary()

### Interpretation

* R-squared and adjusted R-squared
    * R-squared is a measure of how well the model fits the data
    * This model and these independent variables explain 72% of the variance in MPG
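    * Adjusted R-squared penalizes R-squared for the number of independent variables, so it only increases when an added variable improves the fit by more than would be expected by chance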
* Coefficient
    * Intercept: Coefficient of 45.2511 means that with all variables at 0, MPG would have a prediction of 45.2511
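    * Independent variables: A **one unit increase** in the independent variable is associated with a **coefficient amount** change in the dependent variable (an increase if the coefficient is positive, a decrease if it is negative), **holding all other independent variables constant**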
* The Hypothesis (per each independent variable)
    * $H_0$: coefficient = 0
    * $H_A$: coefficient != 0
    * We're looking to see if there's a relationship between each independent variable and the dependent variable
    * **t-statistic** is the t-stat from this t-test of whether the coefficient is equal to zero
    * **p-value** -- a p-value below 0.05 (if alpha = 0.05) is evidence to **reject the null hypothesis** so there is statistical evidence that there is a relationship between independent and dependent variable
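    * a p-value above 0.05 means we **fail to reject the null hypothesis**: there is not enough statistical evidence of a relationship between independent and dependent variable (this is not the same as proving that no relationship exists)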
    * The p-value is the p-value associated with the **t-statistic**. The `[0.025  0.975]` columns of the summary give the 95% confidence interval for each coefficient -- if that interval **does not contain 0**, the p-value will be below 0.05

## a. Cross Validation

When using train-test split, random samples of data are created for the training and the test set. The problem with this is that the training and test MSE strongly depend on how the training and test sets were created.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
import matplotlib.pyplot as plt
%matplotlib inline

num = 20
train_err = []
test_err = []
for i in range(num):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
    linreg.fit(X_train, y_train)
    y_hat_train = linreg.predict(X_train)
    y_hat_test = linreg.predict(X_test)
    train_err.append(mean_squared_error(y_train, y_hat_train))
    test_err.append(mean_squared_error(y_test, y_hat_test))
plt.plot(list(range(num)), train_err, label='Training Error')
plt.plot(list(range(num)), test_err, label='Testing Error')
plt.legend();

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the linear model; the fold count matches the
# variable name. scikit-learn reports this scorer as *negated* MSE, so values
# closer to zero are better.
cv_5_results = cross_val_score(linreg, X, y, cv=5, scoring='neg_mean_squared_error')
print(cv_5_results)
np.mean(cv_5_results)

In [None]:
# Same model and data, scored by R-squared on each of 5 folds (one value per fold).
cross_val_score(linreg, X, y, cv=5, scoring='r2')

## b. Bias-Variance Trade-Off

> *Underfitting* (bias) happens when a model cannot learn the training data, nor can it generalize to new data.

- Bias arises when wrong assumptions are made when training a model. For example, an interaction effect is missed, or we didn't catch a certain polynomial relationship. Because of this, our algorithm misses the relevant relations between predictors and the target variable. Note how this is similar to underfitting!

> *Overfitting* (variance) happens when a model learns the training data too well. In fact, so well that it is not generalizable to new data.

- Variance arises when a model is too sensitive to small fluctuations in the training set. When variance is high, random noise in the training data is modeled, rather than the intended outputs. This is overfitting!

<img src="images/modelfit.png" width="700"> 

## c. Interactions

In [None]:
# Select by the 'model year' column so the split does not depend on row order.
is_old = data_fin['model year'] <= 75
yr_old = data_fin[is_old] # cars from 70 to 75
yr_young = data_fin[~is_old] # cars from 76 to 82

In [None]:
# Fit a separate simple regression of mpg on the 'horse' column for each age
# group, so that the two slopes can be compared. No figure is opened here:
# this cell only fits the models and prints their slopes.
regression_1 = LinearRegression()
regression_2 = LinearRegression()

# scikit-learn expects a 2-D feature matrix, hence the reshape to a single column.
horse_1 = yr_old['horse'].values.reshape(-1, 1)
horse_2 = yr_young['horse'].values.reshape(-1, 1)

regression_1.fit(horse_1, yr_old['mpg'])
regression_2.fit(horse_2, yr_young['mpg'])

# In-sample predictions: each model predicts the same rows it was fitted on.
pred_1 = regression_1.predict(horse_1)
pred_2 = regression_2.predict(horse_2)

# The coefficients (slope of mpg against 'horse' in each group)
print(regression_1.coef_)
print(regression_2.coef_)

In [None]:
# Scatter of each age group together with its fitted regression line.
plt.figure(figsize=(10,6))

# Observed points first (these carry the legend labels) ...
for xs, ys, tint, tag in [(horse_1, yr_old['mpg'], 'blue', 'older cars'),
                          (horse_2, yr_young['mpg'], 'red', 'younger cars')]:
    plt.scatter(xs, ys, color=tint, alpha=0.3, label=tag)

# ... then the fitted lines in matching colours.
for xs, fitted, tint in [(horse_1, pred_1, 'blue'), (horse_2, pred_2, 'red')]:
    plt.plot(xs, fitted, color=tint, linewidth=2)

plt.xlabel('horsepower')
plt.ylabel('mpg')
plt.legend();

In [None]:
# Baseline: mean 3-fold cross-validated R-squared of the model without any interaction term.
from sklearn.model_selection import KFold

# Shuffled folds with a fixed seed, so the score is reproducible.
crossvalidation = KFold(n_splits=3, shuffle=True, random_state=1)
regression = LinearRegression()

baseline_scores = cross_val_score(regression, X, y, scoring='r2', cv=crossvalidation)
baseline = np.mean(baseline_scores)
baseline

In [None]:
# Same 3-fold setup as the baseline, with one extra column: horse x model year.
crossvalidation = KFold(n_splits=3, shuffle=True, random_state=1)
regression = LinearRegression()

# `assign` returns a copy of X with the interaction column appended; X itself is untouched.
X_interact_2 = X.assign(horse_year=X['horse'] * X['model year'])

# NOTE: despite its name, this score is for the horse x model year interaction.
interaction_scores = cross_val_score(regression, X_interact_2, y, scoring='r2', cv=crossvalidation)
interact_horse_origin = np.mean(interaction_scores)
interact_horse_origin

In [None]:
# Refit the interaction model with statsmodels to get the full coefficient table.
# sm.OLS does not add an intercept on its own, so a constant column is added first.
X_interact_2 = sm.add_constant(X_interact_2)
model = sm.OLS(endog=y, exog=X_interact_2)
results = model.fit()
results.summary()

## d. Polynomial Regression

In [None]:
# The file is whitespace-delimited; the separator is a regular expression, so it
# is written as a raw string to avoid an invalid "\s" escape-sequence warning.
yld = pd.read_csv('yield.csv', sep=r'\s+', index_col=0)
display(yld.head())

# Quick look at the raw relationship before fitting anything.
plt.scatter(yld['Temp'], yld['Yield'], color='green')
plt.xlabel('Temperature')
plt.ylabel('Yield');

In [None]:
# Target and feature frame for the yield data. NOTE: this rebinds the X and y
# used in the auto-mpg sections above.
y = yld['Yield']
# `columns=` already identifies the axis, so no separate `axis` argument is needed.
X = yld.drop(columns='Yield')  # presumably leaves only 'Temp' -- confirm against the file

# Straight-line fit, drawn over the observed points.
reg = LinearRegression().fit(X, y)
plt.scatter(X, y, color='green')
plt.plot(X, reg.predict(X))
plt.xlabel('Temperature')
plt.ylabel('Yield');

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# In-sample RMSE and R-squared of the straight-line fit.
linear_pred = reg.predict(X)
print(np.sqrt(mean_squared_error(y, linear_pred)))
print(r2_score(y, linear_pred))

In [None]:
# Add the squared temperature as a second feature so a linear model can fit a quadratic curve.
X['Temp_sq'] = X['Temp'].pow(2)
X.head()

In [None]:
# Quadratic fit: the same linear estimator, now trained on Temp and Temp_sq.
reg_q = LinearRegression()
reg_q.fit(X, y)

# Plot against the original temperature column only.
temps = X['Temp']
plt.scatter(temps, y, color='green')
plt.plot(temps, reg_q.predict(X))
plt.ylabel('Yield')
plt.xlabel('Temperature')
plt.show()

In [None]:
# In-sample RMSE and R-squared of the quadratic fit.
quadratic_pred = reg_q.predict(X)
print(np.sqrt(mean_squared_error(y, quadratic_pred)))
print(r2_score(y, quadratic_pred))

In [None]:
# Refit the quadratic model with statsmodels for the full summary table.
# sm.OLS does not add an intercept on its own, so a constant column is added first.
X = sm.add_constant(X)
model = sm.OLS(endog=y, exog=X)
results = model.fit()
results.summary()