# DS-SF-26 | Codealong 06 | Introduction to Regression and Model Fit

## Setup

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Keep DataFrame previews compact: at most 10 rows and 10 columns, rendered
# as HTML tables in the notebook.
pd.set_option('display.max_rows', 10)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)

# Draw matplotlib figures inline in the notebook, using the 'ggplot' style.
%matplotlib inline
plt.style.use('ggplot')

In [None]:
def read_dataset():
    """Load the Zillow starter CSV into a DataFrame indexed by its `ID` column.

    The file is read from ``../datasets/zillow-06-start.csv``, resolved
    relative to the current working directory.
    """
    csv_path = os.path.join('..', 'datasets', 'zillow-06-start.csv')
    dataset = pd.read_csv(csv_path, index_col = 'ID')
    return dataset

df = read_dataset()

## Part A1 - Simple Linear Regression

### Transforming Variables, e.g., scaling

Activity: How to scale SalePrice from $ to $M, Size and LotSize from sqft to "1,000 sqft"?

In [None]:
def scale_variables(df):
    """Rescale the price and size columns of `df` in place.

    `SalePrice` is converted from $ to $M, and `Size` and `LotSize` from sqft
    to 1,000 sqft.  The frame is modified in place and nothing is returned,
    which is how the notebook calls it (`scale_variables(df)`).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the `SalePrice`, `Size` and `LotSize` columns.

    Raises
    ------
    KeyError
        If any of the three columns is missing from `df`.
    """
    # Float divisors keep the results floating-point whatever the column dtype.

    # Sale price in $M
    df['SalePrice'] = df['SalePrice'] / 1e6
    # Size in 1,000 sqft
    df['Size'] = df['Size'] / 1e3
    # Lot size in 1,000 sqft
    df['LotSize'] = df['LotSize'] / 1e3

scale_variables(df)

### `SalePrice` as a function of `Size`

In [None]:
# Simple linear regression of SalePrice on Size.  The formula has no `0 +`
# term, so the fitted model includes an intercept (read back below via
# `model.params['Intercept']`).
model = smf.ols(formula = 'SalePrice ~ Size', data = df).fit()

model.summary()

### Model's parameters

In [None]:
model.params

In [None]:
type(model.params)

In [None]:
model.params['Intercept']

### t-values

In [None]:
model.tvalues

### p-values

In [None]:
model.pvalues

### Confidence Intervals

In [None]:
model.conf_int(cols = [0, 1])

In [None]:
type(model.conf_int(cols = [0, 1]))

## Part A2 - Simple Linear Regression

### `SalePrice` as a function of `Size` without `Intercept`

In [None]:
# Same regression as before, but the leading `0 +` removes the intercept so
# the fitted line is forced through the origin.
model = smf.ols(formula = 'SalePrice ~ 0 + Size', data = df).fit()

model.summary()

### Drop outliers

Activity: How to drop outliers?

In [None]:
def drop_outliers(df):
    """Report the size of `df` before and after outlier removal.

    The removal step itself is still the TODO below (the notebook leaves the
    outlier criterion as an activity), so both reported counts are currently
    equal.  The notebook calls this as `drop_outliers(df)` and ignores the
    return value, so the eventual implementation is expected to modify `df`
    in place.

    Parameters
    ----------
    df : sized container (a pandas.DataFrame in this notebook)
        Only `len(df)` is used by the reporting code.
    """
    # Each print call receives one pre-formatted string so the output is the
    # same under Python 2 (print statement) and Python 3 (print function).
    print('Dropping outliers')
    print('- n (before) = {}'.format(len(df)))

    # TODO

    print('- n (after)  = {}'.format(len(df)))
    
drop_outliers(df)

### `SalePrice` as a function of `Size` (again)

In [None]:
model = smf.ols(formula = 'SalePrice ~ Size', data = df).fit()

model.summary()

## Part B - How to check modeling assumptions?

### `.plot_regress_exog()`

In [None]:
figure = plt.figure(figsize = (12, 8))
figure = sm.graphics.plot_regress_exog(model, 'Size', fig = figure)

## Part C1 - How to check normality assumption?

### Histogram (e.g., residuals)

Activity: How to get histograms of residuals?

In [None]:
model = smf.ols(formula = 'SalePrice ~ Size', data = df).fit()

# TODO

Is it normal?

### q-q plot (e.g., residuals) against a normal distribution

In [None]:
figure, ax = plt.subplots(figsize = (8, 8))
figure = sm.qqplot(model.resid, line = 's', ax = ax)

plt.show()

## Part C2 - How to check normality assumption?

### q-q plot of two normal distributions

`.qqplot()` with `line = '45'`; N(0, 1) vs. N(0, 1)

In [None]:
normal_array = np.random.normal(0, 1, size = 100)

figure, ax = plt.subplots(figsize = (8, 8))
figure = sm.qqplot(normal_array, line = '45', ax = ax)
plt.show()

`.qqplot()` with `line = '45'`; N(10, 1) vs. N(0, 1)

In [None]:
normal_array = np.random.normal(10, 1, size = 100)

figure, ax = plt.subplots(figsize = (8, 8))
figure = sm.qqplot(normal_array, line = '45', ax = ax)
plt.show()

`.qqplot()` with `line = 's'`; N(10, 1) vs. N(0, 1) 

In [None]:
normal_array = np.random.normal(10, 1, size = 100)

figure, ax = plt.subplots(figsize = (8, 8))
figure = sm.qqplot(normal_array, line = 's', ax = ax)
plt.show()

`.qqplot()` with `line = '45'`; N(0, 10) vs. N(0, 1) 

In [None]:
normal_array = np.random.normal(0, 10, size = 100)

figure, ax = plt.subplots(figsize = (8, 8))
figure = sm.qqplot(normal_array, line = '45', ax = ax)
plt.show()

## Part D - Inference and Fit

In [None]:
df = read_dataset() # reload the dataset to get our outliers back...

scale_variables(df) # scaling variables (function defined above)

### Effect of outliers on regression modeling - `SalePrice` as a function of `Size`

In [None]:
sns.lmplot('Size', 'SalePrice', df, size = 8)

### `SalePrice` as a function of `Size` after dropping the "worst" outlier

In [None]:
# TODO

sns.lmplot('Size', 'SalePrice', subset_df, size = 8)

## Part E - R<sup>2</sup>

### `SalePrice` as a function of `Size`

In [None]:
model = smf.ols(formula = 'SalePrice ~ 0 + Size', data = df).fit()

model.summary()

### R<sup>2</sup>

In [None]:
model.rsquared

In [None]:
def _report_rsquared(title, data):
    """Print `title`, then R^2 of the with- and without-intercept fits on `data`."""
    print(title)
    # (label shown in the output, formula passed to statsmodels)
    for label, formula in (('SalePrice ~     Size', 'SalePrice ~ Size'),
                           ('SalePrice ~ 0 + Size', 'SalePrice ~ 0 + Size')):
        rsquared = smf.ols(formula = formula, data = data).fit().rsquared
        # One pre-formatted string per print call: same output on Python 2 and 3.
        print('- {}; R^2 = {}'.format(label, rsquared))

_report_rsquared('With outliers:', df)

# print('') emits the blank line on both Python 2 and 3; a bare `print` is a
# no-op expression on Python 3.
print('')
drop_outliers(df) # dropping outliers (function defined above)
print('')

_report_rsquared('Without outliers:', df)

## Part F - Multiple Linear Regression

In [None]:
df = read_dataset() # reload the dataset to get our outliers back...

scale_variables(df) # scaling variables (function defined above)

In [None]:
model = smf.ols(formula = 'SalePrice ~ Size + BedCount', data = df).fit()

model.summary()

## Part G - Multicollinearity

Yet another way to transform variables: `.apply()`

### Transforming Variables (cont.)

In [None]:
# Derive log10, square-root and squared versions of Size and LotSize; each
# transform is applied to both source columns and stored under the matching
# suffix (e.g. SizeLog / LotSizeLog).
size_columns = df[ ['Size', 'LotSize'] ]

for suffix, transform in (('Log', np.log10), ('Sqrt', np.sqrt), ('Square', np.square)):
    df[ ['Size' + suffix, 'LotSize' + suffix] ] = size_columns.apply(transform)

In [None]:
df

### Multicollinearity

Multicollinearity between Size, log10(Size), sqrt(Size), and Size^2

In [None]:
df[ ['Size', 'SizeLog', 'SizeSqrt', 'SizeSquare' ] ].corr()

In [None]:
model = smf.ols(formula = 'SalePrice ~ Size + SizeLog + SizeSqrt + SizeSquare', data = df).fit()

model.summary()

## Part H - Adjusted R<sup>2</sup>

In [None]:
# Baseline model (no intercept) used as the reference point for R^2.
formula = 'SalePrice ~ 0 + IsAStudio + BedCount + BathCount + Size + LotSize'

model = smf.ols(formula = formula, data = df).fit()

# A single pre-formatted string prints identically on Python 2 and Python 3.
print('R^2 = {} (original model)'.format(model.rsquared))

In [None]:
# Build 100 columns (X0 .. X99) of uniform random noise in one call instead of
# inserting them into the frame one at a time.
noise_names = ['X{}'.format(i) for i in range(100)]
x_df = pd.DataFrame(np.random.random((len(df), len(noise_names))),
                    index = df.index,
                    columns = noise_names)

# NOTE(review): this formula also includes BuiltInYear, which the baseline
# formula earlier in the notebook does not -- confirm that is intended, since
# it changes the model beyond just adding the noise columns.
formula = 'SalePrice ~ 0 + IsAStudio + BedCount + BathCount + Size + LotSize + BuiltInYear + '
formula += ' + '.join(noise_names)

# Attach the real variables so the formula can reference both sets of columns.
x_df = x_df.join(df)

x_model = smf.ols(formula = formula, data = x_df).fit()

In [None]:
# TODO