# DS-SF-27 | Codealong 07 | Linear Regression and Model Fit, Part 2

In [None]:
import os

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 20)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)

import statsmodels.api as sm
import statsmodels.formula.api as smf

# TODO

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

import seaborn as sns

## Part A - One-hot encoding for categorical variables

In [None]:
# Load the Zillow listings, using the `ID` column as the row index.
zillow_csv = os.path.join('..', 'datasets', 'zillow-07.csv')
df = pd.read_csv(zillow_csv, index_col='ID')

In [None]:
# Remove, in place, every row whose IsAStudio equals 1.
# Rows where IsAStudio is NaN do not match the comparison and are kept.
is_studio = df.IsAStudio == 1
df.drop(df[is_studio].index, inplace=True)

In [None]:
# Baseline: regress SalePrice on the raw BathCount column (treated as numeric).
bath_count_ols = smf.ols(formula='SalePrice ~ BathCount', data=df)
bath_count_ols.fit().summary()

> ### What's the distribution of bathroom counts in the dataset?

In [None]:
# TODO

> ### Let's keep properties with 1, 2, 3, or 4 bathrooms

In [None]:
# TODO

> ### Let's use `pandas`'s `get_dummies` to create our one-hot encoding

In [None]:
# TODO

In [None]:
baths_df

In [None]:
# Strip the trailing '.0' from the dummy column names so they can be used
# as plain identifiers in the regression formulas below.
dummy_names = {
    'Bath_1.0': 'Bath_1',
    'Bath_2.0': 'Bath_2',
    'Bath_3.0': 'Bath_3',
    'Bath_4.0': 'Bath_4',
}
baths_df.rename(columns=dummy_names, inplace=True)

In [None]:
baths_df

In [None]:
# Attach the dummy columns to the main frame; DataFrame.join aligns rows on the index.
df = df.join([baths_df])

In [None]:
df.columns

## One-hot encoding for categorical variables

> ### `SalePrice` as a function of `Bath_2`, `Bath_3`, and `Bath_4`

In [None]:
# Bath_1 is the omitted dummy (reference level). Only the bathroom dummies are
# used, matching this cell's heading and the three parallel models that follow;
# an extra `Size` regressor here would make the four fits non-comparable.
smf.ols(formula = 'SalePrice ~ Bath_2 + Bath_3 + Bath_4', data = df).fit().summary()

> ### `SalePrice` as a function of `Bath_1`, `Bath_3`, and `Bath_4`

In [None]:
# Bath_2 is the omitted dummy (reference level) in this fit.
formula = 'SalePrice ~ Bath_1 + Bath_3 + Bath_4'
smf.ols(formula=formula, data=df).fit().summary()

> ### `SalePrice` as a function of `Bath_1`, `Bath_2`, and `Bath_4`

In [None]:
# Bath_3 is the omitted dummy (reference level) in this fit.
formula = 'SalePrice ~ Bath_1 + Bath_2 + Bath_4'
smf.ols(formula=formula, data=df).fit().summary()

> ### `SalePrice` as a function of `Bath_1`, `Bath_2`, and `Bath_3`

In [None]:
# Bath_4 is the omitted dummy (reference level) in this fit.
formula = 'SalePrice ~ Bath_1 + Bath_2 + Bath_3'
smf.ols(formula=formula, data=df).fit().summary()

## Part B - Model's F-statistic

In [None]:
# Reload the Zillow listings from disk, replacing the `df` modified in Part A.
zillow_csv = os.path.join('..', 'datasets', 'zillow-07.csv')
df = pd.read_csv(zillow_csv, index_col='ID')

> ### `SalePrice` as a function of `Size`

In [None]:
# TODO

model.summary()

> ### `SalePrice` as a function of `IsAStudio`

In [None]:
# TODO

model.summary()

### Model's F-value (with significance level of `5%`)

In [None]:
model.fvalue

### Associated p-value

In [None]:
model.f_pvalue

## Part C1 - Linear Regression Modeling with `sklearn`

In [None]:
def summary(X, y, model):
    """Print per-feature F-tests, R^2, and coefficients for a fitted model.

    Parameters
    ----------
    X : feature table exposing ``.columns`` (e.g. a pandas DataFrame); the
        column labels are printed next to the matching coefficients.
    y : target values, passed together with ``X`` to ``f_regression`` and
        ``model.score``.
    model : fitted estimator providing ``score(X, y)``, ``intercept_`` and
        an iterable ``coef_``.

    Returns
    -------
    None. The report is written to standard output.

    Notes
    -----
    ``feature_selection`` is looked up in the enclosing namespace; it is not
    imported inside this function. Every ``print`` call receives a single
    pre-formatted string so the output is identical under Python 2 and 3.
    """
    fvalues, f_pvalues = feature_selection.f_regression(X, y)
    print('F-statistic (not joint but instead done sequentially for each regressor)')
    print('- F-value {}'.format(fvalues))
    print('- p-value {}'.format(f_pvalues))
    print('')

    print('R^2 = {}'.format(model.score(X, y)))
    print('')

    print('Coefficients')
    print('- beta_0 (Intercept) = {}'.format(model.intercept_))
    # Number the slopes from 1 because beta_0 is the intercept.
    for i, (name, coef) in enumerate(zip(X.columns, model.coef_), 1):
        print('- beta_{} ({}) = {}'.format(i, name, coef))

> ### Remove samples with `NaN` in `IsAStudio`, `Size`, or `LotSize`

In [None]:
# TODO

### SalePrice ~ IsAStudio with `statsmodels`

In [None]:
# statsmodels reference fit: SalePrice explained by IsAStudio.
studio_ols = smf.ols(formula='SalePrice ~ IsAStudio', data=df)
studio_ols.fit().summary()

> ### SalePrice ~ IsAStudio with `sklearn`

In [None]:
# Double brackets keep X a one-column DataFrame rather than a Series;
# summary() reads X.columns to label each coefficient.
X = df[ ['IsAStudio'] ]
y = df.SalePrice

# TODO

# `model` has to be created by the TODO step above before this call can run.
summary(X, y, model)

### SalePrice ~ Size + LotSize with `statsmodels`

In [None]:
# statsmodels reference fit: SalePrice explained by Size and LotSize together.
size_lot_ols = smf.ols(formula='SalePrice ~ Size + LotSize', data=df)
size_lot_ols.fit().summary()

> ### SalePrice ~ Size + LotSize with `sklearn`

In [None]:
# TODO

## Part C2 - Linear Regression Modeling with `sklearn` (cont.)

In [None]:
# Switch `df` to the advertising dataset (default integer index).
advertising_csv = os.path.join('..', 'datasets', 'advertising.csv')
df = pd.read_csv(advertising_csv)

In [None]:
df

## Plots

> ### Sales ~ TV

In [None]:
# Scatter of Sales against TV with a fitted regression line.
sns.lmplot(data=df, x='TV', y='Sales')

> ### Sales ~ Radio

In [None]:
# Scatter of Sales against Radio with a fitted regression line.
sns.lmplot(data=df, x='Radio', y='Sales')

> ### Sales ~ Newspaper

In [None]:
# Scatter of Sales against Newspaper with a fitted regression line.
sns.lmplot(data=df, x='Newspaper', y='Sales')

## Simple linear regressions

> ### Sales ~ TV

In [None]:
# Fit Sales on TV alone; the results object is kept for the residual plots below.
tv_ols = smf.ols(formula='Sales ~ TV', data=df)
model_tv = tv_ols.fit()

model_tv.summary()

> ### Sales ~ Radio

In [None]:
# Fit Sales on Radio alone; the results object is kept for the residual plots below.
radio_ols = smf.ols(formula='Sales ~ Radio', data=df)
model_radio = radio_ols.fit()

model_radio.summary()

> ### Sales ~ Newspaper

In [None]:
# Fit Sales on Newspaper alone; the results object is kept for the residual plots below.
newspaper_ols = smf.ols(formula='Sales ~ Newspaper', data=df)
model_newspaper = newspaper_ols.fit()

model_newspaper.summary()

## Residuals

> ### Sales ~ TV

In [None]:
# Q-Q plot of the Sales ~ TV residuals; line = 's' adds the standardized reference line.
sm.qqplot(model_tv.resid, line = 's')

pass  # end the cell on a statement, presumably so the returned figure is not echoed a second time

In [None]:
# Regression diagnostic plots for model_tv against its `TV` regressor.
sm.graphics.plot_regress_exog(model_tv, 'TV')

pass  # end the cell on a statement, presumably so the returned figure is not echoed a second time

> ### Sales ~ Radio

In [None]:
# Q-Q plot of the Sales ~ Radio residuals; line = 's' adds the standardized reference line.
sm.qqplot(model_radio.resid, line = 's')

pass  # end the cell on a statement, presumably so the returned figure is not echoed a second time

In [None]:
# Regression diagnostic plots for model_radio against its `Radio` regressor.
sm.graphics.plot_regress_exog(model_radio, 'Radio')

pass  # end the cell on a statement, presumably so the returned figure is not echoed a second time

> ### Sales ~ Newspaper

In [None]:
# Q-Q plot of the Sales ~ Newspaper residuals; line = 's' adds the standardized reference line.
sm.qqplot(model_newspaper.resid, line = 's')

pass  # end the cell on a statement, presumably so the returned figure is not echoed a second time

In [None]:
# Regression diagnostic plots for model_newspaper against its `Newspaper` regressor.
sm.graphics.plot_regress_exog(model_newspaper, 'Newspaper')

pass  # end the cell on a statement, presumably so the returned figure is not echoed a second time

> ### Sales ~ TV + Radio + Newspaper

In [None]:
# TODO

model.summary()

> ### Sales ~ TV + Radio

In [None]:
# TODO

model.summary()

In [None]:
# Q-Q plot of the residuals of `model`, i.e. whatever the preceding TODO cell
# fits (per its heading, Sales ~ TV + Radio); line = 's' adds the standardized reference line.
sm.qqplot(model.resid, line = 's')

pass  # end the cell on a statement, presumably so the returned figure is not echoed a second time

In [None]:
# Regression diagnostic plots for `model` against its `TV` regressor
# (`model` comes from the TODO cell above - presumably Sales ~ TV + Radio).
sm.graphics.plot_regress_exog(model, 'TV')

pass  # end the cell on a statement, presumably so the returned figure is not echoed a second time

In [None]:
# Regression diagnostic plots for `model` against its `Radio` regressor
# (`model` comes from the TODO cell above - presumably Sales ~ TV + Radio).
sm.graphics.plot_regress_exog(model, 'Radio')

pass  # end the cell on a statement, presumably so the returned figure is not echoed a second time

## Part D - Interaction Effects

### Sales ~ TV + Radio + TV * Radio

In [None]:
# In the formula language `TV * Radio` already expands to
# `TV + Radio + TV:Radio`, so the fitted interaction term is named `TV:Radio`.
interaction_ols = smf.ols(formula='Sales ~ TV + Radio + TV * Radio', data=df)
model = interaction_ols.fit()

model.summary()

In [None]:
# Q-Q plot of the interaction model's residuals; line = 's' adds the standardized reference line.
sm.qqplot(model.resid, line = 's')

pass  # end the cell on a statement, presumably so the returned figure is not echoed a second time

In [None]:
# Regression diagnostic plots for the interaction model against its `TV` regressor.
sm.graphics.plot_regress_exog(model, 'TV')

pass  # end the cell on a statement, presumably so the returned figure is not echoed a second time

In [None]:
# Regression diagnostic plots for the interaction model against its `Radio` regressor.
sm.graphics.plot_regress_exog(model, 'Radio')

pass  # end the cell on a statement, presumably so the returned figure is not echoed a second time

In [None]:
# Regression diagnostic plots against the interaction term, referenced by its
# formula name `TV:Radio`.
sm.graphics.plot_regress_exog(model, 'TV:Radio')

pass  # end the cell on a statement, presumably so the returned figure is not echoed a second time