In [1]:
# Load the nb_black IPython extension for this kernel session.
# NOTE(review): presumably this auto-formats executed cells with Black — confirm.
%load_ext nb_black

<IPython.core.display.Javascript object>

In [9]:
class Markov:
    """Fit a linear regression and expose diagnostics for its OLS assumptions.

    The same data is fit twice: once with scikit-learn's ``LinearRegression``
    (source of ``predictions``, ``errors``, ``coefs`` and ``intercepts``) and once
    with statsmodels ``OLS`` on a constant-augmented design matrix (source of
    ``lm_results``, used for the residual-based tests).

    Every diagnostic reads only instance state, so several ``Markov`` objects can
    coexist in one notebook without interfering through global variables.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. Column names are used for plot labels and VIF indices.
    y : array-like
        Target values aligned with the rows of ``X``.
    """

    def __init__(self, X, y):
        self.X = X
        self.y = y

        # scikit-learn fit: predictions, residuals and raw coefficients.
        self.model = LinearRegression()
        self.model.fit(X, y)
        self.predictions = self.model.predict(X)
        self.errors = self.y - self.predictions
        self.coefs = self.model.coef_
        self.intercepts = self.model.intercept_

        # statsmodels fit on the design matrix with an explicit intercept column.
        self.X_const = sm.add_constant(X)
        self.lm_results = sm.OLS(y, self.X_const).fit()

    @staticmethod
    def _grid_shape(n_plots, n_cols=3):
        """Return ``(rows, cols)`` of the smallest grid at most ``n_cols`` wide that holds ``n_plots``."""
        n_cols = max(1, min(n_cols, n_plots))
        n_rows = max(1, -(-n_plots // n_cols))  # ceiling division
        return n_rows, n_cols

    def plot_linearity(self):
        """Scatter each feature against the model's predictions, one subplot per feature."""
        features = list(self.X.columns)
        n_rows, n_cols = self._grid_shape(len(features))

        plt.figure(figsize=(25, 15))
        for position, col in enumerate(features, start=1):
            # Subplot dimensions must be integers and the grid must fit every feature.
            plt.subplot(n_rows, n_cols, position)
            plt.scatter(self.X[col], self.predictions)
            plt.xlabel(col)
            plt.ylabel("Predicted target")

        plt.tight_layout()
        plt.show()

    def plot_homoscedasticity(self):
        """Plot residuals against predicted values with a reference line at zero."""
        plt.scatter(self.predictions, self.errors)
        plt.xlabel("Predicted")
        plt.ylabel("Residual")
        plt.axhline(y=0)
        plt.title("Residual vs. Predicted")
        plt.show()

    def b_pagan(self):
        """Run the Breusch-Pagan heteroscedasticity test on this model's residuals.

        Returns
        -------
        tuple
            ``(lm_pvalue, f_pvalue)``: the p-values of the Lagrange-multiplier and
            F versions of the test.
        """
        # The auxiliary regression needs a design matrix that includes the constant.
        _, lm_pvalue, _, f_pvalue = het_breuschpagan(
            self.lm_results.resid, self.X_const
        )

        return lm_pvalue, f_pvalue

    def get_vifs(self):
        """Return the variance inflation factor of every column of the design matrix.

        Returns
        -------
        pandas.Series
            VIFs indexed by the column names of ``X_const`` (intercept included).
        """
        design = self.X_const
        vifs = [
            variance_inflation_factor(design.values, i)
            for i in range(design.shape[1])
        ]

        return pd.Series(vifs, index=design.columns)

    def plot_errors(self):
        """Plot the residuals in observation order."""
        plt.plot(self.errors)
        plt.show()

    def plot_errors_acf(self):
        """Plot the autocorrelation of the residuals, skipping lag 0."""
        acf_data = acf(self.errors)

        # Element 0 is the lag-0 autocorrelation (always 1), so it is left out.
        plt.plot(acf_data[1:])
        plt.show()

    def plot_error_normality(self):
        """Show a Q-Q plot and a histogram of the statsmodels residuals."""
        residuals = self.lm_results.resid

        qqplot(residuals, line="s")
        plt.show()

        plt.hist(residuals)
        plt.show()

    def shapiro_wilkes(self):
        """Return the Shapiro-Wilk test result (statistic, p-value) for the residuals."""
        return stats.shapiro(self.lm_results.resid)

<IPython.core.display.Javascript object>

In [12]:
def clean_house_prices_df(house_prices_df, n_top_corr=10):
    """Build the modelling frame for the house-price regression.

    Steps, in order:

    1. Index by ``id`` and replace ``saleprice`` with its natural log
       (``log_saleprice``).
    2. Keep only the ``n_top_corr`` numeric columns most correlated (in absolute
       value) with ``log_saleprice``; the target itself counts as one of them.
    3. Drop a fixed list of columns ahead of ``dropna()``, then drop rows with
       any remaining missing value.
    4. Keep only four object-typed columns and encode them numerically.
    5. Drop a fixed list of features from the result.

    Parameters
    ----------
    house_prices_df : pandas.DataFrame
        Raw table with at least ``id``, ``saleprice``, ``centralair``,
        ``bsmtqual`` and ``kitchenqual`` columns. It is not modified.
    n_top_corr : int, default 10
        Number of numeric columns (target included) retained in step 2.

    Returns
    -------
    pandas.DataFrame
        Cleaned frame indexed by ``id`` that includes ``log_saleprice``.
    """
    target = "log_saleprice"

    # Log transform the target on a fresh frame so the caller's data is untouched.
    log_df = (
        house_prices_df.set_index("id")
        .assign(**{target: lambda df: np.log(df["saleprice"])})
        .drop(columns=["saleprice"])
    )

    # Keep only top correlated numeric columns. Correlation is computed on the
    # numeric subset explicitly: DataFrame.corr() raises on object columns in
    # pandas >= 2.0.
    numeric_df = log_df.select_dtypes("number")
    abs_corr = numeric_df.corr()[target].abs()
    top_cols = abs_corr.sort_values(ascending=False).head(n_top_corr).index
    weak_numeric = [c for c in numeric_df.columns if c not in top_cols]
    log_df = log_df.drop(columns=weak_numeric)

    # These columns are removed before dropna() so their missing values do not
    # discard rows (presumably they are mostly empty — confirm against the data).
    sparse_cols = ["poolqc", "alley", "fence", "fireplacequ", "miscfeature"]
    log_df = log_df.drop(columns=sparse_cols, errors="ignore").dropna()

    # Of the object-typed columns, only these are kept.
    keep_cats = ["exterqual", "bsmtqual", "kitchenqual", "centralair"]
    other_cats = [
        c for c in log_df.select_dtypes("O").columns if c not in keep_cats
    ]
    log_df = log_df.drop(columns=other_cats)

    # Encode centralair as binary
    log_df["centralair"] = (log_df["centralair"] == "Y").astype(int)

    # Encode qual columns as ordinal. An element-wise map yields integer columns
    # without relying on replace()'s deprecated silent downcasting; grades missing
    # from the map are left unchanged, as replace() would leave them.
    # exterqual is not encoded because it is dropped just below.
    quality_map = {"Fa": 1, "TA": 2, "Gd": 3, "Ex": 4}
    for col in ["bsmtqual", "kitchenqual"]:
        log_df[col] = log_df[col].map(lambda grade: quality_map.get(grade, grade))

    # Features excluded from the final model; ignored if step 2 already removed them.
    excluded = ["fullbath", "garagearea", "totalbsmtsf", "exterqual"]
    return log_df.drop(columns=excluded, errors="ignore")


<IPython.core.display.Javascript object>

# Weather Data

In [17]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy import stats
from sklearn.linear_model import LinearRegression
from sqlalchemy import create_engine
from statsmodels.graphics.gofplots import qqplot
from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tsa.stattools import acf

# NOTE(review): this silences every warning, including deprecations that may
# explain a failure on newer library versions.
warnings.filterwarnings("ignore")

# Connection settings for the weather database. User, password, host and port can
# be overridden through environment variables so credentials need not live in the
# notebook; the literals below are only fallbacks.
# NOTE(review): the fallback password is still committed here — remove it once the
# environment variables are set wherever this notebook runs.
postgres_user = os.environ.get("POSTGRES_USER", "dsbc_student")
postgres_pw = os.environ.get("POSTGRES_PW", "7*.8G9QH21")
postgres_host = os.environ.get("POSTGRES_HOST", "142.93.121.174")
postgres_port = os.environ.get("POSTGRES_PORT", "5432")
postgres_db = "weatherinszeged"

<IPython.core.display.Javascript object>

In [3]:
# Build the engine from the connection settings defined in the config cell.
engine = create_engine(
    "postgresql://{}:{}@{}:{}/{}".format(
        postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db
    )
)
try:
    weather_df = pd.read_sql_query("select * from weatherinszeged", con=engine)
finally:
    # Only a single query is needed, so release the connection pool right away,
    # even when the query itself fails.
    engine.dispose()

<IPython.core.display.Javascript object>

In [4]:
# Response: difference between the apparent and the measured temperature.
y = weather_df["apparenttemperature"] - weather_df["temperature"]

# Predictors, with the intercept column prepended before fitting.
X = sm.add_constant(weather_df[["humidity", "windspeed"]])

results = sm.OLS(endog=y, exog=X).fit()
results.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.288
Model:,OLS,Adj. R-squared:,0.288
Method:,Least Squares,F-statistic:,19490.0
Date:,"Wed, 08 Apr 2020",Prob (F-statistic):,0.0
Time:,19:53:15,Log-Likelihood:,-170460.0
No. Observations:,96453,AIC:,340900.0
Df Residuals:,96450,BIC:,340900.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.4381,0.021,115.948,0.000,2.397,2.479
humidity,-3.0292,0.024,-126.479,0.000,-3.076,-2.982
windspeed,-0.1193,0.001,-176.164,0.000,-0.121,-0.118

0,1,2,3
Omnibus:,3935.747,Durbin-Watson:,0.267
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4613.311
Skew:,-0.478,Prob(JB):,0.0
Kurtosis:,3.484,Cond. No.,88.1


<IPython.core.display.Javascript object>

In [7]:
# Add the product of humidity and wind speed as an extra predictor column.
weather_df["humidity_windspeed_interaction"] = (
    weather_df["humidity"] * weather_df["windspeed"]
)

# Response: difference between the apparent and the measured temperature.
y = weather_df["apparenttemperature"] - weather_df["temperature"]

# Predictors (main effects plus the interaction), with the intercept prepended.
X = sm.add_constant(
    weather_df[["humidity", "windspeed", "humidity_windspeed_interaction"]]
)

results = sm.OLS(endog=y, exog=X).fit()
results.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.341
Model:,OLS,Adj. R-squared:,0.341
Method:,Least Squares,F-statistic:,16660.0
Date:,"Wed, 08 Apr 2020",Prob (F-statistic):,0.0
Time:,19:55:48,Log-Likelihood:,-166690.0
No. Observations:,96453,AIC:,333400.0
Df Residuals:,96449,BIC:,333400.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0839,0.033,2.511,0.012,0.018,0.149
humidity,0.1775,0.043,4.133,0.000,0.093,0.262
windspeed,0.0905,0.002,36.797,0.000,0.086,0.095
humidity_windspeed_interaction,-0.2971,0.003,-88.470,0.000,-0.304,-0.291

0,1,2,3
Omnibus:,4849.937,Durbin-Watson:,0.265
Prob(Omnibus):,0.0,Jarque-Bera (JB):,9295.404
Skew:,-0.378,Prob(JB):,0.0
Kurtosis:,4.32,Cond. No.,193.0


<IPython.core.display.Javascript object>

In [8]:
# Response: difference between the apparent and the measured temperature.
y = weather_df["apparenttemperature"] - weather_df["temperature"]

# Same model as the base fit with visibility added; intercept column prepended.
X = sm.add_constant(weather_df[["humidity", "windspeed", "visibility"]])

results = sm.OLS(endog=y, exog=X).fit()
results.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.304
Model:,OLS,Adj. R-squared:,0.303
Method:,Least Squares,F-statistic:,14010.0
Date:,"Wed, 08 Apr 2020",Prob (F-statistic):,0.0
Time:,19:56:04,Log-Likelihood:,-169380.0
No. Observations:,96453,AIC:,338800.0
Df Residuals:,96449,BIC:,338800.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.5756,0.028,56.605,0.000,1.521,1.630
humidity,-2.6066,0.025,-102.784,0.000,-2.656,-2.557
windspeed,-0.1199,0.001,-179.014,0.000,-0.121,-0.119
visibility,0.0540,0.001,46.614,0.000,0.052,0.056

0,1,2,3
Omnibus:,3833.895,Durbin-Watson:,0.282
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4584.022
Skew:,-0.459,Prob(JB):,0.0
Kurtosis:,3.545,Cond. No.,131.0


<IPython.core.display.Javascript object>

# Houses Data

In [14]:
# Connection settings for the house-prices database. User, password, host and port
# can be overridden through environment variables so credentials need not live in
# the notebook; the literals below are only fallbacks.
# NOTE(review): the fallback password is still committed here — remove it once the
# environment variables are set wherever this notebook runs.
postgres_user = os.environ.get("POSTGRES_USER", "dsbc_student")
postgres_pw = os.environ.get("POSTGRES_PW", "7*.8G9QH21")
postgres_host = os.environ.get("POSTGRES_HOST", "142.93.121.174")
postgres_port = os.environ.get("POSTGRES_PORT", "5432")
postgres_db = "houseprices"

<IPython.core.display.Javascript object>

In [15]:
# Build the engine from the connection settings defined in the preceding cell.
engine = create_engine(
    "postgresql://{}:{}@{}:{}/{}".format(
        postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db
    )
)
try:
    house_prices_df = pd.read_sql_query("select * from houseprices", con=engine)
finally:
    # Only a single query is needed, so release the connection pool right away,
    # even when the query itself fails.
    engine.dispose()

<IPython.core.display.Javascript object>

In [18]:
# Clean the raw table, then split it into the target and every remaining feature.
houses = clean_house_prices_df(house_prices_df)
y = houses["log_saleprice"]
X = houses.drop(columns="log_saleprice")

# Fit both regressions and keep the diagnostics object for later cells.
house_markov = Markov(X, y)

<IPython.core.display.Javascript object>

In [19]:
# Pull the statsmodels OLS fit out of the diagnostics object and display its
# summary table as the cell output.
lm_results = house_markov.lm_results
lm_results.summary()

0,1,2,3
Dep. Variable:,log_saleprice,R-squared:,0.819
Model:,OLS,Adj. R-squared:,0.818
Method:,Least Squares,F-statistic:,666.6
Date:,"Wed, 08 Apr 2020",Prob (F-statistic):,0.0
Time:,20:01:25,Log-Likelihood:,550.32
No. Observations:,1338,AIC:,-1081.0
Df Residuals:,1328,BIC:,-1029.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,6.2782,0.674,9.311,0.000,4.955,7.601
overallqual,0.0806,0.006,13.883,0.000,0.069,0.092
yearbuilt,0.0009,0.000,3.643,0.000,0.000,0.001
yearremodadd,0.0012,0.000,3.824,0.000,0.001,0.002
bsmtqual,0.0474,0.011,4.365,0.000,0.026,0.069
centralair,0.1884,0.023,8.194,0.000,0.143,0.233
firstflrsf,0.0001,1.44e-05,8.980,0.000,0.000,0.000
grlivarea,0.0002,1.2e-05,18.027,0.000,0.000,0.000
kitchenqual,0.0502,0.010,4.933,0.000,0.030,0.070

0,1,2,3
Omnibus:,841.968,Durbin-Watson:,2.029
Prob(Omnibus):,0.0,Jarque-Bera (JB):,29202.077
Skew:,-2.367,Prob(JB):,0.0
Kurtosis:,25.392,Cond. No.,524000.0


<IPython.core.display.Javascript object>

In [26]:
# Reduced model: same target, but without the two quality grades and the two
# year columns.
y = houses["log_saleprice"]
X = houses.drop(
    columns=["log_saleprice", "bsmtqual", "kitchenqual", "yearremodadd", "yearbuilt"]
)

# Refit and display the statsmodels summary for the smaller feature set.
house_markov = Markov(X, y)
lm_results = house_markov.lm_results
lm_results.summary()

0,1,2,3
Dep. Variable:,log_saleprice,R-squared:,0.793
Model:,OLS,Adj. R-squared:,0.792
Method:,Least Squares,F-statistic:,1021.0
Date:,"Wed, 08 Apr 2020",Prob (F-statistic):,0.0
Time:,20:06:34,Log-Likelihood:,461.74
No. Observations:,1338,AIC:,-911.5
Df Residuals:,1332,BIC:,-880.3
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,10.3702,0.029,359.447,0.000,10.314,10.427
overallqual,0.1262,0.005,25.209,0.000,0.116,0.136
centralair,0.2546,0.023,11.034,0.000,0.209,0.300
firstflrsf,0.0001,1.52e-05,8.799,0.000,0.000,0.000
grlivarea,0.0002,1.23e-05,15.625,0.000,0.000,0.000
garagecars,0.1135,0.010,11.938,0.000,0.095,0.132

0,1,2,3
Omnibus:,671.487,Durbin-Watson:,2.057
Prob(Omnibus):,0.0,Jarque-Bera (JB):,12583.29
Skew:,-1.881,Prob(JB):,0.0
Kurtosis:,17.545,Cond. No.,14400.0


<IPython.core.display.Javascript object>