In [1]:
#import necessary statements
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import statsmodels.api as sm
from typing import List
from scipy.stats import normaltest, stats

In [44]:
# gets data from a SQL database by joining multiple tables; allows setting the home price range and the sale year
def pull_data(dbname = 'Seattle_housing', low_limit = '200000', high_limit = '2500000', year = 2018):
    """Query the PostgreSQL housing database and return the joined sales data.

    Joins ``rpsale``, ``resbldg`` and ``parcel`` on the concatenation of their
    "Major" and "Minor" columns and keeps rows whose "SalePrice" lies strictly
    between the two limits and whose "DocumentDate" carries ``year`` in
    characters 7-10.

    Parameters
    ----------
    dbname : str
        Name of the database used to build the ``postgresql:///`` URL.
    low_limit, high_limit : int, float or numeric str
        Exclusive lower / upper bound on "SalePrice".
    year : int or numeric str
        Four-digit sale year to keep.

    Returns
    -------
    pandas.DataFrame
        One DataFrame holding every column of the three joined tables
        (the target / feature split happens later, not here).

    Raises
    ------
    ValueError, TypeError
        If a limit or the year cannot be converted to a number.
    """
    # Only numeric literals may reach the SQL text: coercing up front rejects
    # anything else (including injected SQL fragments) before the query is built.
    low = float(low_limit)
    high = float(high_limit)
    sale_year = int(year)

    engine = create_engine(f"postgresql:///{dbname}")
    # NOTE(review): SUBSTRING(..., 7, 4) presumably assumes "DocumentDate" is
    # text shaped like MM/DD/YYYY - confirm against the table definition.
    query = f"""SELECT 
               *
            FROM rpsale rp
            JOIN resbldg rd
            ON CONCAT(rp."Major", rp."Minor") = Concat(rd."Major", rd."Minor")
            JOIN parcel p
            ON CONCAT(rp."Major", rp."Minor") = Concat(p."Major", p."Minor")
            WHERE "SalePrice" > {low} AND "SalePrice" < {high}
            AND SUBSTRING("DocumentDate", 7, 4) = '{sale_year:04d}'
         """
    seattle_housing = pd.read_sql(sql = query, con = engine)
    return seattle_housing

In [3]:
# builds our base model; in our case: square feet of the house, waterfront location, presence of a porch, and nearby nuisances (noise)
def build_base(seattle_housing, target = 'SalePrice', basefeatures = ["SqFtTotLiving", "WfntLocation", "SqFtOpenPorch", "SqFtEnclosedPorch", "TrafficNoise", "PowerLines", "OtherNuisances"]):
    """Build the target and feature DataFrames used for the base model.

    Parameters
    ----------
    seattle_housing : pandas.DataFrame
        Source data containing ``target`` and every name in ``basefeatures``.
    target : str
        Column used as the dependent variable.
    basefeatures : list of str
        Columns copied into the feature frame before the recoding below.
        The default list is never mutated.

    Returns
    -------
    dep_var : pandas.DataFrame
        Single-column frame holding ``target``.
    features : pandas.DataFrame
        Selected features after recoding:

        * "SqFtOpenPorch" + "SqFtEnclosedPorch" (when both are present) are
          replaced by "Porch", 1 if their sum is positive, else 0;
        * "WfntLocation" (when present) becomes 1 if positive, else 0;
        * "TrafficNoise" / "PowerLines" / "OtherNuisances" (whichever are
          present) are replaced by "Noise", 1 if traffic noise is positive or
          either flag equals 'Y', else 0.

    Raises
    ------
    KeyError
        If ``target`` or a requested feature is missing from the input.
    """
    dep_var = seattle_housing[[target]]

    # Select all requested columns at once; .copy() keeps the caller's frame
    # untouched by the column assignments below.
    features = seattle_housing[list(basefeatures)].copy()

    if ("SqFtOpenPorch" in features.columns) and ("SqFtEnclosedPorch" in features.columns):
        porch_area = features["SqFtOpenPorch"] + features["SqFtEnclosedPorch"]
        # Vectorised indicator: assigning to rows yielded by iterrows() only
        # changes a temporary copy and never reaches the DataFrame.
        features["Porch"] = (porch_area > 0).astype(int)
        features = features.drop(["SqFtOpenPorch", "SqFtEnclosedPorch"], axis = 1)

    if "WfntLocation" in features.columns:
        features["WfntLocation"] = (features["WfntLocation"] > 0).astype(int)

    # Collect one boolean Series per nuisance column that was requested.
    noise_columns = []
    noise_flags = []
    if "TrafficNoise" in features.columns:
        noise_columns.append("TrafficNoise")
        noise_flags.append(features["TrafficNoise"] > 0)
    for flag_column in ("PowerLines", "OtherNuisances"):
        if flag_column in features.columns:
            noise_columns.append(flag_column)
            noise_flags.append(features[flag_column] == 'Y')

    if noise_flags:
        features["Noise"] = pd.concat(noise_flags, axis = 1).any(axis = 1).astype(int)
        features = features.drop(noise_columns, axis = 1)

    return dep_var, features

In [4]:
# parses df for all continuous variables
def load_con(input_df: pd.DataFrame, con_var: List[str]) -> pd.DataFrame:
    """Return the columns of ``input_df`` named in ``con_var``.

    The selection is used unchanged as the continuous-variable block of the
    model's feature matrix.
    """
    return input_df[con_var]

In [5]:
# parses df for categorical variables
def make_ohe(input_df: pd.DataFrame, cat_var: List[str]) -> pd.DataFrame:
    """One-hot encode the categorical columns named in ``cat_var``.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame containing the categorical columns.
    cat_var : list of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        Dense indicator columns, with the first level of every variable
        dropped (``drop='first'``). The result carries the same index as
        ``input_df`` so it lines up row-for-row when concatenated with other
        frames derived from the same data.
    """
    cat_df = input_df[cat_var]
    encoder = OneHotEncoder(categories = 'auto', drop = 'first').fit(cat_df)

    # ``get_feature_names`` was superseded by ``get_feature_names_out`` and is
    # absent from current scikit-learn; use whichever this install provides.
    if hasattr(encoder, "get_feature_names_out"):
        column_names = encoder.get_feature_names_out(cat_var)
    else:
        column_names = encoder.get_feature_names(cat_var)

    ohe_df = pd.DataFrame(encoder.transform(cat_df).toarray(),
                          columns = column_names,
                          index = cat_df.index)
    return ohe_df

In [6]:
# combines all dataframes together
def combine(base: pd.DataFrame, cont: pd.DataFrame, cat: pd.DataFrame) -> pd.DataFrame:
    """Place the base, continuous and categorical frames side by side.

    The frames are concatenated column-wise in that order; rows are matched
    on index by ``pd.concat``.
    """
    frames = [base, cont, cat]
    merged = pd.concat(frames, axis = 1)
    return merged

In [8]:
# creates a linear regression model with all feature values scaled
def ols(dep_var, big_df):
    """Fit an OLS regression of ``dep_var`` on standardised features.

    Parameters
    ----------
    dep_var : pandas.DataFrame or Series
        Dependent variable.
    big_df : pandas.DataFrame
        Feature matrix; every column is passed through ``StandardScaler``
        before fitting.

    Returns
    -------
    The fitted statsmodels results object (a constant term is added via
    ``sm.add_constant``).
    """
    ss = StandardScaler()
    # Keep big_df's index on the scaled frame so it still matches dep_var's
    # index; a fresh 0..n-1 index would disagree with any non-default index.
    scaled_features = pd.DataFrame(ss.fit_transform(big_df),
                                   columns = big_df.columns,
                                   index = big_df.index)
    model = sm.OLS(dep_var, sm.add_constant(scaled_features)).fit()
    return model

In [47]:
# finds features with high p-values and drops columns that are too highly correlated with another column
def drop_features(model, big_df, p_val = 0.05, cor_val = 0.80):
    """Report weak terms and drop highly correlated feature columns.

    Parameters
    ----------
    model : fitted statsmodels results
        Supplies ``pvalues`` and ``params`` for the p-value report.
    big_df : pandas.DataFrame
        Feature matrix the model was built from; it is not modified.
    p_val : float
        Terms with a p-value above this are counted in the printed report.
        They are only reported, not dropped.
    cor_val : float
        A column is dropped when its absolute correlation with any column
        to its left exceeds this value.

    Returns
    -------
    pandas.DataFrame
        Copy of ``big_df`` without the highly correlated columns.
    """
    lost_p = [name
              for name, p in zip(model.params.index, model.pvalues)
              if p > p_val]

    # Absolute pairwise correlations between features
    corr_matrix = big_df.corr().abs()

    # Keep only the strict upper triangle so every pair is inspected once.
    # Built-in ``bool`` is used because the ``np.bool`` alias no longer exists
    # in current NumPy releases.
    upper_mask = np.triu(np.ones(corr_matrix.shape), k = 1).astype(bool)
    upper = corr_matrix.where(upper_mask)

    # Columns correlated above cor_val with at least one earlier column
    to_drop = [column for column in upper.columns if (upper[column] > cor_val).any()]
    edited_df = big_df.drop(columns = to_drop)

    print(f"""Features with p-values greater than {p_val}: 
          """)
    print(len(lost_p))

    print("""
          
    Lost Features from corellation: """)
    print(len(to_drop))

    return edited_df

In [17]:
# returns the model coefficients ordered from greatest to least
def coef(model):
    """Return the model's coefficients sorted from largest to smallest.

    A blank spacer is printed first to separate this output from whatever
    was printed before it. Sorting uses the signed values, not magnitudes.
    """
    print("""
          
          """)
    return model.params.sort_values(ascending = False)


In [54]:
#builds base model
def base_model():
    """Fit the base model on the default data pull and print its summary.

    Returns the fitted model together with the target frame and the feature
    frame it was fitted on.
    """
    print("base model summary")
    housing = pull_data()
    dep_var, features = build_base(housing)
    fitted = ols(dep_var, features)
    print(fitted.summary())
    return fitted, dep_var, features

In [55]:
def model(con_var: List[str], cat_var: List[str]) -> tuple:
    """Build, prune and refit the extended model, printing its report.

    Parameters
    ----------
    con_var : list of str
        Continuous columns added to the base features unchanged.
    cat_var : list of str
        Categorical columns added after one-hot encoding.

    Returns
    -------
    tuple
        ``(result_limited, dep_var, edited_df)``: the model refitted on the
        pruned features, the target frame, and the pruned feature frame.
    """
    # gets data from sql
    seattle_housing = pull_data()

    # gets base data
    dep_var, features = build_base(seattle_housing)

    # adds categorical and continuous variables
    cont = load_con(seattle_housing, con_var)
    cat = make_ohe(seattle_housing, cat_var)
    big_df = combine(features, cont, cat)

    # first fit on every feature, then refit on the set drop_features returns
    print("updated model summary")
    result = ols(dep_var, big_df)
    edited_df = drop_features(result, big_df)
    result_limited = ols(dep_var, edited_df)

    # shows variables with highest coefficients
    print("variables with highest and lowest coefficients")
    print(coef(result_limited))
    print (result_limited.summary())
    return result_limited, dep_var, edited_df


In [48]:
def qqplot(model, save = False, filename = "qq_plot.png"):
    """Draw a Q-Q plot of the residuals of a statsmodels OLS model.

    The figure is written to ``filename`` at 300 dpi when ``save`` equals True.
    """
    figure, axis = plt.subplots(figsize = (5, 5), dpi = 150)
    sm.qqplot(model.resid, line = 's', ax = axis)
    axis.set_title('Q-Q Plot of Model')
    figure.tight_layout()
    if save == True:
        figure.savefig(filename, dpi = 300)

In [49]:
def residplot(model, dep_var, save = False, filename = "residual_plot.png"):
    """Plot the residuals of a statsmodels OLS model against ``dep_var``.

    Parameters
    ----------
    model : fitted statsmodels results
        Supplies ``resid``.
    dep_var : pandas.DataFrame or Series
        Values placed on the x axis.
    save : bool
        When equal to True the figure is written to ``filename`` at 300 dpi.
    filename : str
        Output path used when saving.

    The title reports the statistic and p-value of ``scipy.stats.normaltest``
    on the residuals.
    """
    fig, ax = plt.subplots(figsize = (5,5), dpi = 150)
    # x and y go by keyword: recent seaborn releases no longer accept them
    # positionally, while older releases accept the keyword form as well.
    # NOTE(review): sns.residplot fits its own regression of y on x and plots
    # the residuals of that fit - confirm this is the intended diagnostic.
    sns.residplot(x = dep_var, y = model.resid, ax = ax)
    # Run the normality test once and reuse both numbers in the title
    statistic, p_value = normaltest(model.resid)
    ax.set_title(f'Residual vs. Expected Value \n Normalilty Test: {round(statistic, 2)}, P-Value: {p_value}')
    ax.set_xlabel('Expected Sale Price')
    ax.set_ylabel('Residual of Sale Price')
    fig.tight_layout()
    if save == True:
        fig.savefig(filename, dpi = 300)

In [50]:
def residdistplot(model, save = False, filename = "residual_dist.png"):
    """Plot the distribution of residuals of a statsmodels OLS model.

    Parameters
    ----------
    model : fitted statsmodels results
        Supplies ``resid``.
    save : bool
        When equal to True the figure is written to ``filename`` at 300 dpi.
    filename : str
        Output path used when saving.

    The title reports the statistic and p-value of ``scipy.stats.normaltest``
    on the residuals.
    """
    fig, ax = plt.subplots(figsize = (7,5), dpi = 150)
    # ``distplot`` is deprecated and missing from newer seaborn releases.
    # Prefer ``histplot`` with a KDE overlay when it exists, and fall back to
    # ``distplot`` on installs that predate ``histplot``.
    if hasattr(sns, "histplot"):
        sns.histplot(model.resid, kde = True, stat = "density", kde_kws = dict(cut = 3), ax = ax)
    else:
        sns.distplot(model.resid, ax = ax)
    # Run the normality test once and reuse both numbers in the title
    statistic, p_value = normaltest(model.resid)
    ax.set_title(f'Distribution of Residuals \n Normalilty Test: {round(statistic, 2)}, P-Value: {p_value}')
    ax.set_xlabel('Residual of Sale Price')
    ax.set_ylabel('Frequency of Residual')
    fig.tight_layout()
    if save == True:
        fig.savefig(filename, dpi = 300)

In [51]:
def corrmatrixplot(features, save = False, filename = "correlation_matrix.png"):
    """Draw a heatmap of the pairwise correlations between feature columns.

    Cells on and above the diagonal are masked so each pair is shown once.
    The figure is written to ``filename`` at 300 dpi when ``save`` equals True.
    """
    figure, axis = plt.subplots(figsize = (5, 5), dpi = 150)
    correlations = features.corr()
    # Non-zero entries (diagonal and above) mark the cells to hide
    hidden = np.triu(np.ones(correlations.shape))
    sns.heatmap(
        correlations,
        mask = hidden,
        cmap = 'Blues',
        cbar = True,
        square = True,
        xticklabels = False,
        yticklabels = False,
        ax = axis,
    )
    axis.set_facecolor(color='white')
    axis.set_title('Features Correlation Matrix')
    figure.tight_layout()
    if save == True:
        figure.savefig(filename, dpi = 300)