In [1]:
# Ignore SQLITE warnings related to Decimal numbers in the housing database.
import warnings
warnings.filterwarnings('ignore')

In [17]:
# Import Dependencies.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, inspect, func

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error

In [3]:
# Create an engine for the housing.sqlite database.
engine = create_engine("sqlite:///../Resources/housing.sqlite")

In [4]:
# Reflect Database into ORM classes.
Base = automap_base()
Base.prepare(engine, reflect=True)

In [5]:
# Create a database session object.
session = Session(engine)

In [6]:
# Get the table names of the database.
inspector = inspect(engine)
inspector.get_table_names()

['listings']

In [13]:
# Get the names and types of the columns for the table "listings".
columns = inspector.get_columns("listings")
for c in columns:
    print(c["name"], c["type"])

id INTEGER
address VARCHAR(255)
price INTEGER
home_type VARCHAR(255)
bedrooms INTEGER
bathrooms FLOAT
square_feet INTEGER
built INTEGER
neighborhood VARCHAR(255)
county VARCHAR(255)
city VARCHAR(255)
zipcode INTEGER
high_school VARCHAR(255)
middle_school VARCHAR(255)
elementary_school VARCHAR(255)


In [7]:
# Save a reference to the invoices table as "Listings".
Listings = Base.classes.listings

In [23]:
# Create a dataframe to use for our model.
# Query the database for the fields we will use for linear models.
price_return = session.query(Listings.price).all()
price = list(np.ravel(price_return))
bedrooms_return = session.query(Listings.bedrooms).all()
bedrooms = list(np.ravel(bedrooms_return))
bathrooms_return = session.query(Listings.bathrooms).all()
bathrooms = list(np.ravel(bathrooms_return))
square_feet_return = session.query(Listings.square_feet).all()
square_feet = list(np.ravel(square_feet_return))
built_return = session.query(Listings.built).all()
built = list(np.ravel(built_return))

# Construct a dataframe from the returned data.
data_df = pd.DataFrame({"price":price,
                        "bedrooms":bedrooms,
                        "bathrooms":bathrooms,
                        "square_feet":square_feet,
                        "built":built
                       }
                      )

data_df.head()

Unnamed: 0,price,bedrooms,bathrooms,square_feet,built
0,65000,1,1.0,800,1964
1,72000,2,2.0,1152,1988
2,79950,3,2.0,1344,1997
3,79950,3,2.0,1404,1990
4,93900,3,2.0,1297,1997


In [28]:
# Assign X (input) and y (target).

X = data_df[["bedrooms", "bathrooms", "square_feet", "built"]]
y = data_df["price"].values.reshape(-1, 1)

In [20]:
# Split the data into training and testing

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [21]:
# Create a StandardScater model and fit it to the training data

X_scaler = StandardScaler().fit(X_train)
y_scaler = StandardScaler().fit(y_train)

ValueError: could not convert string to float: '0.13ACRES'

In [None]:
# Transform the training and testing data using the X_scaler and y_scaler models.

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
y_train_scaled = y_scaler.transform(y_train)
y_test_scaled = y_scaler.transform(y_test)

In [None]:
# Create a LinearRegression model and fit it to the scaled training data

model = LinearRegression()
model.fit(X_train_scaled, y_train_scaled)

In [None]:
# Make predictions using a fitted model.
# Plot the difference between the model predicted values and actual y values, versus the model predicted values.

plt.scatter(model.predict(X_train_scaled), model.predict(X_train_scaled) - y_train_scaled, c="blue", label="Training Data")
plt.scatter(model.predict(X_test_scaled), model.predict(X_test_scaled) - y_test_scaled, c="orange", label="Testing Data")
plt.legend()
plt.hlines(y=0, xmin=y_test_scaled.min(), xmax=y_test_scaled.max())
plt.title("LinReg Residual Plot")
plt.show()

In [None]:
# Used X_test_scaled, y_test_scaled, and model.predict(X_test_scaled) to calculate MSE and R2.

predictions = model.predict(X_test_scaled)
MSE = mean_squared_error(y_test_scaled, predictions)
r2 = model.score(X_test_scaled, y_test_scaled)

print(f"MSE: {MSE}, R2: {r2}")

In [None]:
# LASSO model, we'll use gridsearch later to try out different alpha values later.

lasso = Lasso(alpha=0.1)
lasso.fit(X_train_scaled, y_train_scaled)

In [None]:
# Make predictions using a fitted lasso model.
# Plot the difference between the model predicted values and actual y values, versus the model predicted values.

plt.scatter(lasso.predict(X_train_scaled), lasso.predict(X_train_scaled) - y_train_scaled, c="blue", label="Training Data")
plt.scatter(lasso.predict(X_test_scaled), lasso.predict(X_test_scaled) - y_test_scaled, c="orange", label="Testing Data")
plt.legend()
plt.hlines(y=0, xmin=y_test_scaled.min(), xmax=y_test_scaled.max())
plt.title("LASSO Residual Plot")
plt.show()

In [None]:
# Used X_test_scaled, y_test_scaled, and lasso.predict(X_test_scaled) to calculate MSE and R2.

lasso_predictions = lasso.predict(X_test_scaled)
lasso_MSE = mean_squared_error(y_test_scaled, lasso_predictions)
lasso_r2 = lasso.score(X_test_scaled, y_test_scaled)

print(f"MSE: {lasso_MSE}, R2: {lasso_r2}")

In [None]:
# Ridge model, we'll use gridsearch later to try out different alpha values later.

ridge = Ridge(alpha=0.1)
ridge.fit(X_train_scaled, y_train_scaled)

In [None]:
# Make predictions using a fitted ridge model.
# Plot the difference between the model predicted values and actual y values, versus the model predicted values.

plt.scatter(ridge.predict(X_train_scaled), ridge.predict(X_train_scaled) - y_train_scaled, c="blue", label="Training Data")
plt.scatter(ridge.predict(X_test_scaled), ridge.predict(X_test_scaled) - y_test_scaled, c="orange", label="Testing Data")
plt.legend()
plt.hlines(y=0, xmin=y_test_scaled.min(), xmax=y_test_scaled.max())
plt.title("Ridge Residual Plot")
plt.show()

In [None]:
# Used X_test_scaled, y_test_scaled, and ridge.predict(X_test_scaled) to calculate MSE and R2.

ridge_predictions = ridge.predict(X_test_scaled)
ridge_MSE = mean_squared_error(y_test_scaled, ridge_predictions)
ridge_r2 = ridge.score(X_test_scaled, y_test_scaled)

print(f"MSE: {ridge_MSE}, R2: {ridge_r2}")

In [None]:
# ElasticNet model, we'll use gridsearch later to try out different alpha values later.

elast = ElasticNet(alpha=0.1)
elast.fit(X_train_scaled, y_train_scaled)

In [None]:
# Make predictions using a fitted elasticnet model.
# Plot the difference between the model predicted values and actual y values, versus the model predicted values.

plt.scatter(elast.predict(X_train_scaled), elast.predict(X_train_scaled) - y_train_scaled, c="blue", label="Training Data")
plt.scatter(elast.predict(X_test_scaled), elast.predict(X_test_scaled) - y_test_scaled, c="orange", label="Testing Data")
plt.legend()
plt.hlines(y=0, xmin=y_test_scaled.min(), xmax=y_test_scaled.max())
plt.title("ElasticNet Residual Plot")
plt.show()

In [None]:
# Used X_test_scaled, y_test_scaled, and elast.predict(X_test_scaled) to calculate MSE and R2.

elast_predictions = elast.predict(X_test_scaled)
elast_MSE = mean_squared_error(y_test_scaled, elast_predictions)
elast_r2 = elast.score(X_test_scaled, y_test_scaled)

print(f"MSE: {elast_MSE}, R2: {elast_r2}")