### Processing the data 

In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from pandas.plotting import scatter_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVR
from sklearn.metrics import r2_score

In [13]:
# Remote location of the house-price CSV used throughout this notebook
dataset_link = (
    "https://raw.githubusercontent.com/rashida048/Datasets/master/home_data.csv"
)

# Read the CSV straight from the URL and preview the first five rows
df = pd.read_csv(filepath_or_buffer=dataset_link)
df.head(5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [16]:
# Drop the id and date columns; drop() returns a new frame, so df is left intact
df_ = df.drop(columns=["id", "date"])

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
train_set, test_set = train_test_split(df_, test_size=0.2, random_state=42)

# Features: every training column except the target, "price".
# drop() returns a new DataFrame, so train_set itself is not modified.
housing = train_set.drop(columns="price")

# Labels: the "price" column of the training set, copied into its own Series
housing_labels = train_set["price"].copy()

In [17]:
# Keep all preprocessing in a single Pipeline so further steps can be appended later.
# For now the only step is a StandardScaler, which puts every feature on a comparable scale.
pipeline = Pipeline(steps=[("std_scaler", StandardScaler())])

# Fit the pipeline on the training features and transform them in one pass;
# the result is what the models below are trained on.
housing_tr = pipeline.fit_transform(housing)

### Train model

#### Linear Regression

In [18]:
# Baseline model: ordinary multivariate linear regression on the scaled training features.
# fit() returns the estimator itself, so construction and fitting can be chained.
lin_reg = LinearRegression().fit(housing_tr, housing_labels)
lin_reg

LinearRegression()

In [19]:
# Take the first five training rows (features and matching labels) as a small sample
some_data = housing.head(5)
some_labels = housing_labels.head(5)

# Run the sample through the already-fitted pipeline (transform only, no refit)
some_data_prepared = pipeline.transform(some_data)

In [20]:
# Predict on the same scaled training data the model was fitted on,
# so the figure below is a training error, not a held-out estimate.
housing_predictions = lin_reg.predict(housing_tr)

# Root-mean-squared error between the true prices and the predictions
lin_mse = mean_squared_error(y_true=housing_labels, y_pred=housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

198272.24322458316

In [22]:
# R² of the predictions currently held in housing_predictions against the training labels
r2_score(
    y_true=housing_labels,
    y_pred=housing_predictions,
    multioutput='variance_weighted',
)

0.699102184301204

#### Random Forest Regressor

In [23]:
# Random forest with tree depth capped at 10 and a fixed seed.
# fit() returns the estimator, so it is chained onto the constructor.
rfregr = RandomForestRegressor(random_state=0, max_depth=10).fit(
    housing_tr, housing_labels
)

# Predictions are made on the training features themselves, so this RMSE is a training error
housing_predictions = rfregr.predict(housing_tr)
rf_mse = mean_squared_error(y_true=housing_labels, y_pred=housing_predictions)
rf_rmse = np.sqrt(rf_mse)
rf_rmse

93851.30284876762

In [24]:
# R² of the predictions currently held in housing_predictions against the training labels
r2_score(
    y_true=housing_labels,
    y_pred=housing_predictions,
    multioutput='variance_weighted',
)

0.9325820090327684

#### Bayesian Regression

In [30]:
# Bayesian linear regression using scikit-learn's BayesianRidge, which is already
# imported at the top of the notebook. This replaces a PyMC3 GLM cell that could not
# run: pymc3 was not installed, `formula` was never defined, and `housing_tr` is a
# scaled NumPy array with no target column to feed a formula-based model.
bayes_reg = BayesianRidge()
bayes_reg.fit(housing_tr, housing_labels)

# Predict on the same scaled training features used for fitting, so the RMSE below
# is a training error and is comparable with the other models' training scores.
housing_predictions = bayes_reg.predict(housing_tr)
bayes_mse = mean_squared_error(housing_labels, housing_predictions)
bayes_rmse = np.sqrt(bayes_mse)
bayes_rmse

In [27]:
# R² of the predictions currently held in housing_predictions against the training labels
r2_score(
    y_true=housing_labels,
    y_pred=housing_predictions,
    multioutput='variance_weighted',
)

0.6991018399988076