In [3]:
import pdb
import pandas

import numpy as np
import scipy

from sklearn.cluster import KMeans
from sklearn import mixture
from matplotlib.patches import Ellipse
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import neural_network

from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle

import matplotlib.pyplot as plt


# Script Configuration
FILEPATHS = [
    'palo_alto.csv',
    'campbell.csv',
    'gilroy.csv',
    'sunnyvale.csv'
    ]
TRAIN_PERC = 0.75  # percentage of data to use for training


# Combine all the data into one dataframe
dataframes = []
for f in FILEPATHS:
    dataframes.append(pandas.read_csv(f))
df = pandas.concat(dataframes).reset_index(drop=True)

# Dropping unused columns
df = df.drop('street', axis=1)
df = df.drop('last_sold_date', axis=1)
df = df.drop('last_sold_price', axis=1)

#
# For Clustering, we use only 5 features. Filter the rest.
# Bath rooms, Bed rooms, living space, lot size, zillow price
#
featureCols =['bathrooms','bedrooms','finished_sqft' ,'lot_size_sqft' ,'zestimate_valuation_ammount']
XforClustering = df[featureCols].as_matrix()

label_encoders = {}

# Encode string columns to numerical represprentation
for col_name in df.keys():
    if df[col_name].dtype == 'O':
        label_encoders[col_name] = preprocessing.LabelEncoder()
        label_encoders[col_name].fit(df[col_name])
        df[col_name] = label_encoders[col_name].transform(df[col_name])

# Get rid of any rows with NaN or None data
df = df.dropna()

# Shuffle the dataset so we don't train on sorted data
df = shuffle(df)

# Convert to feature and target vectors
train_count = round(len(df) * TRAIN_PERC)
test_count = len(df) - train_count
X = df.drop('zestimate_valuation_ammount', axis=1)
T = df.drop(X.keys(), axis=1)

X_train = X.loc[:train_count-1]
T_train = T.loc[:train_count-1]

X_test = X.loc[train_count:]
T_test = T.loc[train_count:]

# Train the models
models = [
    ('Lasso', linear_model.Lasso(max_iter=100000000)),
    ('LassoLars', linear_model.LassoLars(max_iter=100000000, alpha=.1)),
	('BayesianRidge', linear_model.BayesianRidge()),
]
fList = []
GT = T_train.values.flatten()
# Try out all of our models
for (model_name, model) in models:
    fList.append(model.fit(X_train, T_train.values.flatten()))
    T_predict = model.predict(X_test)
    perc_var = np.median(np.abs(np.divide(T_predict, T_test.values.flatten()) - 1))
    print('%s: Avg. Error: %.2f%%' % (model_name, (perc_var * 100)))

#
# Get some plots to see
#


Lasso: Avg. Error: 20.38%
LassoLars: Avg. Error: 20.38%
BayesianRidge: Avg. Error: 20.37%


In [None]:
lw = 2
plt.figure(figsize=(6, 5))
plt.title("Weights of the model")
plt.plot(fList[0].coef_, color='lightgreen', linewidth=lw,
         label="Lasso estimate")
#plt.plot(GT, color='gold', linewidth=lw, label="Ground truth")
plt.plot(fList[1].coef_, color='navy', linestyle='--', label="Lassolars estimate")
plt.plot(fList[2].coef_, color='red', linestyle='--', label="BayesianRidge estimate")
plt.xlabel("Features")
plt.ylabel("Values of the weights")
plt.legend(loc="best", prop=dict(size=12))
plt.show()

plt.plot(fList[0].predict(X_test), color='navy', linestyle=':', label="Lasso estimate")
plt.plot(T_test.values.flatten(), color='yellow', linestyle='-', label="Ground Truth")
plt.xlabel("Number of houses")
plt.ylabel("\$\$ Value * 10e-6")
plt.legend(loc="best", prop=dict(size=12))
plt.show()

#
# Use Gaussian mixture
#
g = mixture.GaussianMixture(n_components=5, n_init=1, reg_covar=1e-06, \
                            tol=0.001, max_iter=100, init_params='kmeans', \
                            warm_start=False, covariance_type='full')
gf = g.fit(XforClustering)
#
# These are measures to view. Not used in further computation
#
weights = np.round(gf.weights_,5)
means   = np.round(gf.means_, 5)
covars  = np.round(gf.covariances_, 5)
print "Means: ", means
#
# Prediction
#
predicts = g.predict(XforClustering)
plt.xlabel('Height in cm')
plt.ylabel('Handspan in cm')
#plt.text(148,12, clabel)
#plt.text(183,22, mlabel)
#plt.text(160,18, flabel)
plt.show()