In [41]:
import numpy as np
import pandas as pd

from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

In [36]:
# Location of the merged 2016 training table, relative to the notebook's
# working directory.
datapath = "./zillow-data/merged_new2016.csv"

# The first CSV column is consumed as the row index.
train_data2016 = pd.read_csv(datapath, index_col=0)

# Remove any leftover "Unnamed: N" column (header matched case-insensitively).
fix_col = train_data2016.columns.str.contains('unnamed', case=False)
train_data2016 = train_data2016.drop(columns=train_data2016.columns[fix_col])

# parcelid is dropped so it is neither the label nor one of the features.
# NOTE(review): presumably a pure property identifier -- confirm.
train_data2016 = train_data2016.drop(columns='parcelid')

# Positional convention: after the drops above, the first remaining column is
# the regression target and every other column is a predictor.
label = train_data2016.columns[0]
feature = train_data2016.columns[1:]

print("num of features:", len(feature))
print("num of instances:", train_data2016.shape[0])

# Keep only the month of transactiondate: each value is split on "-" and the
# second field is converted to int. The vectorised .str accessor replaces a
# per-row Python loop, and the values stay 1-D because a single column does
# not need an (n, 1) array.
new_transactiondate = (
    train_data2016['transactiondate']
    .str.split("-")
    .str[1]
    .astype(int)
    .to_numpy()  # plain array: assigned positionally, no index alignment
)
train_data2016['transactiondate'] = new_transactiondate

# Design matrix and target vector used by the models below.
X_train = train_data2016[feature]
y_train = train_data2016[label]

num of features: 31
num of instances: 90275


In [64]:
# Ridge regression: the penalty strength is picked by 10-fold cross-validation
# over the candidate alphas, scored by negative mean squared error.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2 -- confirm the installed version before re-running this cell.
ridge_alphas = (0.1, 1.0, 10.0)
regrRidge = linear_model.RidgeCV(
    alphas=ridge_alphas,
    normalize=True,
    cv=10,
    scoring='neg_mean_squared_error',
)
regrRidge.fit(X_train, y_train)

# Both metrics below are computed on the training rows themselves (no
# held-out set is involved).
y_pred = regrRidge.predict(X_train)
ridge_train_mse = mean_squared_error(y_train, y_pred)
ridge_train_r2 = r2_score(y_train, y_pred)

print("Ridge Regression")
print("RidgeCV alpha(lambda):{:.2f}".format(regrRidge.alpha_))
print("Training MSE: {:.6f}".format(ridge_train_mse))
print("Training R^2: {:.6f}".format(ridge_train_r2))



Ridge Regression
RidgeCV alpha(lambda):0.10
Training MSE: 0.025865
Training R^2: 0.003140


In [69]:
# LASSO: the penalty strength is picked by 10-fold cross-validation along an
# automatically generated path of 100 alphas (eps=0.001).
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2 -- confirm the installed version before re-running this cell.
regrLasso = linear_model.LassoCV(
    eps=0.001,
    n_alphas=100,
    normalize=True,
    cv=10,
    max_iter=50000,
)
regrLasso.fit(X_train, y_train)

# Both metrics below are computed on the training rows themselves.
y_pred = regrLasso.predict(X_train)
lasso_train_mse = mean_squared_error(y_train, y_pred)
lasso_train_r2 = r2_score(y_train, y_pred)

print("Lasso Regression")
print("LassoCV alpha(lambda): {:.6f}".format(regrLasso.alpha_))
print("Mean squared error: {:.6f}".format(lasso_train_mse))
print("R^2: {:.6f}".format(lasso_train_r2))


def get_reduced_features(coef, features):
    """Return the entries of `features` whose matching coefficient is non-zero.

    Parameters
    ----------
    coef : sequence of numbers
        Coefficients; position i belongs to features[i].
    features : indexable sequence
        Feature names, indexed by the same positions as `coef`.

    Returns
    -------
    list
        The selected feature names, in their original order.
    """
    return [features[pos] for pos, weight in enumerate(coef) if weight != 0]

# Features whose LASSO coefficient is non-zero, i.e. not eliminated by the
# L1 penalty.
sel_features = get_reduced_features(regrLasso.coef_, feature)

selected_count = len(sel_features)
print("\nSelected Features[{}]:".format(selected_count))
for feature_name in sel_features:
    print(feature_name)

Lasso Regression
LassoCV alpha(lambda): 0.000002
Mean squared error: 0.025873
R^2: 0.002827

Selected Features[13]:
transactiondate
calculatedfinishedsquarefeet
heatingorsystemtypeid
propertylandusetypeid
regionidzip
yearbuilt
landtaxvaluedollarcnt
censustractandblock
haspoolornot
regionidneighborhood
unitcnt
lotsizesquarefeet
finishedfloor1squarefeet
