In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso

def PolynomialRegressionModel (X, Y, degree, showTraining=False, regularization="None", alpha=0.5):
    
    trainX, testX, trainY, testY = train_test_split(X, Y, train_size = 0.8, random_state = 1)
    
    poly_feature = PolynomialFeatures(degree)
    polyRegModel = None
    
    if regularization == "L1":
        polyRegModel = Lasso(alpha=alpha)
    elif regularization == "L2":
        polyRegModel = Ridge(alpha=alpha)
    else:
        polyRegModel = LinearRegression()
    
    X_train_poly = poly_feature.fit_transform(trainX)
    X_test_poly = poly_feature.fit_transform(testX)
    
    polyRegModel.fit(X_train_poly, trainY)
    
    print("degree = {}".format(degree))
    
    if showTraining:
        train_pred = polyRegModel.predict(X_train_poly)    
        print("Training MSE =", mean_squared_error(trainY, train_pred))
        print("R-Squared Score:", r2_score(trainY, train_pred))
        
    test_pred = polyRegModel.predict(X_test_poly)
    print("Testing MSE =", mean_squared_error(testY, test_pred))
    print("R-Squared Score:", r2_score(testY, test_pred))

In [8]:
df = pd.read_csv('../austinHousingData.csv')
df_filtered = df[['livingAreaSqFt','numOfBathrooms', 'avgSchoolRating', 'numOfBedrooms', 'numOfHighSchools', 'MedianStudentsPerTeacher', 'latestPrice']]
dropRows = [705,1399,14275,2316, 2838, 14639, 14376, 14654, 2557]
X = df_filtered.drop(columns = "latestPrice")
Y = df_filtered["latestPrice"]
X = X.drop(dropRows, axis=0)
Y = Y.drop(dropRows, axis=0)


In [9]:
for i in range(2,8):
    PolynomialRegressionModel(X, Y, i, True)

degree = 2
Training MSE = 76975856800.7875
R-Squared Score: 0.5342501350804754
Testing MSE = 84909616528.02756
R-Squared Score: 0.48404638152005597
degree = 3
Training MSE = 69162849652.47954
R-Squared Score: 0.5815234903268763
Testing MSE = 129183973340.98683
R-Squared Score: 0.21501307837260686
degree = 4
Training MSE = 74334863191.1964
R-Squared Score: 0.550229722291875
Testing MSE = 2738442796778.8315
R-Squared Score: -15.64015841517778
degree = 5
Training MSE = 66360853091.43587
R-Squared Score: 0.5984772414645669
Testing MSE = 240804929819.91977
R-Squared Score: -0.4632521022796656
degree = 6
Training MSE = 65861550682.33807
R-Squared Score: 0.6014983189719356
Testing MSE = 12488579935126.58
R-Squared Score: -74.88690504894295
degree = 7
Training MSE = 71825192959.34076
R-Squared Score: 0.5654147247076889
Testing MSE = 36799361917350.03
R-Squared Score: -222.61146729172327


### If do log10 transform:

In [11]:
Y_tr = np.log10(Y)
for i in range(1,8):
    PolynomialRegressionModel(X, Y_tr, i, True)

degree = 1
Training MSE = 0.03184134268510467
R-Squared Score: 0.43163720235048897
Testing MSE = 0.03137194214737487
R-Squared Score: 0.43453864642343076
degree = 2
Training MSE = 0.028741225909728593
R-Squared Score: 0.48697378350280274
Testing MSE = 0.029383387687235252
R-Squared Score: 0.4703812057208112
degree = 3
Training MSE = 0.027423291611199517
R-Squared Score: 0.5104986967716343
Testing MSE = 0.03254434613348047
R-Squared Score: 0.4134067336522187
degree = 4
Training MSE = 0.02720163341647349
R-Squared Score: 0.5144552595624163
Testing MSE = 0.07838891903191217
R-Squared Score: -0.4129155298374865
degree = 5
Training MSE = 0.026911499561640993
R-Squared Score: 0.5196341017694247
Testing MSE = 0.03807655886860951
R-Squared Score: 0.3136917562758065
degree = 6
Training MSE = 0.027987214951528606
R-Squared Score: 0.5004327566968386
Testing MSE = 0.8148747787498858
R-Squared Score: -13.687652846697436
degree = 7
Training MSE = 0.03281953675539852
R-Squared Score: 0.41417659700056

In [14]:
Y_tr = np.log10(Y)
for i in range(1,8):
    PolynomialRegressionModel(X, Y_tr, i, True, "L1", alpha=1.0)

degree = 1
Training MSE = 0.03464720234724435
R-Squared Score: 0.38155306289830304
Testing MSE = 0.03362468716849623
R-Squared Score: 0.3939342030350653
degree = 2
Training MSE = 0.032067894615947716
R-Squared Score: 0.4275932871644912
Testing MSE = 0.03204477520714069
R-Squared Score: 0.4224112144997404


  model = cd_fast.enet_coordinate_descent(


degree = 3
Training MSE = 0.029423026467402406
R-Squared Score: 0.47480375423516885
Testing MSE = 0.029925504577766274
R-Squared Score: 0.4606098581492666


  model = cd_fast.enet_coordinate_descent(


degree = 4
Training MSE = 0.02837718089676018
R-Squared Score: 0.4934719278834375
Testing MSE = 0.028772613911745848
R-Squared Score: 0.48139005446198513


  model = cd_fast.enet_coordinate_descent(


degree = 5
Training MSE = 0.02791100435336626
R-Squared Score: 0.5017931035016345
Testing MSE = 0.029240800982526308
R-Squared Score: 0.4729512497004916


  model = cd_fast.enet_coordinate_descent(


degree = 6
Training MSE = 0.027511005508909864
R-Squared Score: 0.5089330179374087
Testing MSE = 0.028536978726928695
R-Squared Score: 0.48563724419384946
degree = 7
Training MSE = 0.027207785351941034
R-Squared Score: 0.5143454485130561
Testing MSE = 0.02818486844597668
R-Squared Score: 0.49198383106246046


  model = cd_fast.enet_coordinate_descent(


In [15]:
Y_tr = np.log10(Y)
for i in range(1,8):
    PolynomialRegressionModel(X, Y_tr, i, True, "L2", alpha=1.0)

degree = 1
Training MSE = 0.03184134317094848
R-Squared Score: 0.4316371936782558
Testing MSE = 0.03137192191721278
R-Squared Score: 0.43453901106055726
degree = 2
Training MSE = 0.028745250701636697
R-Squared Score: 0.4869019416206476
Testing MSE = 0.029379324414003618
R-Squared Score: 0.47045444390194513
degree = 3
Training MSE = 0.027122086847441752
R-Squared Score: 0.5158751529056531
Testing MSE = 0.031872297266820206
R-Squared Score: 0.4255200309427122
degree = 4
Training MSE = 0.025652862438125836
R-Squared Score: 0.5421005700908518
Testing MSE = 0.07177972089127943
R-Squared Score: -0.2937885051508542


  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,


degree = 5
Training MSE = 0.05459967179630486
R-Squared Score: 0.025404722414238856
Testing MSE = 1.3343441585158773
R-Squared Score: -23.050791961393205
degree = 6
Training MSE = 0.029868562490919966
R-Squared Score: 0.4668510085458818
Testing MSE = 0.11461119906594149
R-Squared Score: -1.0658014557853535
degree = 7
Training MSE = 0.03456851732486454
R-Squared Score: 0.38295757777367057
Testing MSE = 2.7628109795774773
R-Squared Score: -48.79809120038218


In [21]:
from scipy import stats

df = pd.read_csv('../austinHousingData.csv')
df_filtered = df[['livingAreaSqFt', 'numOfBathrooms', 'avgSchoolRating', 'numOfBedrooms', 'numOfHighSchools', 'MedianStudentsPerTeacher', 'latestPrice']]
z = np.abs(stats.zscore(df_filtered))
threshold = 3
df_filtered_o = df_filtered[(z < threshold).all(axis=1)]
df_p = df_filtered_o.sample(len(df_filtered_o), random_state=25)
X_col = ['livingAreaSqFt', 'numOfBathrooms', 'avgSchoolRating', 'numOfBedrooms', 'numOfHighSchools', 'MedianStudentsPerTeacher']
X = df_p[X_col]
Y = df_p["latestPrice"]

In [23]:
# no outlier, not transform
for i in range(1,8):
    PolynomialRegressionModel(X, Y, i, True)

degree = 1
Training MSE = 40097307192.25421
R-Squared Score: 0.34825684541559865
Testing MSE = 41967968283.03878
R-Squared Score: 0.343678599356966
degree = 2
Training MSE = 36477944795.19383
R-Squared Score: 0.40708609933367645
Testing MSE = 37187447681.978424
R-Squared Score: 0.41843937775659246
degree = 3
Training MSE = 34564474406.561646
R-Squared Score: 0.43818771973206283
Testing MSE = 35533705748.29866
R-Squared Score: 0.4443016309611031
degree = 4
Training MSE = 33999294344.294983
R-Squared Score: 0.44737417793794065
Testing MSE = 35607304343.97677
R-Squared Score: 0.4431506499778197
degree = 5
Training MSE = 33103829606.338383
R-Squared Score: 0.46192909581151964
Testing MSE = 34823099404.16197
R-Squared Score: 0.45541453849916247
degree = 6
Training MSE = 37583050661.63661
R-Squared Score: 0.38912366659245046
Testing MSE = 40292211170.3062
R-Squared Score: 0.3698851397343489
degree = 7
Training MSE = 33423379638.231064
R-Squared Score: 0.4567351174519575
Testing MSE = 3575655

In [24]:
# no outlier, yes transform
Y_tr = np.log10(Y)
for i in range(1,8):
    PolynomialRegressionModel(X, Y, i, True)

degree = 1
Training MSE = 40097307192.25421
R-Squared Score: 0.34825684541559865
Testing MSE = 41967968283.03878
R-Squared Score: 0.343678599356966
degree = 2
Training MSE = 36477944795.19383
R-Squared Score: 0.40708609933367645
Testing MSE = 37187447681.978424
R-Squared Score: 0.41843937775659246
degree = 3
Training MSE = 34564474406.561646
R-Squared Score: 0.43818771973206283
Testing MSE = 35533705748.29866
R-Squared Score: 0.4443016309611031
degree = 4
Training MSE = 33999294344.294983
R-Squared Score: 0.44737417793794065
Testing MSE = 35607304343.97677
R-Squared Score: 0.4431506499778197
degree = 5
Training MSE = 33103829606.338383
R-Squared Score: 0.46192909581151964
Testing MSE = 34823099404.16197
R-Squared Score: 0.45541453849916247
degree = 6
Training MSE = 37583050661.63661
R-Squared Score: 0.38912366659245046
Testing MSE = 40292211170.3062
R-Squared Score: 0.3698851397343489
degree = 7
Training MSE = 33423379638.231064
R-Squared Score: 0.4567351174519575
Testing MSE = 3575655

### It seem the max R-Score we can get is ~0.48, which indicates our model does not have good fit for this data.