In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso

def PolynomialRegressionModel (X, Y, degree, showTraining=False, regularization="None", alpha=0.5):
    
    trainX, testX, trainY, testY = train_test_split(X, Y, train_size = 0.8, random_state = 1)
    
    poly_feature = PolynomialFeatures(degree)
    polyRegModel = None
    
    if regularization == "L1":
        polyRegModel = Lasso(alpha=alpha)
    elif regularization == "L2":
        polyRegModel = Ridge(alpha=alpha)
    else:
        polyRegModel = LinearRegression()
    
    X_train_poly = poly_feature.fit_transform(trainX)
    X_test_poly = poly_feature.fit_transform(testX)
    
    polyRegModel.fit(X_train_poly, trainY)
    
    print("degree = {}".format(degree))
    
    if showTraining:
        train_pred = polyRegModel.predict(X_train_poly)    
        print("Training MSE =", mean_squared_error(trainY, train_pred))
        print("Testing RMSE =", mean_squared_error(trainY, train_pred, squared=False))
        print("R-Squared Score:", r2_score(trainY, train_pred))
        
    test_pred = polyRegModel.predict(X_test_poly)
    print("Testing MSE =", mean_squared_error(testY, test_pred))
    print("Testing RMSE =", mean_squared_error(testY, test_pred, squared=False))
    print("R-Squared Score:", r2_score(testY, test_pred))

In [17]:
df = pd.read_csv('../austinHousingData.csv')
df_filtered = df[['livingAreaSqFt','numOfBathrooms', 'avgSchoolRating', 'numOfBedrooms', 'numOfHighSchools', 'MedianStudentsPerTeacher', 'latestPrice']]
dropRows = [705,1399,14275,2316, 2838, 14639, 14376, 14654, 2557]
X = df_filtered.drop(columns = "latestPrice")
Y = df_filtered["latestPrice"]
X = X.drop(dropRows, axis=0)
Y = Y.drop(dropRows, axis=0)


### Raw data, not do any outlier clean up, no transform

In [18]:
for i in range(2,8):
    PolynomialRegressionModel(X, Y, i, True)

degree = 2
Training MSE = 76975856800.7875
Testing RMSE = 277445.23207434564
R-Squared Score: 0.5342501350804754
Testing MSE = 84909616528.02756
Testing RMSE = 291392.54713878245
R-Squared Score: 0.48404638152005597
degree = 3
Training MSE = 69162849652.47954
Testing RMSE = 262988.3070641726
R-Squared Score: 0.5815234903268763
Testing MSE = 129183973340.98683
Testing RMSE = 359421.7207417866
R-Squared Score: 0.21501307837260686
degree = 4
Training MSE = 74334863191.1964
Testing RMSE = 272644.2062307512
R-Squared Score: 0.550229722291875
Testing MSE = 2738442796778.8315
Testing RMSE = 1654824.0984403242
R-Squared Score: -15.64015841517778
degree = 5
Training MSE = 66360853091.43587
Testing RMSE = 257606.0036013056
R-Squared Score: 0.5984772414645669
Testing MSE = 240804929819.91977
Testing RMSE = 490718.7889412018
R-Squared Score: -0.4632521022796656
degree = 6
Training MSE = 65861550682.33807
Testing RMSE = 256635.05349491537
R-Squared Score: 0.6014983189719356
Testing MSE = 1248857993

### If do log10 transform:

In [19]:
Y_tr = np.log10(Y)
for i in range(1,8):
    PolynomialRegressionModel(X, Y_tr, i, True)

degree = 1
Training MSE = 0.03184134268510467
Testing RMSE = 0.1784414264824866
R-Squared Score: 0.43163720235048897
Testing MSE = 0.03137194214737487
Testing RMSE = 0.1771212639616567
R-Squared Score: 0.43453864642343076
degree = 2
Training MSE = 0.028741225909728593
Testing RMSE = 0.16953237422312173
R-Squared Score: 0.48697378350280274
Testing MSE = 0.029383387687235252
Testing RMSE = 0.17141583266208302
R-Squared Score: 0.4703812057208112
degree = 3
Training MSE = 0.027423291611199517
Testing RMSE = 0.1655997935119471
R-Squared Score: 0.5104986967716343
Testing MSE = 0.03254434613348047
Testing RMSE = 0.1804005158902836
R-Squared Score: 0.4134067336522187
degree = 4
Training MSE = 0.02720163341647349
Testing RMSE = 0.16492917697143064
R-Squared Score: 0.5144552595624163
Testing MSE = 0.07838891903191217
Testing RMSE = 0.27998021185775285
R-Squared Score: -0.4129155298374865
degree = 5
Training MSE = 0.026911499561640993
Testing RMSE = 0.16404724795509673
R-Squared Score: 0.51963410

### log(Y) transform + L1 regularization

In [20]:
Y_tr = np.log10(Y)
for i in range(1,8):
    PolynomialRegressionModel(X, Y_tr, i, True, "L1", alpha=1.0)

degree = 1
Training MSE = 0.03464720234724435
Testing RMSE = 0.18613758982871878
R-Squared Score: 0.38155306289830304
Testing MSE = 0.03362468716849623
Testing RMSE = 0.18337035520633163
R-Squared Score: 0.3939342030350653
degree = 2
Training MSE = 0.032067894615947716
Testing RMSE = 0.17907510886761374
R-Squared Score: 0.4275932871644912
Testing MSE = 0.03204477520714069
Testing RMSE = 0.17901054496073882
R-Squared Score: 0.4224112144997404


  model = cd_fast.enet_coordinate_descent(


degree = 3
Training MSE = 0.029423026467402406
Testing RMSE = 0.1715314153949719
R-Squared Score: 0.47480375423516885
Testing MSE = 0.029925504577766274
Testing RMSE = 0.17298989732861939
R-Squared Score: 0.4606098581492666


  model = cd_fast.enet_coordinate_descent(


degree = 4
Training MSE = 0.02837718089676018
Testing RMSE = 0.16845527862539714
R-Squared Score: 0.4934719278834375
Testing MSE = 0.028772613911745848
Testing RMSE = 0.16962492125789136
R-Squared Score: 0.48139005446198513


  model = cd_fast.enet_coordinate_descent(


degree = 5
Training MSE = 0.02791100435336626
Testing RMSE = 0.16706586830758177
R-Squared Score: 0.5017931035016345
Testing MSE = 0.029240800982526308
Testing RMSE = 0.17099941807657215
R-Squared Score: 0.4729512497004916


  model = cd_fast.enet_coordinate_descent(


degree = 6
Training MSE = 0.027511005508909864
Testing RMSE = 0.16586441905637828
R-Squared Score: 0.5089330179374087
Testing MSE = 0.028536978726928695
Testing RMSE = 0.16892891619532963
R-Squared Score: 0.48563724419384946
degree = 7
Training MSE = 0.027207785351941034
Testing RMSE = 0.16494782615100156
R-Squared Score: 0.5143454485130561
Testing MSE = 0.02818486844597668
Testing RMSE = 0.16788349664566998
R-Squared Score: 0.49198383106246046


  model = cd_fast.enet_coordinate_descent(


### log(Y) + L2 regluralization

In [21]:
Y_tr = np.log10(Y)
for i in range(1,8):
    PolynomialRegressionModel(X, Y_tr, i, True, "L2", alpha=1.0)

  return linalg.solve(A, Xy, sym_pos=True,


degree = 1
Training MSE = 0.03184134317094848
Testing RMSE = 0.17844142784384034
R-Squared Score: 0.4316371936782558
Testing MSE = 0.03137192191721278
Testing RMSE = 0.17712120685342223
R-Squared Score: 0.43453901106055726
degree = 2
Training MSE = 0.028745250701636697
Testing RMSE = 0.16954424408288443
R-Squared Score: 0.4869019416206476
Testing MSE = 0.029379324414003618
Testing RMSE = 0.1714039801579987
R-Squared Score: 0.47045444390194513
degree = 3
Training MSE = 0.027122086847441752
Testing RMSE = 0.16468784669016032
R-Squared Score: 0.5158751529056531
Testing MSE = 0.031872297266820206
Testing RMSE = 0.17852814138622575
R-Squared Score: 0.4255200309427122
degree = 4
Training MSE = 0.025652862438125836
Testing RMSE = 0.16016510992761762
R-Squared Score: 0.5421005700908518
Testing MSE = 0.07177972089127943


  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,


Testing RMSE = 0.26791737698641244
R-Squared Score: -0.2937885051508542
degree = 5
Training MSE = 0.05459967179630486
Testing RMSE = 0.23366572661882798
R-Squared Score: 0.025404722414238856
Testing MSE = 1.3343441585158773
Testing RMSE = 1.15513815559693
R-Squared Score: -23.050791961393205
degree = 6
Training MSE = 0.029868562490919966
Testing RMSE = 0.17282523684612722
R-Squared Score: 0.4668510085458818
Testing MSE = 0.11461119906594149
Testing RMSE = 0.3385427581058875
R-Squared Score: -1.0658014557853535
degree = 7
Training MSE = 0.03456851732486454
Testing RMSE = 0.18592610716320754
R-Squared Score: 0.38295757777367057
Testing MSE = 2.7628109795774773
Testing RMSE = 1.6621705627213705
R-Squared Score: -48.79809120038218


In [22]:
from scipy import stats

df = pd.read_csv('../austinHousingData.csv')
df_filtered = df[['livingAreaSqFt', 'numOfBathrooms', 'avgSchoolRating', 'numOfBedrooms', 'numOfHighSchools', 'MedianStudentsPerTeacher', 'latestPrice']]
z = np.abs(stats.zscore(df_filtered))
threshold = 3
df_filtered_o = df_filtered[(z < threshold).all(axis=1)]
df_p = df_filtered_o.sample(len(df_filtered_o), random_state=25)
X_col = ['livingAreaSqFt', 'numOfBathrooms', 'avgSchoolRating', 'numOfBedrooms', 'numOfHighSchools', 'MedianStudentsPerTeacher']
X = df_p[X_col]
Y = df_p["latestPrice"]

### remove outliers (only use 3 SD range of data), no transform

In [23]:
# no outlier, not transform
for i in range(1,8):
    PolynomialRegressionModel(X, Y, i, True)

degree = 1
Training MSE = 40097307192.25421
Testing RMSE = 200243.12021204177
R-Squared Score: 0.34825684541559865
Testing MSE = 41967968283.03878
Testing RMSE = 204860.851025858
R-Squared Score: 0.343678599356966
degree = 2
Training MSE = 36477944795.19383
Testing RMSE = 190992.00191420014
R-Squared Score: 0.40708609933367645
Testing MSE = 37187447681.978424
Testing RMSE = 192840.4721057756
R-Squared Score: 0.41843937775659246
degree = 3
Training MSE = 34564474406.561646
Testing RMSE = 185915.23446603736
R-Squared Score: 0.43818771973206283
Testing MSE = 35533705748.29866
Testing RMSE = 188503.86136177334
R-Squared Score: 0.4443016309611031
degree = 4
Training MSE = 33999294344.294983
Testing RMSE = 184388.97565823988
R-Squared Score: 0.44737417793794065
Testing MSE = 35607304343.97677
Testing RMSE = 188698.97812117788
R-Squared Score: 0.4431506499778197
degree = 5
Training MSE = 33103829606.338383
Testing RMSE = 181944.57839226312
R-Squared Score: 0.46192909581151964
Testing MSE = 34

### remove outliers (only use 3 SD range of data), yes transform log(Y)

In [24]:
# no outlier, yes transform
Y_tr = np.log10(Y)
for i in range(1,8):
    PolynomialRegressionModel(X, Y, i, True)

degree = 1
Training MSE = 40097307192.25421
Testing RMSE = 200243.12021204177
R-Squared Score: 0.34825684541559865
Testing MSE = 41967968283.03878
Testing RMSE = 204860.851025858
R-Squared Score: 0.343678599356966
degree = 2
Training MSE = 36477944795.19383
Testing RMSE = 190992.00191420014
R-Squared Score: 0.40708609933367645
Testing MSE = 37187447681.978424
Testing RMSE = 192840.4721057756
R-Squared Score: 0.41843937775659246
degree = 3
Training MSE = 34564474406.561646
Testing RMSE = 185915.23446603736
R-Squared Score: 0.43818771973206283
Testing MSE = 35533705748.29866
Testing RMSE = 188503.86136177334
R-Squared Score: 0.4443016309611031
degree = 4
Training MSE = 33999294344.294983
Testing RMSE = 184388.97565823988
R-Squared Score: 0.44737417793794065
Testing MSE = 35607304343.97677
Testing RMSE = 188698.97812117788
R-Squared Score: 0.4431506499778197
degree = 5
Training MSE = 33103829606.338383
Testing RMSE = 181944.57839226312
R-Squared Score: 0.46192909581151964
Testing MSE = 34

### It seem the max R-Score we can get is ~0.48, which indicates our model does not have good fit for this data.