### Polynomial Regression

In [5]:
import numpy as np
import operator
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler 
from sklearn.preprocessing import LabelEncoder 
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

def min_max_normalization(x) :
    x_min = min(x)
    x_max = max(x)
    x = [(a - x_min)/(x_max - x_min) for a in x]
    return x

#Independent variables: numOfBathrooms, avgSchoolRating, numOfBedrooms, numOfHighSchools, and MedianStudentsPerTeacher
#Dependent varaibles: lateestPrice


df = pd.read_csv('austinHousingData.csv')
labelsToRemove = ['zpid', 'city', 'streetAddress', 'latitude', 'longitude', 'description', 'hasGarage', 'numPriceChanges', 'latest_saledate', 'latest_salemonth', 'latest_saleyear', 'latestPriceSource', 'numOfPhotos', 'homeImage']

df.drop(labelsToRemove, axis = 1, inplace=True)

df_p = df.sample(len(df), random_state=25)
X_col = ['numOfBathrooms', 'avgSchoolRating', 'numOfBedrooms', 'numOfHighSchools', 'livingAreaSqFt']
X = df_p[X_col]


sc_X = MinMaxScaler()
X = sc_X.fit_transform(X)

Y = df_p.latestPrice

Y = min_max_normalization(Y)

for degree in range(2, 11):
    poly = PolynomialFeatures(degree)
    X_poly = poly.fit_transform(X)

    tSize = int(len(df_p)*0.1)
    #X_poly_train, X_poly_test, Y_train, Y_test = train_test_split(X_poly, Y, train_size = 0.9, random_state = 1)#X_poly[:-tSize], X_poly[-tSize:], Y[:-tSize], Y[-tSize:]
    X_poly_train, X_poly_test, Y_train, Y_test = X_poly[:-tSize], X_poly[-tSize:], Y[:-tSize], Y[-tSize:]

    model = LinearRegression()
    model.fit(X_poly_train, Y_train)

    Y_train_pred = model.predict(X_poly_train)
    Y_test_pred = model.predict(X_poly_test)

    mse_train = mean_squared_error(Y_train,Y_train_pred)
    mse_test = mean_squared_error(Y_test,Y_test_pred)
    print("Degree =", degree)
    print("Training MSE for degree", degree, "=", mse_train)
    print("Testing MSE for degree", degree, "=", mse_test)


Degree = 2
Training MSE for degree 2 = 0.0005794768405008606
Testing MSE for degree 2 = 0.0004232466214526671
Degree = 3
Training MSE for degree 3 = 0.0005151429097512561
Testing MSE for degree 3 = 0.0004152630210698791
Degree = 4
Training MSE for degree 4 = 0.00047361646333528957
Testing MSE for degree 4 = 0.00041810288669832924
Degree = 5
Training MSE for degree 5 = 0.00044009676762955065
Testing MSE for degree 5 = 0.0003874874428617258
Degree = 6
Training MSE for degree 6 = 0.00040520680567205416
Testing MSE for degree 6 = 0.003894839895630842
Degree = 7
Training MSE for degree 7 = 0.00037769623995625415
Testing MSE for degree 7 = 1.5111536136469692
Degree = 8
Training MSE for degree 8 = 0.00037116429424346835
Testing MSE for degree 8 = 16473.365624745198
Degree = 9
Training MSE for degree 9 = 0.000327817899385261
Testing MSE for degree 9 = 3379.802907246486
Degree = 10
Training MSE for degree 10 = 0.0003244819922751131
Testing MSE for degree 10 = 8035.663753154075


Based on the results, it seems like the optimal degree is 3. Overfitting occurs when the degree becomes greater than 3, as the testing MSE becomes much greater than the training MSE. 

### 3rd-degree polynomial regression

In [42]:
import numpy as np
import operator
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler 
from sklearn.preprocessing import LabelEncoder 
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

def min_max_normalization(x) :
    x_min = min(x)
    x_max = max(x)
    x = [(a - x_min)/(x_max - x_min) for a in x]
    return x

#Independent variables: numOfBathrooms, avgSchoolRating, numOfBedrooms, numOfHighSchools, and MedianStudentsPerTeacher
#Dependent varaibles: lateestPrice


df = pd.read_csv('austinHousingData.csv')
labelsToRemove = ['zpid', 'city', 'streetAddress', 'latitude', 'longitude', 'description', 'hasGarage', 'numPriceChanges', 'latest_saledate', 'latest_salemonth', 'latest_saleyear', 'latestPriceSource', 'numOfPhotos', 'homeImage']

df.drop(labelsToRemove, axis = 1, inplace=True)

df_p = df.sample(len(df), random_state=25)
X_col = ['numOfBathrooms', 'avgSchoolRating', 'numOfBedrooms', 'numOfHighSchools', 'livingAreaSqFt']
X = df_p[X_col]


sc_X = MinMaxScaler()

Y = df_p.latestPrice

Y = min_max_normalization(Y)

degree = 3
poly = PolynomialFeatures(degree)
X_poly = poly.fit_transform(X)

tSize = int(len(df_p)*0.1)
#X_poly_train, X_poly_test, Y_train, Y_test = train_test_split(X_poly, Y, train_size = 0.9, random_state = 1)#X_poly[:-tSize], X_poly[-tSize:], Y[:-tSize], Y[-tSize:]
X_poly_train, X_poly_test, Y_train, Y_test = X_poly[:-tSize], X_poly[-tSize:], Y[:-tSize], Y[-tSize:]

model = LinearRegression()
model.fit(X_poly_train, Y_train)

Y_train_pred = model.predict(X_poly_train)
Y_test_pred = model.predict(X_poly_test)

mse_train = mean_squared_error(Y_train,Y_train_pred)
mse_test = mean_squared_error(Y_test,Y_test_pred)
print("Degree =", degree)
print("Training MSE for degree", degree, "=", mse_train)
print("Testing MSE for degree", degree, "=", mse_test)

Degree = 3
Training MSE for degree 3 = 0.0005175219176832942
Testing MSE for degree 3 = 0.0004099689530503299


In [43]:
from statsmodels.regression.linear_model import OLS
print("y-intercept = %.4f" % model.intercept_)
print("coefficient = ", model.coef_)
print(len(model.coef_))

y-intercept = 0.3800
coefficient =  [ 3.10402555e-07 -5.18678110e-02 -1.13554804e-01  8.41674396e-02
 -2.62992259e-01 -1.00874414e-04  2.13508557e-03  7.22981997e-03
  1.00321737e-02  2.01257630e-02 -2.09372368e-06  9.48788414e-03
 -1.51258531e-02  1.16667548e-01  2.52690128e-05 -5.81441090e-03
 -8.22984646e-02  2.55201404e-06 -1.46930214e-01  1.18644978e-04
 -7.11108920e-09 -7.33819655e-05 -3.41044333e-05 -1.56304589e-04
  4.27882976e-04  3.06491468e-07 -8.98013263e-04 -6.70938911e-05
  1.06768514e-03  8.66555339e-07 -1.03164603e-04 -9.73707542e-03
 -3.80683093e-07 -1.46269414e-03  3.96146366e-06 -4.21471150e-10
 -6.02569901e-05  5.61183422e-04 -6.84573242e-03 -9.95343349e-07
  3.90321979e-04  9.03094156e-03 -1.13588528e-06 -2.06471654e-02
 -1.62487653e-05  7.38962634e-10  1.17881874e-06  3.50628146e-03
  3.82004811e-07  1.48092273e-02  9.96152230e-07  2.97430691e-10
  8.51237074e-02 -2.63197903e-05  1.56758886e-09 -3.80251386e-15]
56
