### Polynomial Regression

In [4]:
import operator

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures

def min_max_normalization(x):
    """Linearly rescale the values of ``x`` onto the interval [0, 1].

    Parameters
    ----------
    x : iterable of numbers
        Values to rescale (a list, a pandas Series, a generator, ...).

    Returns
    -------
    list
        One rescaled value per input element, in the original order.
        The smallest input maps to 0 and the largest to 1.
        An empty input yields an empty list. If every value is identical
        the range is zero, so all outputs are 0.0 rather than dividing by zero.
    """
    # Materialize once so one-shot iterators survive the min/max passes.
    values = list(x)
    if not values:
        return []

    x_min = min(values)
    span = max(values) - x_min  # computed once instead of once per element
    if span == 0:
        # Constant input: there is no spread to normalize.
        return [0.0 for _ in values]

    return [(a - x_min) / span for a in values]

#Independent variables: numOfBathrooms, avgSchoolRating, numOfBedrooms, numOfHighSchools, and livingAreaSqFt
#Dependent variable: latestPrice


# Load the housing data and keep only the five predictors plus the target.
df = pd.read_csv('austinHousingData.csv')
df_filtered = df[['numOfBathrooms', 'avgSchoolRating', 'numOfBedrooms', 'numOfHighSchools', 'livingAreaSqFt', 'latestPrice']]

# Outlier removal: keep a row only when the absolute z-score of every column
# is below the threshold. `stats` is scipy.stats.
z = np.abs(stats.zscore(df_filtered))
threshold = 3
df_filtered_o = df_filtered[(z < threshold).all(axis=1)]

# Shuffle all rows with a fixed seed so the positional split below is
# reproducible.
df_p = df_filtered_o.sample(len(df_filtered_o), random_state=25)
X_col = ['numOfBathrooms', 'avgSchoolRating', 'numOfBedrooms', 'numOfHighSchools', 'livingAreaSqFt']
X = df_p[X_col]

# Scale predictors and target to [0, 1].
# NOTE(review): both scalings are fit on ALL rows, including the rows held out
# for testing below. Confirm this is acceptable, or fit on the training rows only.
sc_X = MinMaxScaler()
X = sc_X.fit_transform(X)

Y = df_p.latestPrice

Y = min_max_normalization(Y)

# The last 10% of the shuffled rows form the test set. The size of the split
# and the target split do not depend on the polynomial degree, so they are
# computed once, outside the loop.
tSize = int(len(df_p)*0.1)
Y_train, Y_test = Y[:-tSize], Y[-tSize:]

# Fit one linear regression per polynomial degree and report train/test MSE.
for degree in range(2, 11):
    poly = PolynomialFeatures(degree)
    X_poly = poly.fit_transform(X)

    X_poly_train, X_poly_test = X_poly[:-tSize], X_poly[-tSize:]

    model = LinearRegression()
    model.fit(X_poly_train, Y_train)

    Y_train_pred = model.predict(X_poly_train)
    Y_test_pred = model.predict(X_poly_test)

    mse_train = mean_squared_error(Y_train,Y_train_pred)
    mse_test = mean_squared_error(Y_test,Y_test_pred)
    print("Degree =", degree)
    print("Training MSE for degree", degree, "=", mse_train)
    print("Testing MSE for degree", degree, "=", mse_test)

Degree = 2
Training MSE for degree 2 = 0.010854668752436653
Testing MSE for degree 2 = 0.011020804359925477
Degree = 3
Training MSE for degree 3 = 0.010563644154528809
Testing MSE for degree 3 = 0.01063063347685935
Degree = 4
Training MSE for degree 4 = 0.010405086005406497
Testing MSE for degree 4 = 0.010809037066710478
Degree = 5
Training MSE for degree 5 = 0.010073471041957771
Testing MSE for degree 5 = 0.010504569673717731
Degree = 6
Training MSE for degree 6 = 0.009802950572958232
Testing MSE for degree 6 = 0.01150035595674514
Degree = 7
Training MSE for degree 7 = 0.009508859712581563
Testing MSE for degree 7 = 0.1253807057002417
Degree = 8
Training MSE for degree 8 = 0.00912138096282003
Testing MSE for degree 8 = 0.025506623944692473
Degree = 9
Training MSE for degree 9 = 0.008919545377408937
Testing MSE for degree 9 = 274.9843993763207
Degree = 10
Training MSE for degree 10 = 0.008636044189581906
Testing MSE for degree 10 = 458099.4981559205


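Based on the results, degree 3 appears to be the best choice: its testing MSE (0.01063) is almost identical to its training MSE (0.01056), so the model generalizes well. Degree 5 reaches a marginally lower testing MSE (0.01050), but by then the gap between training and testing error has already widened. From degree 6 the testing MSE starts to rise while the training MSE keeps falling, and from degree 7 onward the testing MSE becomes much greater than the training MSE (reaching about 4.6e5 at degree 10), which is a clear sign of overfitting.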

### 3rd-degree polynomial regression

In [5]:
import numpy as np
import operator
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler 
from sklearn.preprocessing import LabelEncoder 
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

def min_max_normalization(x):
    """Linearly rescale the values of ``x`` onto the interval [0, 1].

    Parameters
    ----------
    x : iterable of numbers
        Values to rescale (a list, a pandas Series, a generator, ...).

    Returns
    -------
    list
        One rescaled value per input element, in the original order.
        The smallest input maps to 0 and the largest to 1.
        An empty input yields an empty list. If every value is identical
        the range is zero, so all outputs are 0.0 rather than dividing by zero.
    """
    # Materialize once so one-shot iterators survive the min/max passes.
    values = list(x)
    if not values:
        return []

    x_min = min(values)
    span = max(values) - x_min  # computed once instead of once per element
    if span == 0:
        # Constant input: there is no spread to normalize.
        return [0.0 for _ in values]

    return [(a - x_min) / span for a in values]

#Independent variables: numOfBathrooms, avgSchoolRating, numOfBedrooms, numOfHighSchools, and livingAreaSqFt
#Dependent variable: latestPrice


# Predictor columns; the target column is appended when selecting from the raw frame.
X_col = ['numOfBathrooms', 'avgSchoolRating', 'numOfBedrooms', 'numOfHighSchools', 'livingAreaSqFt']

df = pd.read_csv('austinHousingData.csv')
df_filtered = df[X_col + ['latestPrice']]

# Discard any row in which at least one column has an absolute z-score at or above the threshold.
threshold = 3
z = np.abs(stats.zscore(df_filtered))
df_filtered_o = df_filtered[(z < threshold).all(axis=1)]

# Seeded shuffle of every remaining row.
df_p = df_filtered_o.sample(len(df_filtered_o), random_state=25)

# Min-max scale the predictors and the target.
sc_X = MinMaxScaler()
X = sc_X.fit_transform(df_p[X_col])
Y = min_max_normalization(df_p.latestPrice)

# Expand the scaled predictors into polynomial terms of the chosen degree.
degree = 3
poly = PolynomialFeatures(degree)
X_poly = poly.fit_transform(X)

# Positional hold-out: the final 10% of the shuffled rows are the test set.
tSize = int(len(df_p)*0.1)
X_poly_train = X_poly[:-tSize]
X_poly_test = X_poly[-tSize:]
Y_train = Y[:-tSize]
Y_test = Y[-tSize:]

# Ordinary least squares on the polynomial features.
model = LinearRegression()
model.fit(X_poly_train, Y_train)

Y_train_pred = model.predict(X_poly_train)
Y_test_pred = model.predict(X_poly_test)

mse_train = mean_squared_error(Y_train, Y_train_pred)
mse_test = mean_squared_error(Y_test, Y_test_pred)

print("Degree =", degree)
print("Training MSE for degree", degree, "=", mse_train)
print("Testing MSE for degree", degree, "=", mse_test)

Degree = 3
Training MSE for degree 3 = 0.010563644154528809
Testing MSE for degree 3 = 0.01063063347685935


In [6]:
from statsmodels.regression.linear_model import OLS
# Report the parameters of the fitted regression held in `model`:
# the intercept, the full coefficient vector, and how many coefficients it has.
fitted_intercept = model.intercept_
fitted_coefficients = model.coef_

print("y-intercept = %.4f" % fitted_intercept)
print("coefficient = ", fitted_coefficients)
print(len(fitted_coefficients))

y-intercept = 0.1953
coefficient =  [-2.53853096e-15 -2.40620915e-01 -2.00992157e-02  1.31155609e-02
 -3.47499807e-14  7.85618419e-02 -4.27739759e-01  2.25939804e-01
 -3.41670080e-01  8.04911693e-16  1.84826127e+00  2.43913218e-01
 -7.18676471e-01  4.32986980e-15  2.00131797e+00  2.08892130e-01
  1.22124533e-15 -3.33404839e-01 -5.66213743e-15  2.44249065e-15
 -8.59248940e-01  7.91750914e-01 -2.27846768e-01  9.48136727e-02
  1.99840144e-15 -6.26758631e-01 -5.01835892e-03  7.55431071e-01
  1.11022302e-16 -1.80414345e+00  4.55942882e-01 -3.88578059e-16
 -2.38108753e+00  0.00000000e+00  6.66133815e-16  1.84858883e+00
 -3.44728803e-01  8.81134210e-01  0.00000000e+00 -1.72820009e+00
 -2.05616666e-01  0.00000000e+00 -6.38624066e-01  0.00000000e+00
  0.00000000e+00  1.74592174e+00 -1.35350630e-02  0.00000000e+00
 -3.21039704e-01  0.00000000e+00  0.00000000e+00  2.29822487e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00 -1.91877596e+00]
56
