### Polynomial Regression

In [2]:
import numpy as np
import operator
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler 
from sklearn.preprocessing import LabelEncoder 
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

def min_max_normalization(x):
    """Rescale the values of ``x`` linearly so they span the interval [0, 1].

    Parameters
    ----------
    x : iterable of numbers
        Values to rescale. Any iterable is accepted (list, tuple, pandas
        Series, generator); it is materialised once so it can be scanned
        several times.

    Returns
    -------
    list
        ``(a - min) / (max - min)`` for every ``a`` in ``x``, in the original
        order. An empty input yields ``[]``. When all values are equal the
        range is zero, so every element is mapped to ``0.0`` instead of
        dividing by zero.
    """
    # Materialise first: a one-shot iterable would be exhausted by min(), and
    # truth-testing a pandas Series directly is ambiguous.
    values = list(x)
    if not values:
        return []

    x_min = min(values)
    x_max = max(values)
    span = x_max - x_min
    if span == 0:
        # Constant input: there is no spread to rescale.
        return [0.0 for _ in values]
    return [(a - x_min) / span for a in values]

# Predictors (independent variables): numOfBathrooms, avgSchoolRating,
# numOfBedrooms, numOfHighSchools, livingAreaSqFt, MedianStudentsPerTeacher
# Response (dependent variable): latestPrice


# Load the listings and keep only the columns used for modelling.
df = pd.read_csv('austinHousingData.csv')
df_filtered = df[[
    'numOfBathrooms', 'avgSchoolRating', 'numOfBedrooms', 'numOfHighSchools',
    'livingAreaSqFt', 'latestPrice', 'MedianStudentsPerTeacher',
]]

# Outlier removal: a row is kept only if every column's |z-score| is below
# the threshold.
threshold = 3
z = np.abs(stats.zscore(df_filtered))
df_filtered_o = df_filtered[(z < threshold).all(axis=1)]

# Shuffle all remaining rows with a fixed seed (frac=1 draws every row once).
df_p = df_filtered_o.sample(frac=1, random_state=25)

X_col = [
    'numOfBathrooms', 'avgSchoolRating', 'numOfBedrooms', 'numOfHighSchools',
    'livingAreaSqFt', 'MedianStudentsPerTeacher',
]

# NOTE(review): both the feature scaler and the target normalisation are
# fitted on the full data set, i.e. before the train/test split below.
sc_X = MinMaxScaler()
X = sc_X.fit_transform(df_p[X_col])
Y = min_max_normalization(df_p.latestPrice)

# Positional hold-out: the last 10% of the shuffled rows are the test set.
# Neither the split size nor the target split depends on the degree, so both
# are computed once, outside the loop.
tSize = int(len(df_p)*0.1)
Y_train, Y_test = Y[:-tSize], Y[-tSize:]

for degree in range(2, 9):
    # Expand the scaled features into polynomial terms for this degree.
    poly = PolynomialFeatures(degree)
    X_poly = poly.fit_transform(X)
    X_poly_train = X_poly[:-tSize]
    X_poly_test = X_poly[-tSize:]

    # Fit a linear model on the expanded training features.
    model = LinearRegression().fit(X_poly_train, Y_train)

    Y_train_pred = model.predict(X_poly_train)
    Y_test_pred = model.predict(X_poly_test)

    # Report the mean squared error on both partitions.
    mse_train = mean_squared_error(Y_train, Y_train_pred)
    mse_test = mean_squared_error(Y_test, Y_test_pred)
    print("Degree =", degree)
    print("Training MSE for degree", degree, "=", mse_train)
    print("Testing MSE for degree", degree, "=", mse_test)

Degree = 2
Training MSE for degree 2 = 0.01074463698236266
Testing MSE for degree 2 = 0.01088617504474919
Degree = 3
Training MSE for degree 3 = 0.010197520354704276
Testing MSE for degree 3 = 0.01030487245758024
Degree = 4
Training MSE for degree 4 = 0.009781156024117174
Testing MSE for degree 4 = 0.010232086628164975
Degree = 5
Training MSE for degree 5 = 0.009121447722147798
Testing MSE for degree 5 = 0.010224789963801963
Degree = 6
Training MSE for degree 6 = 0.0087248095687636
Testing MSE for degree 6 = 0.012790387602393977
Degree = 7
Training MSE for degree 7 = 0.007842995657025365
Testing MSE for degree 7 = 0.011763087519733482
Degree = 8
Training MSE for degree 8 = 0.006905841180233509
Testing MSE for degree 8 = 3.6451997208983324


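Based on the results, the optimal degree appears to be 5: it gives the lowest testing MSE (≈ 0.01022) while the training MSE is still decreasing. Overfitting sets in for degrees greater than 5, where the training MSE keeps falling but the testing MSE rises well above it (at degree 8 it jumps to ≈ 3.65). Note that the degree was chosen using the same hold-out set that the testing MSE is reported on, so that figure is an optimistic estimate.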

### 5th-degree polynomial regression

In [4]:
import numpy as np
import operator
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler 
from sklearn.preprocessing import LabelEncoder 
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

def min_max_normalization(x):
    """Rescale the values of ``x`` linearly so they span the interval [0, 1].

    Parameters
    ----------
    x : iterable of numbers
        Values to rescale. Any iterable is accepted (list, tuple, pandas
        Series, generator); it is materialised once so it can be scanned
        several times.

    Returns
    -------
    list
        ``(a - min) / (max - min)`` for every ``a`` in ``x``, in the original
        order. An empty input yields ``[]``. When all values are equal the
        range is zero, so every element is mapped to ``0.0`` instead of
        dividing by zero.
    """
    # Materialise first: a one-shot iterable would be exhausted by min(), and
    # truth-testing a pandas Series directly is ambiguous.
    values = list(x)
    if not values:
        return []

    x_min = min(values)
    x_max = max(values)
    span = x_max - x_min
    if span == 0:
        # Constant input: there is no spread to rescale.
        return [0.0 for _ in values]
    return [(a - x_min) / span for a in values]

# Predictors (independent variables): numOfBathrooms, avgSchoolRating,
# numOfBedrooms, numOfHighSchools, livingAreaSqFt, MedianStudentsPerTeacher
# Response (dependent variable): latestPrice


# Load the listings and keep only the columns used for modelling.
df = pd.read_csv('austinHousingData.csv')
df_filtered = df[[
    'numOfBathrooms', 'avgSchoolRating', 'numOfBedrooms', 'numOfHighSchools',
    'livingAreaSqFt', 'latestPrice', 'MedianStudentsPerTeacher',
]]

# Outlier removal: a row is kept only if every column's |z-score| is below
# the threshold.
threshold = 3
z = np.abs(stats.zscore(df_filtered))
df_filtered_o = df_filtered[(z < threshold).all(axis=1)]

# Shuffle all remaining rows with a fixed seed (frac=1 draws every row once).
df_p = df_filtered_o.sample(frac=1, random_state=25)

X_col = [
    'numOfBathrooms', 'avgSchoolRating', 'numOfBedrooms', 'numOfHighSchools',
    'livingAreaSqFt', 'MedianStudentsPerTeacher',
]

# NOTE(review): both the feature scaler and the target normalisation are
# fitted on the full data set, i.e. before the train/test split below.
sc_X = MinMaxScaler()
X = sc_X.fit_transform(df_p[X_col])
Y = min_max_normalization(df_p.latestPrice)

# Expand the scaled features into polynomial terms of the chosen degree.
degree = 5
poly = PolynomialFeatures(degree)
X_poly = poly.fit_transform(X)

# Positional hold-out: the last 10% of the shuffled rows are the test set.
tSize = int(len(df_p)*0.1)
X_poly_train = X_poly[:-tSize]
X_poly_test = X_poly[-tSize:]
Y_train, Y_test = Y[:-tSize], Y[-tSize:]

# Fit a linear model on the expanded training features.
model = LinearRegression().fit(X_poly_train, Y_train)

Y_train_pred = model.predict(X_poly_train)
Y_test_pred = model.predict(X_poly_test)

# Report the mean squared error on both partitions.
mse_train = mean_squared_error(Y_train, Y_train_pred)
mse_test = mean_squared_error(Y_test, Y_test_pred)
print("Degree =", degree)
print("Training MSE for degree", degree, "=", mse_train)
print("Testing MSE for degree", degree, "=", mse_test)

Degree = 5
Training MSE for degree 5 = 0.009121447722147798
Testing MSE for degree 5 = 0.010224789963801963


In [5]:
from statsmodels.regression.linear_model import OLS
# Summarise the fitted model: intercept, the full coefficient vector, and the
# number of fitted coefficients.
intercept = model.intercept_
coefficients = model.coef_
print("y-intercept = %.4f" % intercept)
print("coefficient = ", coefficients)
print(len(coefficients))

y-intercept = -0.6132
coefficient =  [-3.45515455e-11 -8.24519751e-01  6.33428700e+00 -3.09782920e-01
 -2.07306838e-10  3.53208722e+00  3.68210064e+00  3.49238027e+00
 -3.40892061e-01  2.52285386e+00 -9.83835236e-12 -1.41969874e+01
  4.40436785e+00 -1.66596173e+01  2.30193125e+00 -3.17701421e-11
 -1.11496348e+01 -1.93359214e+01  1.23661253e+00  1.05976339e-11
  3.51844582e+00 -2.98007259e+00  1.03043685e-11  2.25012231e-11
 -1.55429003e-11 -5.72500214e-01 -3.61103962e+00 -1.17092799e+01
 -5.14136602e+00  8.11791286e+00 -5.31252226e+00 -7.56283924e-13
  1.47208266e+00 -1.21218493e+01  3.66912566e+00 -1.89764708e+01
  8.46811510e-12  1.75052287e+01 -2.48633037e-01  4.33841712e+00
 -1.15667476e-11  1.61000294e+01  1.87189660e+00 -1.46238577e-12
  1.24877886e-12 -1.31006317e-12  3.14217284e+01  7.69505301e+00
 -1.40446353e+01  1.94420117e+01 -5.42352104e+00 -1.86206606e-12
  1.85013107e+01  4.54483062e+01  8.95966845e+00 -1.63857816e-12
 -1.46620955e+01 -1.33079258e+00 -7.16982029e-13 -1.3