### Polynomial Regression

In [1]:
import numpy as np
import operator
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler 
from sklearn.preprocessing import LabelEncoder 
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

def min_max_normalization(x) :
    x_min = min(x)
    x_max = max(x)
    x = [(a - x_min)/(x_max - x_min) for a in x]
    return x

#Independent variables: livingAreaSqFt, numOfBathrooms, avgSchoolRating, numOfBedrooms, numOfHighSchools, and livingAreaSqFt
#Dependent varaibles: lateestPrice


df = pd.read_csv('austinHousingData.csv')
df_filtered = df[['livingAreaSqFt', 'numOfBathrooms', 'avgSchoolRating', 'numOfBedrooms', 'numOfHighSchools', 'livingAreaSqFt', 'latestPrice', 'MedianStudentsPerTeacher']]
z = np.abs(stats.zscore(df_filtered))
threshold = 3
df_filtered_o = df_filtered[(z < threshold).all(axis=1)]
df_p = df_filtered_o.sample(len(df_filtered_o), random_state=25)
X_col = ['livingAreaSqFt', 'numOfBathrooms', 'avgSchoolRating', 'numOfBedrooms', 'numOfHighSchools', 'livingAreaSqFt', 'MedianStudentsPerTeacher']
X = df_p[X_col]


sc_X = MinMaxScaler()
X = sc_X.fit_transform(X)

Y = df_p.latestPrice

Y = min_max_normalization(Y)

for degree in range(2, 9):
    poly = PolynomialFeatures(degree)
    X_poly = poly.fit_transform(X)

    tSize = int(len(df_p)*0.1)
    #X_poly_train, X_poly_test, Y_train, Y_test = train_test_split(X_poly, Y, train_size = 0.9, random_state = 1)#X_poly[:-tSize], X_poly[-tSize:], Y[:-tSize], Y[-tSize:]
    X_poly_train, X_poly_test, Y_train, Y_test = X_poly[:-tSize], X_poly[-tSize:], Y[:-tSize], Y[-tSize:]

    model = LinearRegression()
    model.fit(X_poly_train, Y_train)

    Y_train_pred = model.predict(X_poly_train)
    Y_test_pred = model.predict(X_poly_test)

    mse_train = mean_squared_error(Y_train,Y_train_pred)
    mse_test = mean_squared_error(Y_test,Y_test_pred)
    print("Degree =", degree)
    print("Training MSE for degree", degree, "=", mse_train)
    print("Testing MSE for degree", degree, "=", mse_test)

Degree = 2
Training MSE for degree 2 = 0.010746371638354419
Testing MSE for degree 2 = 0.01089934580186431
Degree = 3
Training MSE for degree 3 = 0.010197599374599346
Testing MSE for degree 3 = 0.01030798802507296
Degree = 4
Training MSE for degree 4 = 0.009815512688377092
Testing MSE for degree 4 = 0.010125519364295569
Degree = 5
Training MSE for degree 5 = 0.009124704028285482
Testing MSE for degree 5 = 0.010129745189695628
Degree = 6
Training MSE for degree 6 = 0.008782411679986512
Testing MSE for degree 6 = 0.013134496334493295
Degree = 7
Training MSE for degree 7 = 0.007859398614004929
Testing MSE for degree 7 = 0.012214095789302416
Degree = 8
Training MSE for degree 8 = 0.006905841209707125
Testing MSE for degree 8 = 3.6452080547096566


Based on the results, it seems like the optimal degree is 4. Overfitting occurs when the degree becomes greater than 4, as the testing MSE becomes much greater than the training MSE. 

### 4th-degree polynomial regression

In [2]:
import numpy as np
import operator
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler 
from sklearn.preprocessing import LabelEncoder 
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

def min_max_normalization(x) :
    x_min = min(x)
    x_max = max(x)
    x = [(a - x_min)/(x_max - x_min) for a in x]
    return x

#Independent variables: livingAreaSqFt, numOfBathrooms, avgSchoolRating, numOfBedrooms, numOfHighSchools, and livingAreaSqFt
#Dependent varaibles: lateestPrice


df = pd.read_csv('austinHousingData.csv')
df_filtered = df[['livingAreaSqFt', 'numOfBathrooms', 'avgSchoolRating', 'numOfBedrooms', 'numOfHighSchools', 'livingAreaSqFt', 'latestPrice', 'MedianStudentsPerTeacher']]
z = np.abs(stats.zscore(df_filtered))
threshold = 3
df_filtered_o = df_filtered[(z < threshold).all(axis=1)]
df_p = df_filtered_o.sample(len(df_filtered_o), random_state=25)
X_col = ['livingAreaSqFt', 'numOfBathrooms', 'avgSchoolRating', 'numOfBedrooms', 'numOfHighSchools', 'livingAreaSqFt', 'MedianStudentsPerTeacher']
X = df_p[X_col]


sc_X = MinMaxScaler()
X = sc_X.fit_transform(X)

Y = df_p.latestPrice

Y = min_max_normalization(Y)

degree = 4
poly = PolynomialFeatures(degree)
X_poly = poly.fit_transform(X)

tSize = int(len(df_p)*0.1)
#X_poly_train, X_poly_test, Y_train, Y_test = train_test_split(X_poly, Y, train_size = 0.9, random_state = 1)#X_poly[:-tSize], X_poly[-tSize:], Y[:-tSize], Y[-tSize:]
X_poly_train, X_poly_test, Y_train, Y_test = X_poly[:-tSize], X_poly[-tSize:], Y[:-tSize], Y[-tSize:]

model = LinearRegression()
model.fit(X_poly_train, Y_train)

Y_train_pred = model.predict(X_poly_train)
Y_test_pred = model.predict(X_poly_test)

mse_train = mean_squared_error(Y_train,Y_train_pred)
mse_test = mean_squared_error(Y_test,Y_test_pred)
print("Degree =", degree)
print("Training MSE for degree", degree, "=", mse_train)
print("Testing MSE for degree", degree, "=", mse_test)

Degree = 4
Training MSE for degree 4 = 0.009815512688377092
Testing MSE for degree 4 = 0.010125519364295569


In [3]:
from statsmodels.regression.linear_model import OLS
print("y-intercept = %.4f" % model.intercept_)
print("coefficient = ", model.coef_)
print(len(model.coef_))

y-intercept = 0.1362
coefficient =  [ 5.56724025e-13  1.11150503e+11 -5.11728943e+11  3.05864375e-02
  2.74895945e-01  4.93615627e-01 -3.72519089e+10  1.86656864e+11
  2.13921577e+11 -2.23060608e-01  1.24753548e+11 -5.49323189e+10
  1.05997952e+10 -3.23759214e+09 -2.34026320e+10  4.77874499e+09
 -1.53620668e+10 -6.39647632e+09  1.31852054e+09 -7.59647545e+09
 -2.66296173e+09  2.16416397e+08  4.74146727e+09  2.50580491e+08
 -6.17472391e+09 -5.94034709e+09  2.33162886e+09 -8.15177917e-01
 -4.42771912e-01  8.77182007e-01  6.70398774e+08 -2.03355032e+09
 -5.90328319e+09 -1.56043625e+00  8.86352539e-01 -6.69219971e-01
 -2.93592513e+07  1.37761621e+09  1.64355954e+09 -2.20965195e+00
 -1.20800781e+00  2.26354796e+08  9.62325793e+09  9.03790676e+09
 -1.16859436e+00  1.94718575e+08 -1.12700124e+08 -5.36258035e+07
  3.55690507e+08 -9.48745639e+09 -9.37314073e+09 -1.76743670e+09
 -9.49054235e+09 -1.88271269e+09  2.75762558e+00  1.87812465e+07
 -6.63694683e+07  9.94040783e+08 -1.51107496e+09  5.39