### Polynomial Regression (with Outliers)

In [1]:
import numpy as np
import operator
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler 
from sklearn.preprocessing import LabelEncoder 
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

def min_max_normalization(x):
    """Rescale the values of *x* linearly so the minimum maps to 0 and the maximum to 1.

    Args:
        x: Any iterable of numbers (list, tuple, pandas Series, generator, ...).
            It is materialised once, so one-shot iterators are supported.

    Returns:
        A new list with one normalised value per input element, in the
        original order. An empty input yields an empty list. If every value
        is identical (zero range) each element maps to 0.0 rather than
        triggering a division by zero.
    """
    # Materialise once: min() and max() each need a full pass, and a
    # generator would already be exhausted by the time max() ran.
    values = list(x)
    if not values:
        return []

    x_min = min(values)
    x_max = max(values)
    span = x_max - x_min

    # A constant input has no range to scale by; map it to the lower bound.
    if span == 0:
        return [0.0 for _ in values]

    return [(a - x_min) / span for a in values]

# Independent variables: numOfBathrooms, avgSchoolRating, numOfBedrooms, numOfHighSchools, livingAreaSqFt, and MedianStudentsPerTeacher
# Dependent variable: latestPrice


# Load the housing data and keep only the columns used for modelling
# (the six predictors plus the latestPrice target).
df = pd.read_csv('austinHousingData.csv')

X_col = [
    'numOfBathrooms',
    'avgSchoolRating',
    'numOfBedrooms',
    'numOfHighSchools',
    'livingAreaSqFt',
    'MedianStudentsPerTeacher',
]
df_filtered = df[[
    'numOfBathrooms',
    'avgSchoolRating',
    'numOfBedrooms',
    'numOfHighSchools',
    'livingAreaSqFt',
    'latestPrice',
    'MedianStudentsPerTeacher',
]]

# Outlier removal is disabled in this run, so all rows are kept.
# The z-score filter is retained here for reference:
#   z = np.abs(stats.zscore(df_filtered))
#   threshold = 3
#   df_filtered_o = df_filtered[(z < threshold).all(axis=1)]
#   df_p = df_filtered_o.sample(len(df_filtered_o), random_state=25)

# Min-max scale the predictors and the target.
# NOTE(review): both scalings are computed on the full data set, i.e. before
# the train/test split below, so the test rows influence the scaling.
sc_X = MinMaxScaler()
X = sc_X.fit_transform(df_filtered[X_col])
Y = min_max_normalization(df_filtered['latestPrice'])

# Hold out the last 10% of the rows (in file order, unshuffled) for testing.
# A shuffled alternative that was tried and left disabled:
#   train_test_split(X_poly, Y, train_size = 0.9, random_state = 1)
# NOTE(review): with fewer than 10 rows tSize is 0, which makes the training
# slice empty and the test slice the whole data set.
tSize = int(len(df_filtered) * 0.1)
train_rows = slice(None, -tSize)
test_rows = slice(-tSize, None)

# Fit one linear model per polynomial degree (2 through 8) and report the
# mean squared error on the training and held-out rows.
for degree in range(2, 9):
    poly = PolynomialFeatures(degree=degree)
    X_poly = poly.fit_transform(X)

    X_poly_train, Y_train = X_poly[train_rows], Y[train_rows]
    X_poly_test, Y_test = X_poly[test_rows], Y[test_rows]

    model = LinearRegression().fit(X_poly_train, Y_train)

    Y_train_pred = model.predict(X_poly_train)
    Y_test_pred = model.predict(X_poly_test)

    mse_train = mean_squared_error(y_true=Y_train, y_pred=Y_train_pred)
    mse_test = mean_squared_error(y_true=Y_test, y_pred=Y_test_pred)

    print("Degree =", degree)
    print("Training MSE for degree", degree, "=", mse_train)
    print("Testing MSE for degree", degree, "=", mse_test)

Degree = 2
Training MSE for degree 2 = 0.0003871091274112671
Testing MSE for degree 2 = 0.0022201753875286923
Degree = 3
Training MSE for degree 3 = 0.00032094422643738163
Testing MSE for degree 3 = 0.002171320176476008
Degree = 4
Training MSE for degree 4 = 0.0002662630614673965
Testing MSE for degree 4 = 0.0026706752716156535
Degree = 5
Training MSE for degree 5 = 0.0002293935357404074
Testing MSE for degree 5 = 0.01407138082923959
Degree = 6
Training MSE for degree 6 = 0.00020325148277141269
Testing MSE for degree 6 = 0.7436944865462277
Degree = 7
Training MSE for degree 7 = 0.00017568725394620125
Testing MSE for degree 7 = 15694.167724719007
Degree = 8
Training MSE for degree 8 = 0.00015000909307120934
Testing MSE for degree 8 = 3302357.643135973
