In [10]:
import numpy as np
import operator
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler 
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from scipy import stats
import joblib

def min_max_normalization(x) :
    x_min = min(x)
    x_max = max(x)
    x = [(a - x_min)/(x_max - x_min) for a in x]
    return x

#Independent variables: numOfBathrooms, avgSchoolRating, numOfBedrooms, numOfHighSchools, and MedianStudentsPerTeacher
#Dependent varaibles: lateestPrice

df = pd.read_csv('austinHousingData.csv')
df_filtered = df[['numOfBathrooms', 'avgSchoolRating', 'numOfBedrooms', 'numOfHighSchools', 'MedianStudentsPerTeacher', 'latestPrice']]

z = np.abs(stats.zscore(df_filtered))
threshold = 3
df_filtered_o = df_filtered[(z < threshold).all(axis=1)]

X = df_filtered_o[['numOfBathrooms', 'avgSchoolRating', 'numOfBedrooms', 'numOfHighSchools', 'MedianStudentsPerTeacher']]
y = df_filtered_o.latestPrice
X.to_csv("Poly_X_Test")
y.to_csv("Poly_y_Test")
y = y.to_numpy()

def createPolyModel(degree, X, y):
    y = min_max_normalization(y)
    poly = PolynomialFeatures(degree)
    X_poly = poly.fit_transform(X)

    tSize = int(len(df)*0.1)
    X_poly_train, X_poly_test, y_train, y_test = X_poly[:-tSize], X_poly[-tSize:], y[:-tSize], y[-tSize:]
    
    model = LinearRegression()
    model.fit(X_poly_train, y_train)

    y_train_pred = model.predict(X_poly_train)
    y_test_pred = model.predict(X_poly_test)

    mse_train = mean_squared_error(y_train,y_train_pred)
    mse_test = mean_squared_error(y_test,y_test_pred)
    print("Training MSE for degree", degree, "=", mse_train)
    print("Testing MSE for degree", degree, "=", mse_test, "\n")
    return model

joblib.dump(createPolyModel(2,X,y), "PolyModel2.sav")
joblib.dump(createPolyModel(3,X,y), "PolyModel3.sav")
joblib.dump(createPolyModel(4,X,y), "PolyModel4.sav")
joblib.dump(createPolyModel(5,X,y), "PolyModel5.sav")
joblib.dump(createPolyModel(6,X,y), "PolyModel6.sav")
joblib.dump(createPolyModel(7,X,y), "PolyModel7.sav")
joblib.dump(createPolyModel(8,X,y), "PolyModel8.sav")
joblib.dump(createPolyModel(9,X,y), "PolyModel9.sav")
joblib.dump(createPolyModel(10,X,y), "PolyModel10.sav")

Training MSE for degree 2 = 0.009146443807497219
Testing MSE for degree 2 = 0.03902550365777494 

Training MSE for degree 3 = 0.008844474843473024
Testing MSE for degree 3 = 0.036881504586442834 

Training MSE for degree 4 = 0.00852585520851768
Testing MSE for degree 4 = 0.035876898563775454 

Training MSE for degree 5 = 0.008220391001256887
Testing MSE for degree 5 = 0.03429538262919712 

Training MSE for degree 6 = 0.008070095601069223
Testing MSE for degree 6 = 0.03404204459494975 

Training MSE for degree 7 = 0.007650372564108174
Testing MSE for degree 7 = 0.03218727300123978 

Training MSE for degree 8 = 0.007347875523953226
Testing MSE for degree 8 = 0.09929094044958985 

Training MSE for degree 9 = 0.007043165250374688
Testing MSE for degree 9 = 20.055278464305456 

Training MSE for degree 10 = 0.006779514831762393
Testing MSE for degree 10 = 93.70954131663906 



['PolyModel10.sav']