In [18]:
import pandas as pd
import numpy as np

train_set = pd.read_csv(r'train.csv')
test_set = pd.read_csv(r'test.csv')
x = train_set.loc[:, ["PPP"]].to_numpy()[:,0]
y = train_set.loc[:, ["Salary"]].to_numpy()[:,0]
test = test_set.loc[:, ["PPP"]].to_numpy()[:,0]

In [27]:
x_n = (x - x.mean()) / x.std()
y_n = (y - y.mean()) / y.std()
test_n = (test - test.mean()) / test.std()

In [20]:
def format_phi(x, degree):
    degrees = np.arange(degree+1)
    phi = np.tile(x, [degree+1,1]).T
    phi = phi**degrees
    
    return phi

In [21]:
def poly_regr(x,y,degree):
    phi = format_phi(x, degree)
    
    w = np.linalg.inv(phi.T.dot(phi)).dot(phi.T.dot(y))
    y_pred = phi.dot(w) 
    rmse = np.sqrt(np.sum((y - y_pred)**2)/y_pred.size)
    
    return (w , rmse)

In [22]:
def get_best_poly(x, y, max_degree):
    best_w = None
    best_rmse = 1
    best_degree = None
    
    for degree in range(max_degree):
        w, rmse = poly_regr(x,y,degree)
        if (rmse < best_rmse):
            best_rmse = rmse
            best_w = w
            best_degree = degree
    
    return (best_w, best_rmse, best_degree)

In [23]:
w, rmse, degree = get_best_poly(x_n, y_n, 30)
print('Best degree: ', degree)
print('Best RMSE: ', rmse)

Best degree:  11
Best RMSE:  0.7365482398712543


In [24]:
def test_data(x, y, w, degree):
    phi = format_phi(x, degree)
    y_pred_n = phi.dot(w)
    y_pred = (y.std() * y_pred_n) + y.mean()
    
    return y_pred

In [28]:
y_pred = test_data(test_n, y, w, degree)

In [9]:
df = pd.DataFrame({"Id": range(y_pred.size), "Salary": y_pred})
compression_opts = dict(method='zip', archive_name='out.csv') 
df.to_csv('sampleSubmission.zip', index=False, compression=compression_opts)

In [10]:
# 0.7486788923831885
