In [356]:
import pandas as pd
import numpy as np

# Load the training and test tables from the working directory.
train_set = pd.read_csv("train.csv")
test_set = pd.read_csv("test.csv")

# Columns that are present in the files but not used as model features.
ignored_columns = ["Id", "DOB", "PIMGP"]

# Training features / target ("Salary") and the test features, as numpy arrays.
x = train_set.drop(columns=ignored_columns + ["Salary"]).to_numpy()
y = train_set["Salary"].to_numpy()
test = test_set.drop(columns=ignored_columns).to_numpy()

In [357]:
# Standardise features and target (zero mean, unit standard deviation).
# The statistics are computed on the training set only.
x_mean, x_std = x.mean(0), x.std(0)
x_n = (x - x_mean) / x_std
y_n = (y - y.mean()) / y.std()

# The test features must go through exactly the same transformation as the
# training features, because the weights are learned on the train-scaled
# values. Scaling the test set with its own mean/std would shift and stretch
# the inputs relative to what the model was fitted on.
test_n = (test - x_mean) / x_std

In [358]:
def get_phi(x, indexes=None, degree=1):
    """Build the polynomial design matrix used by the regression helpers.

    Parameters
    ----------
    x : numpy.ndarray
        When ``degree == 1``: a 1-D vector holding one attribute
        (2-D input is not supported by this branch).
        Otherwise: an array with one row per sample and one column per
        attribute.
    indexes : int or sequence of int, optional
        Columns of ``x`` to use when ``degree != 1``. ``None`` selects every
        column. Ignored when ``degree == 1``.
    degree : int
        Highest power to generate.

    Returns
    -------
    numpy.ndarray
        * ``degree == 1``: shape ``(n, 2)`` with columns ``[x**0, x**1]``.
        * otherwise: shape ``(n, degree * k)`` where ``k`` is the number of
          selected columns, laid out as
          ``[sel**1 | sel**2 | ... | sel**degree]``.
          Note that this layout contains no constant (bias) column.
    """
    if degree == 1:
        degrees = np.arange(degree + 1)
        phi = np.tile(x, [degree + 1, 1]).T
        return phi ** degrees

    # One row per selected attribute; a scalar index or 1-D input yields one row.
    selected = x.T if indexes is None else x.T[indexes]
    selected = np.atleast_2d(selected)

    # Stack `degree` copies of the selected attributes on top of each other...
    phi = np.tile(selected, [degree, 1])
    # ...and raise the i-th copy to the power i. Building the exponents with
    # integer arithmetic avoids relying on float truncation of a linspace.
    degrees = np.repeat(np.arange(1, degree + 1), selected.shape[0])

    return phi.T ** degrees

In [359]:
def poly_regr(x, y, indexes=None, degree=1):
    """Fit a least-squares polynomial model through the normal equations.

    Parameters
    ----------
    x, indexes, degree
        Forwarded unchanged to ``get_phi`` to build the design matrix.
    y : numpy.ndarray
        Target values, one per row of the design matrix.

    Returns
    -------
    tuple
        ``(w, rmse)`` where ``w`` is the weight vector and ``rmse`` the
        root-mean-square error of the fit on ``(x, y)``.
        If the normal matrix is singular, ``(None, 100)`` is returned: the
        value 100 is a sentinel meaning "fit failed", not a real error.
    """
    phi = get_phi(x, indexes, degree)
    try:
        # Solve (phi^T phi) w = phi^T y directly instead of forming the
        # explicit inverse: same solution, cheaper and numerically safer.
        w = np.linalg.solve(phi.T.dot(phi), phi.T.dot(y))
    except np.linalg.LinAlgError:
        return (None, 100)

    y_pred = phi.dot(w)
    rmse = np.sqrt(np.sum((y - y_pred) ** 2) / y_pred.size)
    return (w, rmse)

In [360]:
def get_best_attributes(x, y, degree, nb_max_attributes):
    """Rank attributes by how well each one alone predicts ``y``.

    Every column of ``x`` is fitted on its own with ``poly_regr`` (degree 1)
    and the columns are ordered by increasing RMSE.

    Parameters
    ----------
    x : numpy.ndarray
        2-D array, one row per sample and one column per attribute.
    y : numpy.ndarray
        Target values.
    degree : int
        Currently unused; kept so existing calls keep working.
    nb_max_attributes : int
        Maximum number of column indices to return.

    Returns
    -------
    numpy.ndarray
        Integer column indices, best first. Columns whose fit failed
        (singular system or non-finite RMSE) are never returned, so the
        result can be shorter than ``nb_max_attributes``.
    """
    columns = x.T
    n_columns = columns.shape[0]

    # A column's score does not depend on which columns were already chosen,
    # so each column is fitted exactly once.
    scores = np.full(n_columns, np.inf)
    for j in range(n_columns):
        w, rmse = poly_regr(columns[j], y)
        if w is not None and np.isfinite(rmse):
            scores[j] = rmse

    # Stable sort: on equal RMSE the lower column index comes first.
    ranked = np.argsort(scores, kind="stable")
    # Drop failed fits (left at +inf) instead of emitting a bogus index.
    ranked = ranked[np.isfinite(scores[ranked])]

    return ranked[:nb_max_attributes]

In [361]:
degree = 3
nb_attributes = 20

# Select the attributes once; the same selection is used for fitting here and
# for building the test design matrix later. The result must be bound to
# `best_attributes`, the name the fitting and prediction calls use; otherwise
# a fresh kernel raises NameError.
best_attributes = get_best_attributes(x_n, y_n, degree, nb_attributes)
indexes = best_attributes  # previous name of this result, kept as an alias
w, rmse = poly_regr(x_n, y_n, best_attributes, degree)

# RMSE of the fit on the normalised training data.
print(rmse)

0.556122848172312


In [362]:
def test_data(x, y, w, indexes, degree=1):
    """Predict targets for ``x`` and map them back to the scale of ``y``.

    The design matrix is built with ``get_phi(x, indexes, degree)`` and
    multiplied by the weights ``w``. The result is then rescaled with the
    standard deviation and mean of ``y``.
    """
    design = get_phi(x, indexes, degree)
    spread = y.std()
    centre = y.mean()
    # Undo the target standardisation: prediction * std + mean.
    return (spread * design.dot(w)) + centre

In [363]:
# Predict on the normalised test features; `y` supplies the scale for de-normalising.
y_pred = test_data(test_n, y, w, indexes=best_attributes, degree=degree)

In [364]:
# Zip settings: the CSV is stored inside the archive under the name 'out.csv'.
compression_opts = {'method': 'zip', 'archive_name': 'out.csv'}

# One row per prediction; Ids are generated as 0..n-1 in prediction order.
# NOTE(review): the "Id" column of the test file is not used here - confirm
# that the expected submission format really wants positional ids.
df = pd.DataFrame({"Id": range(y_pred.size), "Salary": y_pred})
df.to_csv('sampleSubmission.zip', index=False, compression=compression_opts)

In [219]:
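# RMSE values noted from successive runs (presumably on the normalised
# training data; the last one matches the value printed by the fit cell):
# 0.7486788923831885
# 0.6411587970232053
# 0.556122848172312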