In [629]:
import pandas as pd
import numpy as np

# Read the raw training and test tables from the working directory.
train_set = pd.read_csv('train.csv')
test_set = pd.read_csv('test.csv')

# Columns left out of both feature matrices; "Salary" is the regression
# target, so it is additionally removed from the training features.
ignored_columns = ["Id", "DOB", "PIMGP"]

x = train_set.drop(columns=ignored_columns + ["Salary"]).to_numpy()
y = train_set["Salary"].to_numpy()
test = test_set.drop(columns=ignored_columns).to_numpy()

In [630]:
# Standardise every feature column with statistics taken from the training
# features only. The test features must go through the *same* transform as
# the data the weights are fitted on; scaling them with their own mean/std
# would put them on a different scale than the one the model has learned.
x_mean = x.mean(0)
x_std = x.std(0)
# A constant column has zero spread; divide by 1 instead so it becomes an
# all-zero column rather than NaN/inf.
x_std = np.where(x_std == 0, 1.0, x_std)

x_n = (x - x_mean) / x_std
# The target is standardised too; predictions are mapped back to the original
# scale later using y.mean() and y.std().
y_n = (y - y.mean()) / y.std()
test_n = (test - x_mean) / x_std

In [631]:
# Cut the normalised data into N_FOLDS consecutive chunks. The last chunk is
# held out for evaluation, the others are used as separate training folds.
N_FOLDS = 6

x_folds = np.array_split(x_n, N_FOLDS)
y_folds = np.array_split(y_n, N_FOLDS)

# np.array_split returns chunks of unequal length whenever the number of rows
# is not a multiple of N_FOLDS. Keep the folds as plain lists of arrays and
# slice them, instead of passing them through np.delete, which first has to
# coerce the list into one rectangular array and breaks on ragged chunks.
x_train, x_test = x_folds[:-1], x_folds[-1]
y_train, y_test = y_folds[:-1], y_folds[-1]

In [632]:
def get_phi(x, indexes=None, degree=1):
    """Build the polynomial design matrix for a least-squares fit.

    Parameters
    ----------
    x : array-like
        Either a 1-D vector holding one attribute, or a 2-D matrix with one
        row per sample and one column per attribute.
    indexes : array-like of int, optional
        Columns of ``x`` to use. Ignored for the 1-D / ``degree == 1`` case.
        When omitted for a 2-D ``x``, every column is used.
    degree : int
        Highest power of each selected attribute to include.

    Returns
    -------
    numpy.ndarray
        * 1-D ``x`` with ``degree == 1``: two columns ``[1, x]`` (bias first).
        * otherwise: the selected attributes raised to the power 1, then the
          same attributes raised to the power 2, and so on up to ``degree``,
          followed by a final bias column of ones.
    """
    x = np.asarray(x)

    if degree == 1 and x.ndim == 1:
        # Single attribute, linear model: columns are x**0 and x**1.
        return np.tile(x, [2, 1]).T ** np.arange(2)

    if indexes is None and x.ndim > 1:
        indexes = np.arange(x.shape[1])

    # One row per selected attribute (a lone attribute is promoted to a row).
    selected = np.atleast_2d(x.T[indexes])

    # np.tile stacks the whole attribute group `degree` times, so the matching
    # exponents are 1 repeated for the first group, 2 for the second, ...
    # Building them with integer ops avoids relying on float truncation.
    powers = np.repeat(np.arange(1, degree + 1), selected.shape[0])

    phi = np.tile(selected, [degree, 1]).T ** powers
    # Bias term goes in the last column.
    return np.c_[phi, np.ones((x.shape[0], 1))]

In [633]:
def poly_regr(x, y, indexes=None, degree=1):
    """Fit a polynomial least-squares model and report its fit error.

    The design matrix comes from ``get_phi(x, indexes, degree)`` and the
    weights are the solution of the normal equations
    ``(phi.T phi) w = phi.T y``.

    Parameters
    ----------
    x, indexes, degree
        Forwarded unchanged to ``get_phi``.
    y : numpy.ndarray
        Target values, one per row of the design matrix.

    Returns
    -------
    tuple
        ``(w, rmse)`` where ``w`` holds the fitted weights and ``rmse`` is the
        root-mean-square error of the fit on the data it was fitted to.
        If the normal equations are singular, ``w`` is ``None`` and ``rmse``
        is the sentinel value ``100``.
    """
    phi = get_phi(x, indexes, degree)
    w = None
    try:
        # Solve the linear system directly rather than forming an explicit
        # inverse: same solution, less round-off, and a singular system still
        # raises LinAlgError.
        w = np.linalg.solve(phi.T.dot(phi), phi.T.dot(y))
        y_pred = phi.dot(w)
        rmse = np.sqrt(np.sum((y - y_pred) ** 2) / y_pred.size)
    except np.linalg.LinAlgError:
        # Sentinel error for an unsolvable fit; callers in this file only
        # accept fits whose error is below 2, so this value is never selected.
        rmse = 100

    return (w, rmse)

In [634]:
def get_best_attributes_from_x(x, y, degree, nb_max_attributes):
    """Rank attributes by how well each one alone fits ``y`` linearly.

    Every column of ``x`` is fitted on its own with ``poly_regr`` (default
    degree 1) and the columns are ordered by increasing fit RMSE. Columns
    whose RMSE is not below 2 (including failed or NaN fits) are never
    selected. Ties keep the lower column index first.

    Parameters
    ----------
    x : numpy.ndarray
        Feature matrix, one row per sample and one column per attribute.
    y : numpy.ndarray
        Target values.
    degree : int
        Unused; kept so the signature stays compatible with existing callers.
    nb_max_attributes : int
        Maximum number of column indexes to return.

    Returns
    -------
    numpy.ndarray
        Integer column indexes, best first. Holds fewer than
        ``nb_max_attributes`` entries when not enough columns qualify.

    Raises
    ------
    ValueError
        If ``nb_max_attributes`` is negative.
    """
    if nb_max_attributes < 0:
        raise ValueError("nb_max_attributes must be non-negative")

    columns = np.asarray(x).T

    # A column's score does not depend on which columns were already picked,
    # so each regression is fitted once instead of once per requested slot.
    rmses = np.array([poly_regr(column, y)[1] for column in columns], dtype=float)

    # Stable sort keeps the lowest index first among equal scores.
    ranking = np.argsort(rmses, kind="stable")
    eligible = ranking[rmses[ranking] < 2]

    return eligible[:nb_max_attributes]

In [635]:
def get_best_attributes(x, y, degree, nb_attributes):
    """Pick the attribute set that yields the lowest fit error on any fold.

    For every fold, the attributes are chosen with
    ``get_best_attributes_from_x`` and a model of the given ``degree`` is
    fitted on that fold with ``poly_regr``. The attribute set belonging to
    the fold with the smallest fit RMSE is returned.

    Parameters
    ----------
    x, y
        Position-indexable collections holding one feature array and one
        target array per fold.
    degree : int
        Polynomial degree used for the per-fold fit.
    nb_attributes : int
        Number of attributes requested per fold.

    Returns
    -------
    The attribute indexes of the winning fold, or ``None`` when no fold
    reaches an RMSE below 2.
    """
    winning_indexes = None
    lowest_rmse = 2

    for fold, x_fold in enumerate(x):
        y_fold = y[fold]
        candidate = get_best_attributes_from_x(x_fold, y_fold, degree, nb_attributes)
        _, fold_rmse = poly_regr(x_fold, y_fold, candidate, degree)

        if fold_rmse < lowest_rmse:
            lowest_rmse, winning_indexes = fold_rmse, candidate

    return winning_indexes

In [636]:
def get_all_w(x, y, indexes, degree):
    """Fit one model per fold and collect the weight vectors column-wise.

    Parameters
    ----------
    x, y
        Position-indexable collections holding one feature array and one
        target array per fold.
    indexes
        Attribute indexes handed to ``poly_regr`` for every fold.
    degree : int
        Polynomial degree handed to ``poly_regr`` for every fold.

    Returns
    -------
    numpy.ndarray
        Array with ``len(indexes) * degree + 1`` rows and one column per
        fold; column ``i`` holds the weights fitted on fold ``i``.
    """
    n_weights = len(indexes) * degree + 1
    n_folds = len(x)
    all_w = np.zeros((n_weights, n_folds))

    for fold in range(n_folds):
        # Only the weights are kept; the fit error is not needed here.
        fold_w, _ = poly_regr(x[fold], y[fold], indexes, degree)
        all_w[:, fold] = fold_w

    return all_w

In [637]:
# Model hyper-parameters: polynomial degree and number of attributes to keep.
degree, nb_attributes = 2, 5

# Attribute selection over the training folds.
best_indexes = get_best_attributes(
    x=x_train,
    y=y_train,
    degree=degree,
    nb_attributes=nb_attributes,
)

In [638]:
# One weight vector per training fold, stored column-wise.
all_w = get_all_w(x_train, y_train, best_indexes, degree)

# Predict the held-out chunk with every fold model and average the results.
phi = get_phi(x_test, best_indexes, degree)
y_pred = phi.dot(all_w)
y_test_pred_avg = y_pred.mean(axis=1)

# RMSE of the averaged prediction, in normalised target units.
rmse = np.sqrt(np.square(y_test - y_test_pred_avg).sum() / y_test.size)

print(rmse)

0.7208192548835631


In [639]:
def test_data(x, y, all_w, indexes, degree=1):
    """Predict targets for ``x`` by averaging the per-fold models.

    Parameters
    ----------
    x
        Feature data passed to ``get_phi`` together with ``indexes`` and
        ``degree``.
    y : numpy.ndarray
        Values whose mean and standard deviation are used to map the averaged
        prediction back from normalised units.
    all_w : numpy.ndarray
        Weight matrix with one column per fitted model.
    indexes
        Attribute indexes used to build the design matrix.
    degree : int
        Polynomial degree used to build the design matrix.

    Returns
    -------
    numpy.ndarray
        One prediction per row of the design matrix, rescaled with
        ``y.std()`` and shifted by ``y.mean()``.
    """
    design = get_phi(x, indexes, degree)
    per_model_predictions = design.dot(all_w)
    # Average across models (one column per model).
    averaged_n = per_model_predictions.mean(axis=1)

    return averaged_n * y.std() + y.mean()

In [640]:
# Final predictions for the normalised test features, rescaled with the raw
# training target `y`.
y_pred = test_data(
    x=test_n,
    y=y,
    all_w=all_w,
    indexes=best_indexes,
    degree=degree,
)

In [628]:
# Write the predictions as `out.csv` inside the archive `sampleSubmission.zip`.
compression_opts = {'method': 'zip', 'archive_name': 'out.csv'}

# NOTE(review): "Id" is a running counter starting at 0, not the "Id" column
# of test.csv -- confirm that this is what the submission format expects.
df = pd.DataFrame({"Id": range(y_pred.size), "Salary": y_pred})
df.to_csv('sampleSubmission.zip', compression=compression_opts, index=False)

In [219]:
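# Scores noted from earlier runs (presumably RMSE; the settings that produced
# them were not recorded):
# 0.7486788923831885
# 0.6411587970232053
# 0.556122848172312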