In [21]:
import pandas as pd
import numpy as np

# Load the raw train / test tables.
train_set = pd.read_csv("train.csv")
test_set = pd.read_csv("test.csv")

# Columns removed from both feature matrices; "Salary" is the target and is
# only present (and only dropped) on the training side.
unused_columns = ["Id", "DOB", "PIMGP"]

x = train_set.drop(columns=["Salary"] + unused_columns).to_numpy()
y = train_set["Salary"].to_numpy()
test = test_set.drop(columns=unused_columns).to_numpy()

In [22]:
# Standardise features and target to zero mean / unit variance.
# The feature statistics are computed on the training matrix only and reused
# for the test matrix: the regression weights are fitted in the training
# scale, so test rows must be mapped into that same scale rather than being
# re-centred with their own mean / std.
x_mean = x.mean(0)
x_std = x.std(0)

x_n = (x - x_mean) / x_std
y_n = (y - y.mean()) / y.std()
test_n = (test - x_mean) / x_std

In [23]:
# Cut the normalised training data into 6 consecutive chunks (file order, no
# shuffling). The last chunk is held out for validation, the first five are
# the training folds.
# The folds are kept as plain lists of 2-D / 1-D arrays: np.array_split
# produces chunks of unequal length when the row count is not a multiple of 6,
# and stacking such chunks into one ndarray (which np.delete would do) fails.
N_FOLDS = 6

x_folds = np.array_split(x_n, N_FOLDS)
y_folds = np.array_split(y_n, N_FOLDS)

x_train, x_test = x_folds[:-1], x_folds[-1]
y_train, y_test = y_folds[:-1], y_folds[-1]

In [24]:
def get_phi(x, indexes=None, degree=1):
    """Build the polynomial design matrix used by the regressions.

    Two modes, selected by ``indexes``:

    * ``indexes is None`` -- ``x`` is a single 1-D attribute and ``degree`` an
      int. Returns an ``(n, degree + 1)`` matrix whose columns are
      ``x**0, x**1, ..., x**degree`` (bias column FIRST).
    * ``indexes`` given -- ``x`` is a 2-D ``(n_samples, n_attributes)`` array,
      ``indexes`` selects attribute columns and ``degree`` gives the highest
      power for each selected attribute (a scalar is applied to all of them).
      Returns, for each selected attribute in order, the columns
      ``x_j**1 .. x_j**d_j``, followed by a trailing column of ones
      (bias column LAST).

    The bias column sits at a different position in the two modes, so weights
    fitted in one mode must not be used with a matrix built in the other.
    """
    if indexes is None:
        exponents = np.arange(degree + 1)
        return np.tile(x, [degree + 1, 1]).T ** exponents

    selected = x.T[indexes]
    # Accept a scalar degree as well as one degree per selected attribute.
    per_attribute = np.broadcast_to(np.atleast_1d(degree), (selected.shape[0],))

    # Exponents 1..d_j for every attribute, concatenated in selection order.
    # The leading empty float array keeps the result float-typed and makes an
    # empty selection yield an empty exponent vector instead of an error.
    exponents = np.concatenate(
        [np.zeros(0)] + [np.arange(1, d + 1) for d in per_attribute]
    )

    # Repeat each attribute row once per power, then raise column-wise.
    powers = np.repeat(selected, per_attribute, axis=0).T ** exponents
    return np.c_[powers, np.ones((x.shape[0], 1))]

In [25]:
def poly_regr(x, y, indexes=None, degree=1):
    """Fit a least-squares polynomial regression and report its training RMSE.

    The design matrix is built by ``get_phi(x, indexes, degree)`` and the
    weights are obtained from the normal equations
    ``(phi^T phi) w = phi^T y``.

    Returns
    -------
    (w, rmse)
        ``w`` is the weight vector and ``rmse`` the root-mean-square error of
        the fit on ``(x, y)`` itself. If the normal matrix is singular,
        ``w`` is ``None`` and ``rmse`` is the sentinel value ``100``.
    """
    phi = get_phi(x, indexes, degree)
    w = None
    try:
        # Solve the linear system directly instead of forming the explicit
        # inverse: same LinAlgError on a singular matrix, better accuracy.
        w = np.linalg.solve(phi.T.dot(phi), phi.T.dot(y))
        y_pred = phi.dot(w)
        rmse = np.sqrt(np.sum((y - y_pred) ** 2) / y_pred.size)
    except np.linalg.LinAlgError:
        # Sentinel "very bad" error so callers comparing RMSEs skip this fit.
        rmse = 100

    return (w, rmse)

In [26]:
def get_best_att(x, y, x_test, y_test, nb_att_to_get, deg):
    """Pick the attributes with the lowest single-attribute validation RMSE.

    Every attribute column is scored independently with ``get_best_poly``
    (which also reports the polynomial degree that achieved the score); the
    ``nb_att_to_get`` best-scoring attributes are returned, best first. Ties
    are resolved in favour of the lower column index.

    Parameters
    ----------
    x, y : sequences of training folds; ``x[0]`` must be 2-D
        ``(n_samples, n_attributes)``.
    x_test, y_test : validation data forwarded to ``get_best_poly``.
    nb_att_to_get : number of attributes to return.
    deg : degree bound forwarded to ``get_best_poly``.

    Returns
    -------
    (indexes_best_att, best_poly_att) : two int arrays of length
        ``nb_att_to_get`` holding the chosen column indexes and the
        polynomial degree selected for each of them.

    Raises
    ------
    ValueError
        If fewer than ``nb_att_to_get`` attributes produced a usable score.
    """
    nb_att_in_x = x[0].shape[1]

    # An attribute's score does not depend on what has already been selected,
    # so each attribute is evaluated exactly once.
    candidates = []
    for id_att in range(nb_att_in_x):
        rmse, poly = get_best_poly(x, y, x_test, y_test, deg, id_att)
        # Skip attributes for which no degree was found or whose score is NaN
        # (e.g. a degenerate fit); they can never be the "best" one.
        if poly is None or np.isnan(rmse):
            continue
        candidates.append((rmse, id_att, poly))

    if len(candidates) < nb_att_to_get:
        raise ValueError(
            "requested %d attributes but only %d could be scored"
            % (nb_att_to_get, len(candidates))
        )

    # list.sort is stable, so equal RMSEs keep ascending column order.
    # Selection works on the explicit candidate list, so column 0 is a valid
    # choice (a zero-filled "already selected" array would mask it).
    candidates.sort(key=lambda candidate: candidate[0])
    chosen = candidates[:nb_att_to_get]

    indexes_best_att = np.array([id_att for _, id_att, _ in chosen], dtype=int)
    best_poly_att = np.array([poly for _, _, poly in chosen], dtype=int)
    return indexes_best_att, best_poly_att

In [27]:
def get_best_poly(x, y, x_test, y_test, deg, id_att):
    """Find the polynomial degree that best fits one attribute.

    For every candidate degree ``1 .. deg - 1`` (``deg`` itself is excluded),
    a single-attribute polynomial is fitted on each training fold with
    ``poly_regr``; the per-fold weights are then scored together on the
    validation data by ``get_test_rmse``.

    Parameters
    ----------
    x, y : sequences of training folds (``x[i]`` 2-D, ``y[i]`` its targets).
    x_test, y_test : validation features (2-D) and targets.
    deg : exclusive upper bound on the candidate degrees.
    id_att : column index of the attribute to evaluate.

    Returns
    -------
    (best_rmse, best_poly) : lowest validation RMSE and the degree achieving
        it. ``(inf, None)`` when no candidate produced a comparable score
        (e.g. ``deg <= 1`` or every score was NaN).
    """
    # Start from +inf so that the genuinely best degree is always reported,
    # whatever the scale of the errors.
    best_rmse = np.inf
    best_poly = None
    n_folds = len(x)

    for poly in range(1, deg):
        # One weight column per training fold (degree + 1 coefficients each).
        all_w = np.zeros((poly + 1, n_folds))

        for i in range(n_folds):
            w, _ = poly_regr(x[i].T[id_att], y[i], degree=poly)
            all_w[:, i] = w

        rmse = get_test_rmse(x_test.T[id_att], y_test, poly, all_w)

        if rmse < best_rmse:
            best_rmse = rmse
            best_poly = poly

    return best_rmse, best_poly

In [28]:
def get_test_rmse(x, y, deg, all_w):
    """Score a set of per-fold single-attribute models on held-out data.

    ``x`` is expanded with ``get_phi(x, degree=deg)``; each column of
    ``all_w`` is one model's weight vector. The models' predictions are
    averaged row-wise and the RMSE of that average against ``y`` is returned.
    """
    per_model_pred = get_phi(x, degree=deg).dot(all_w)
    ensemble_pred = per_model_pred.mean(axis=1)
    squared_residuals = (y - ensemble_pred) ** 2
    return np.sqrt(squared_residuals.sum() / y.size)

In [29]:
def get_all_w(x, y, indexes, poly):
    """Fit one multi-attribute polynomial model per training fold.

    Each fold ``(x[k], y[k])`` is passed to ``poly_regr`` with the given
    attribute ``indexes`` and per-attribute degrees ``poly``. The resulting
    weight vectors are stored as the columns of the returned
    ``(sum(poly) + 1, len(x))`` array.
    """
    n_folds = len(x)
    weights = np.zeros((np.sum(poly) + 1, n_folds))

    for fold in range(n_folds):
        fold_w, _ = poly_regr(x[fold], y[fold], indexes, poly)
        weights[:, fold] = fold_w

    return weights

In [13]:
deg = 20
nb_att_to_get = 15

# get_best_att scores every attribute independently of the others, so the
# selection for k attributes is exactly the first k entries of the selection
# for any larger count. Rank once for the largest subset evaluated below and
# slice, instead of re-running the whole ranking for every subset size.
ranked_indexes, ranked_poly = get_best_att(
    x_train, y_train, x_test, y_test, nb_att_to_get - 1, deg
)

# Note: range() stops before nb_att_to_get, so subsets of size
# 1 .. nb_att_to_get - 1 are evaluated.
for nb_att in range(1, nb_att_to_get):
    indexes, poly = ranked_indexes[:nb_att], ranked_poly[:nb_att]
    print('---------------------------------------------------------')
    print('Polynomials = ', poly)

    # One model per training fold, evaluated together on the held-out fold.
    all_w = get_all_w(x_train, y_train, indexes, poly)
    phi = get_phi(x_test, indexes, poly)
    y_pred = phi.dot(all_w)

    # RMSE of the fold-averaged prediction, and the mean (over validation
    # rows) of the variance between the per-fold predictions.
    y_test_pred_avg = np.mean(y_pred, axis=1)
    rmse = np.sqrt(np.sum((y_test - y_test_pred_avg)**2)/y_test.size)
    var_avg = np.mean(np.var(y_pred, axis=1))

    print('---------------------------------------------------------')
    print('Nombre d\'attributs = ', nb_att)
    print('Variance = ', var_avg)
    print('RMSE = ', rmse)
    

---------------------------------------------------------
Polynomials =  [1]
---------------------------------------------------------
Nombre d'attributs =  1
Variance =  0.02179876293750828
RMSE =  0.7851596940676735
---------------------------------------------------------
Polynomials =  [1 2]
---------------------------------------------------------
Nombre d'attributs =  2
Variance =  0.1481884347217635
RMSE =  0.7571214807742919
---------------------------------------------------------
Polynomials =  [1 2 2]
---------------------------------------------------------
Nombre d'attributs =  3
Variance =  0.15379712690270989
RMSE =  0.7403691575825437
---------------------------------------------------------
Polynomials =  [1 2 2 1]
---------------------------------------------------------
Nombre d'attributs =  4
Variance =  0.19932737219577565
RMSE =  0.7255384721167633
---------------------------------------------------------
Polynomials =  [1 2 2 1 1]
--------------------------------

In [30]:
# Final model: select 4 attributes (degree bound 10) using the fold split,
# then refit a single model on the full normalised training data.
# The printed RMSE is measured on the same data the model was fitted on.
indexes, poly = get_best_att(
    x_train, y_train, x_test, y_test, nb_att_to_get=4, deg=10
)
w, rmse = poly_regr(x_n, y_n, indexes=indexes, degree=poly)
print(rmse)

0.7079516787816333


In [31]:
def test_data(x, y, all_w, indexes, degree=1):
    phi = get_phi(x, indexes, degree)
    y_pred_n = phi.dot(all_w)
    #y_test_pred_avg = np.mean(y_pred_n, axis=1)
    y_pred = (y.std() * y_pred_n) + y.mean()
    
    return y_pred

In [32]:
# Predict on the normalised test features; the raw training target `y`
# supplies the mean / std used to undo the target normalisation.
y_pred = test_data(x=test_n, y=y, all_w=w, indexes=indexes, degree=poly)

In [33]:
# Build the submission table. The identifiers are taken from the test set's
# own "Id" column (row order matches `test`, which was derived from
# `test_set`) rather than being regenerated as 0..n-1, so the file stays
# correct even if the test ids are not a zero-based contiguous range.
df = pd.DataFrame({"Id": test_set["Id"].to_numpy(), "Salary": y_pred})

# Written as a zip archive containing a single file named out.csv.
compression_opts = dict(method='zip', archive_name='out.csv')
df.to_csv('sampleSubmission.zip', index=False, compression=compression_opts)

In [219]:
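# Recorded result: 0.45847234649673924 (presumably the score obtained for the submission written above; metric and source are not stated here -- TODO confirm)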