In [192]:
import itertools

import numpy as np
import pandas as pd

# Read the raw train/test tables from the working directory.
train_set = pd.read_csv(r'train.csv')
test_set = pd.read_csv(r'test.csv')

# Feature matrix: every training column except the identifier, the target
# and the two columns ("DOB", "PIMGP") that are excluded from the model.
x = train_set.drop(["Id", "Salary", "DOB", "PIMGP"], axis=1).to_numpy()

# Target vector (1-D) taken from the "Salary" column.
y = train_set["Salary"].to_numpy()

# Test feature matrix: same exclusions as above (the test table is read
# without dropping a "Salary" column).
test = test_set.drop(["Id", "DOB", "PIMGP"], axis=1).to_numpy()

In [193]:
# Standardise features column-wise and the target globally.
x_mean = x.mean(0)
x_std = x.std(0)
x_n = (x - x_mean) / x_std
y_n = (y - y.mean()) / y.std()

# The regression weights are learned in the space defined by the TRAINING
# statistics, so the test matrix must be mapped into that same space.
# Standardising it with its own mean/std would shift and rescale every
# feature relative to what the model was fitted on.
test_n = (test - x_mean) / x_std

In [194]:
# Cut the normalised data into 6 consecutive chunks:
#   chunks 0-3 -> training folds (one model is fitted per fold),
#   chunk 4    -> "test" chunk used for attribute/degree selection,
#   chunk 5    -> "eval" chunk kept for the final evaluation.
# np.array_split returns a plain list whose chunks may differ in length by
# one row, so the training folds are kept as a list (slicing) instead of
# being forced through np.delete, which requires equally shaped chunks.
x_folds = np.array_split(x_n, 6)
x_train = x_folds[:4]
x_test = x_folds[4]
x_eval = x_folds[5]

y_folds = np.array_split(y_n, 6)
y_train = y_folds[:4]
y_test = y_folds[4]
y_eval = y_folds[5]

In [195]:
# Single training set for the "no cross-validation" baseline: all training
# folds stacked row-wise, followed by the selection ("test") chunk.
x_tr = np.concatenate([*x_train, x_test], axis=0)
y_tr = np.concatenate([*y_train, y_test])

In [196]:
def get_phi(x, indexes=None, degree=1):
    """Build the polynomial design matrix used by the regressions.

    Two modes, selected by ``indexes``:

    * ``indexes is None``: ``x`` is a single 1-D attribute and ``degree`` an
      int. The result has one row per sample and columns
      ``x**0, x**1, ..., x**degree`` (bias column first).
    * ``indexes`` given: ``x`` is a 2-D matrix (samples x attributes). For
      each selected attribute ``indexes[k]`` the columns
      ``x**1, ..., x**degree[k]`` are emitted, in selection order, and a
      column of ones is appended last (bias column last).

    Parameters
    ----------
    x : numpy.ndarray
        1-D attribute vector (first mode) or 2-D sample matrix (second mode).
    indexes : sequence of int or None
        Columns of ``x`` to use, or None for the single-attribute mode.
    degree : int or sequence of int
        Polynomial degree. In the second mode it may be one degree per
        selected attribute or a single int applied to all of them.

    Returns
    -------
    numpy.ndarray
        The design matrix, one row per sample.
    """
    if indexes is None:
        degrees = np.arange(degree + 1)
        phi = np.tile(x, [degree + 1, 1]).T
        return phi ** degrees

    selected = x.T[np.atleast_1d(indexes)]
    # Accept a scalar degree (e.g. the default) by applying it to every
    # selected attribute; a per-attribute array passes through unchanged.
    per_att_degree = np.broadcast_to(np.asarray(degree), (selected.shape[0],))

    # Duplicate each attribute row once per power it has to be raised to.
    repeated = np.repeat(selected, per_att_degree, axis=0)
    # Matching exponents 1..d for every attribute; the leading empty float
    # array keeps the exponents floating point and handles an empty selection.
    exponents = np.concatenate(
        [np.empty(0)] + [np.arange(1, d + 1) for d in per_att_degree]
    )

    phi = repeated.T ** exponents
    return np.c_[phi, np.ones((x.shape[0], 1))]

In [197]:
def poly_regr(x, y, indexes=None, degree=1):
    """Fit a least-squares polynomial regression and report its training RMSE.

    The design matrix is built by ``get_phi(x, indexes, degree)`` and the
    weights are obtained from the normal equations
    ``(phi^T phi) w = phi^T y``.

    Parameters
    ----------
    x : numpy.ndarray
        Input data, forwarded to ``get_phi``.
    y : numpy.ndarray
        Target values, one per row of the design matrix.
    indexes, degree
        Forwarded unchanged to ``get_phi``.

    Returns
    -------
    tuple
        ``(w, rmse)``. If the normal matrix is singular, ``w`` is ``None``
        and ``rmse`` is the sentinel value ``100``.
    """
    phi = get_phi(x, indexes, degree)
    w = None
    try:
        # Solve the linear system directly instead of forming an explicit
        # inverse: same solution, but cheaper and numerically better behaved.
        w = np.linalg.solve(phi.T.dot(phi), phi.T.dot(y))
        y_pred = phi.dot(w)
        rmse = np.sqrt(np.sum((y - y_pred) ** 2) / y_pred.size)
    except np.linalg.LinAlgError:
        # Singular normal matrix: report a deliberately large error so the
        # callers' "smaller is better" comparisons reject this fit.
        rmse = 100

    return (w, rmse)

In [198]:
def get_best_att(x, y, x_test, y_test, nb_att_to_get, deg):
    """Rank attributes by their individual polynomial fit and keep the best.

    Every attribute is scored once with ``get_best_poly`` (best RMSE on
    ``x_test``/``y_test`` and the degree reaching it). Attributes are then
    picked greedily by increasing RMSE; ties go to the lower column index.
    Only attributes whose RMSE is strictly below 2 are eligible.

    Parameters
    ----------
    x, y : sequence of numpy.ndarray
        Training folds (features and targets), indexed fold by fold.
    x_test, y_test : numpy.ndarray
        Data the per-attribute fits are scored on.
    nb_att_to_get : int
        Number of attributes to select.
    deg : int
        Degree bound forwarded to ``get_best_poly``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(indexes_best_att, best_poly_att)``: the selected column indexes
        in ranking order and the polynomial degree chosen for each.

    Raises
    ------
    ValueError
        If fewer than ``nb_att_to_get`` attributes have an RMSE below 2.
    """
    nb_att_in_x = x[0].shape[1]

    # The score of an attribute does not depend on what was selected before,
    # so compute it once instead of once per selection round.
    scores = [
        get_best_poly(x, y, x_test, y_test, deg, id_att)
        for id_att in range(nb_att_in_x)
    ]

    indexes_best_att = np.zeros(nb_att_to_get, dtype=int)
    best_poly_att = np.zeros(nb_att_to_get, dtype=int)
    # Track chosen attributes explicitly. Testing membership against the
    # zero-initialised result array would make attribute 0 look "already
    # selected" from the start and exclude it forever.
    already_selected = set()

    for i in range(nb_att_to_get):
        best_rmse = 2
        best_att = None

        for id_att, (rmse, _) in enumerate(scores):
            if id_att not in already_selected and rmse < best_rmse:
                best_rmse = rmse
                best_att = id_att

        if best_att is None:
            raise ValueError(
                "only %d attribute(s) reach an RMSE below 2, cannot select %d"
                % (i, nb_att_to_get)
            )

        already_selected.add(best_att)
        indexes_best_att[i] = best_att
        best_poly_att[i] = scores[best_att][1]

    return indexes_best_att, best_poly_att

In [199]:
def get_best_poly(x, y, x_test, y_test, deg, id_att):
    """Find the polynomial degree that best fits a single attribute.

    For each degree in ``1 .. deg - 1`` one model per training fold is
    fitted on column ``id_att`` with ``poly_regr``; the fold models are then
    scored together on the test data by ``get_test_rmse``.

    Returns ``(best_rmse, best_poly)``. If no degree scores strictly below
    2, the result is ``(2, None)``.
    """
    lowest_rmse, chosen_degree = 2, None
    n_folds = len(x)

    for candidate in range(1, deg):
        # One column of weights per fold, candidate + 1 coefficients each.
        fold_weights = np.zeros((candidate + 1, n_folds))

        for fold in range(n_folds):
            fitted = poly_regr(x[fold].T[id_att], y[fold], degree=candidate)
            fold_weights[:, fold] = fitted[0]

        score = get_test_rmse(x_test.T[id_att], y_test, candidate, fold_weights)

        if score < lowest_rmse:
            lowest_rmse, chosen_degree = score, candidate

    return lowest_rmse, chosen_degree

In [200]:
def get_test_rmse(x, y, deg, all_w):
    """RMSE of the fold-averaged prediction for a single attribute.

    ``all_w`` holds one weight column per fold. Each fold's model is applied
    to the degree-``deg`` design matrix of ``x``, the predictions are
    averaged across folds, and the averaged prediction is compared with ``y``.
    """
    per_fold_pred = get_phi(x, degree=deg).dot(all_w)
    residual = y - np.mean(per_fold_pred, axis=1)
    return np.sqrt(np.sum(residual**2) / y.size)

In [201]:
def get_all_w(x, y, indexes, poly):
    """Fit one multi-attribute model per training fold.

    Returns a matrix with one column of weights per fold and
    ``sum(poly) + 1`` rows (one coefficient per polynomial term plus the
    bias), each column coming from ``poly_regr`` on that fold.
    """
    n_folds = len(x)
    weights = np.zeros((np.sum(poly) + 1, n_folds))

    for fold, x_fold in enumerate(x):
        fold_w, _ = poly_regr(x_fold, y[fold], indexes, poly)
        weights[:, fold] = fold_w

    return weights

In [220]:
deg = 4
nb_att_to_get = 15

# For a growing number of selected attributes, fit one model per training
# fold and measure, on the held-out eval chunk, the RMSE of the fold-averaged
# prediction and the average variance between the fold predictions.
for nb_att in range(1, nb_att_to_get):
    indexes, poly = get_best_att(x_train, y_train, x_test, y_test, nb_att, deg)
    print('---------------------------------------------------------')
    print('Polynomials = ', poly)
    all_w = get_all_w(x_train, y_train, indexes, poly)
    phi = get_phi(x_eval, indexes, poly)
    y_pred = phi.dot(all_w)
    y_test_pred_avg = np.mean(y_pred, axis=1)
    # The residuals are taken over y_eval, so the mean must be over
    # y_eval.size as well (the eval and test chunks can differ in length).
    rmse = np.sqrt(np.sum((y_eval - y_test_pred_avg)**2)/y_eval.size)
    var_avg = np.mean(np.var(y_pred, axis=1))

    print('---------------------------------------------------------')
    print('Nombre d\'attributs = ', nb_att)
    print('Variance = ', var_avg)
    print('RMSE = ', rmse)
    

---------------------------------------------------------
Polynomials =  [2]
---------------------------------------------------------
Nombre d'attributs =  1
Variance =  0.0035729495641492904
RMSE =  0.989023701026207
---------------------------------------------------------
Polynomials =  [2 1]
---------------------------------------------------------
Nombre d'attributs =  2
Variance =  0.00963704873481624
RMSE =  0.8164754783570195
---------------------------------------------------------
Polynomials =  [2 1 2]
---------------------------------------------------------
Nombre d'attributs =  3
Variance =  0.035809249064850904
RMSE =  0.7896241136111242
---------------------------------------------------------
Polynomials =  [2 1 2 3]
---------------------------------------------------------
Nombre d'attributs =  4
Variance =  0.07469111563727321
RMSE =  0.7645027617509824
---------------------------------------------------------
Polynomials =  [2 1 2 3 1]
-----------------------------

In [226]:
def get_best_att_combination(x_train, y_train, x_test, y_test, indexes, poly, nb_att=4):
    """Exhaustively search ordered attribute subsets of a fixed size.

    Every ordered selection of ``nb_att`` distinct positions in ``indexes``
    is evaluated: one model per training fold is fitted (``get_all_w``), the
    fold predictions on ``x_test`` are averaged, and both the RMSE of that
    average and the mean variance between fold predictions are recorded.

    Parameters
    ----------
    x_train, y_train : sequence of numpy.ndarray
        Training folds.
    x_test, y_test : numpy.ndarray
        Data the candidate subsets are scored on.
    indexes : numpy.ndarray
        Candidate attribute columns.
    poly : numpy.ndarray
        Polynomial degree associated with each entry of ``indexes``.
    nb_att : int, optional
        Size of the subsets to try (default 4).

    Returns
    -------
    tuple
        ``(best_ids, best_ids_var)``: positions (into ``indexes``) of the
        subset with the lowest RMSE and of the subset with the lowest
        variance. Either is ``None`` if no subset scored below 2 on that
        criterion (or if there are fewer than ``nb_att`` candidates).

    The four summary values are also printed.
    """
    best_rmse = 2
    best_rmse_var = None
    best_ids = None
    best_var = 2
    best_var_rmse = None
    best_ids_var = None

    # permutations() yields the ordered selections in lexicographic order,
    # i.e. the same sequence as nested loops that skip repeated positions.
    # NOTE(review): column order should not change a least-squares fit other
    # than by rounding, so itertools.combinations would likely be enough and
    # much cheaper; permutations are kept to leave tie-breaking untouched.
    for selection in itertools.permutations(range(len(indexes)), nb_att):
        ids = list(selection)
        all_w = get_all_w(x_train, y_train, indexes[ids], poly[ids])
        phi = get_phi(x_test, indexes[ids], poly[ids])
        y_pred = phi.dot(all_w)
        y_test_pred_avg = np.mean(y_pred, axis=1)
        var_avg = np.mean(np.var(y_pred, axis=1))
        rmse = np.sqrt(np.sum((y_test - y_test_pred_avg)**2)/y_test.size)

        if var_avg < best_var:
            best_var = var_avg
            best_var_rmse = rmse
            best_ids_var = ids

        if rmse < best_rmse:
            best_rmse = rmse
            best_rmse_var = var_avg
            best_ids = ids

    print('Best RMSE =', best_rmse)
    print('Variance du best RMSE =', best_rmse_var)
    print('RMSE de la meilleure variance =', best_var_rmse)
    print('Meilleure variance =',best_var)
    return best_ids, best_ids_var

In [227]:
# Keep the 10 best individual attributes, trying degrees below 4 for each.
indexes, poly = get_best_att(
    x_train, y_train, x_test, y_test, nb_att_to_get=10, deg=4
)
print(indexes)
print(poly)

[14  2 13  6  8  1  5 12 11  7]
[2 1 2 3 1 1 1 3 3 1]


In [228]:
# Search the attribute subsets among the pre-selected candidates:
# `ids` minimises the RMSE, `ids_var` minimises the between-fold variance.
ids, ids_var = get_best_att_combination(
    x_train, y_train, x_test, y_test, indexes=indexes, poly=poly
)
print(ids)

Best RMSE = 0.45401100811318884
Variance du best RMSE = 0.0894762355713221
RMSE de la meilleure variance = 0.47824079106949235
Meilleure variance = 0.016141735910692036
[5, 6, 0, 8]


In [229]:
# Evaluate the lowest-RMSE subset on the held-out eval chunk.
# First with the per-fold models (prediction = average over folds) ...
all_w = get_all_w(x_train, y_train, indexes[ids], poly[ids])
phi = get_phi(x_eval, indexes[ids], poly[ids])
y_pred = phi.dot(all_w)
y_test_pred_avg = np.mean(y_pred, axis=1)
var_avg = np.mean(np.var(y_pred, axis=1))
print('Variance =', var_avg)
# Residuals are over y_eval, so average over y_eval.size (not y_test.size).
rmse = np.sqrt(np.sum((y_eval - y_test_pred_avg)**2)/y_eval.size)
print('RMSE =', rmse)

# ... then with a single model fitted on all training rows at once.
w, rmse = poly_regr(x_tr, y_tr, indexes[ids], poly[ids])
y_pred = phi.dot(w)
rmse = np.sqrt(np.sum((y_eval - y_pred)**2)/y_eval.size)
print('RMSE without cross-validation =', rmse)

Variance = 0.059552707160040644
RMSE = 0.7517990169789897
RMSE without cross-validation = 0.755416031033919


In [None]:
# Variance = 0.059552707160040644
# RMSE = 0.7517990169789897
# RMSE without cross-validation = 0.755416031033919

# Pas de limitation
# Variance = 2.815481153482154
# RMSE = 1.4061630209343945
# RMSE without cross-validation = 0.7808409916523522

In [232]:
# Evaluate the lowest-variance subset on the held-out eval chunk.
# First with the per-fold models (prediction = average over folds) ...
all_w = get_all_w(x_train, y_train, indexes[ids_var], poly[ids_var])
phi = get_phi(x_eval, indexes[ids_var], poly[ids_var])
y_pred = phi.dot(all_w)
y_test_pred_avg = np.mean(y_pred, axis=1)
var_avg = np.mean(np.var(y_pred, axis=1))
print('Variance =', var_avg)
# Residuals are over y_eval, so average over y_eval.size (not y_test.size).
rmse = np.sqrt(np.sum((y_eval - y_test_pred_avg)**2)/y_eval.size)
print('RMSE =', rmse)

# ... then with a single model fitted on all training rows at once.
w, rmse = poly_regr(x_tr, y_tr, indexes[ids_var], poly[ids_var])
y_pred = phi.dot(w)
rmse = np.sqrt(np.sum((y_eval - y_pred)**2)/y_eval.size)
print('RMSE without cross-validation =', rmse)

Variance = 0.01963166663906059
RMSE = 0.7555692580353129
RMSE without cross-validation = 0.741282691131884


In [None]:
# Variance = 0.01963166663906059
# RMSE = 0.7555692580353129
# RMSE without cross-validation = 0.741282691131884

# Pas de limitation
# Variance = 0.050255402411157994
# RMSE = 0.7841425385895091
# RMSE without cross-validation = 0.7781391605717576

In [233]:
# Final model: refit the lowest-variance subset on the whole normalised
# training set (all six chunks).
w, rmse = poly_regr(
    x_n, y_n, indexes=indexes[ids_var], degree=poly[ids_var]
)

In [234]:
def test_data(x, y, all_w, indexes, degree=1):
    """Predict with fitted weights and map the result back to target units.

    The design matrix of ``x`` (``get_phi(x, indexes, degree)``) is
    multiplied by ``all_w`` to get predictions on the normalised scale,
    which are then de-normalised with the standard deviation and mean of
    ``y``.
    """
    normalised_pred = get_phi(x, indexes, degree).dot(all_w)
    return (y.std() * normalised_pred) + y.mean()

In [235]:
# Predict the test set with the final weights, in original target units.
y_pred = test_data(
    test_n, y, all_w=w, indexes=indexes[ids_var], degree=poly[ids_var]
)

In [236]:
# Build the submission table and write it as a zipped CSV
# ('out.csv' inside 'sampleSubmission.zip').
# NOTE(review): "Id" is generated as 0..n-1 rather than taken from the test
# file's own "Id" column - confirm this matches the expected submission ids.
compression_opts = {'method': 'zip', 'archive_name': 'out.csv'}
df = pd.DataFrame({"Id": range(y_pred.size), "Salary": y_pred})
df.to_csv('sampleSubmission.zip', index=False, compression=compression_opts)