In [37]:
#from implementation import *
#from proj1_helpers import *
#from cross_validation import *
from data_preprocessing import *
import pandas as pd 
import csv

# Locations of the raw training / test CSV files, relative to the working directory.
TRAIN_PATH = "./data/train.csv"
TEST_PATH = "./data/test.csv"
# Column indices handed to np.genfromtxt (`usecols`) by load_csv_data when
# cut_values=True. Indices 17, 20, 22, 27 and 30 are deliberately absent.
USE_COLS = (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 19, 21, 23, 24, 25, 26, 28, 29, 31)

def make_predictions(weight, degree, name, mean=None, std=None):
    """Predict labels for the raw test set and write them to ``name + ".csv"``.

    Args:
        weight: weight vector passed to ``predict_labels``.
        degree: polynomial degree passed to ``build_poly``.
        name: output file name without the ``.csv`` extension.
        mean, std: optional standardization statistics (e.g. the ones computed
            on the training set). When either is omitted, the test set is
            standardized with its own statistics, as before.
    """
    # Labels of the test file are not needed here, only the features and ids.
    _, x_test, ids_test = load_csv_data(TEST_PATH, sub_sample=False)
    if mean is None or std is None:
        x_test, _, _ = standardize(x_test)
    else:
        x_test, _, _ = standardize(x_test, mean, std)
    tx_poly = build_poly(x_test, degree)
    y_pred = predict_labels(weight, tx_poly)
    create_csv_submission(ids_test, y_pred, name + ".csv")


def calculate_mse(e):
    """Return half of the mean of the squared entries of the error vector ``e``."""
    squared_errors = e ** 2
    return 0.5 * np.mean(squared_errors)


def calculate_mae(e):
    """Return the mean of the absolute values of the error vector ``e``."""
    magnitudes = np.absolute(e)
    return np.mean(magnitudes)


def compute_loss(y, tx, w, mse=True):
    """Return the loss of the linear model ``tx.dot(w)`` against ``y``.

    The residual ``y - tx.dot(w)`` is scored with ``calculate_mse`` when
    ``mse`` is truthy and with ``calculate_mae`` otherwise.
    """
    residual = y - tx.dot(w)
    loss_fn = calculate_mse if mse else calculate_mae
    return loss_fn(residual)
    
def build_k_indices(y, k_fold, seed):
    """Return ``k_fold`` equally sized groups of shuffled row indices.

    NumPy's global random state is seeded with ``seed`` before the rows are
    permuted. Each fold holds ``int(len(y) / k_fold)`` indices, so rows left
    over after the last full fold are not assigned to any fold.
    """
    n_samples = y.shape[0]
    fold_size = int(n_samples / k_fold)
    np.random.seed(seed)
    shuffled = np.random.permutation(n_samples)
    folds = []
    for fold in range(k_fold):
        first = fold * fold_size
        folds.append(shuffled[first: first + fold_size])
    return np.array(folds)

def ridge_regression(y, tx, lambda_):
    """Solve the ridge-regression normal equations and return the weights.

    The linear system is ``(tx^T tx + lambda_ * 2 * len(y) * I) w = tx^T y``.
    """
    n_features = tx.shape[1]
    penalty = lambda_ * 2 * len(y)
    lhs = tx.T.dot(tx) + penalty * np.identity(n_features)
    rhs = tx.T.dot(y)
    return np.linalg.solve(lhs, rhs)

In [38]:
def load_csv_data(data_path, sub_sample=False, cut_values=True):
    """Load a CSV file and return ``(yb, input_data, ids)``.

    Args:
        data_path: path of a comma-separated file with one header row, the
            event id in column 0 and the class label in column 1.
        sub_sample: when true, keep only every 50th row.
        cut_values: when true, read only the columns listed in ``USE_COLS``;
            otherwise read every column.

    Returns:
        yb: float array of labels, -1 where the label column is ``'b'`` and 1
            everywhere else.
        input_data: float array holding every column after the first two.
        ids: integer array built from column 0.
    """
    # np.genfromtxt squeezes its result, so a file with a single data row
    # yields a 0-d label array / 1-d feature array; restore the row axis.
    y = np.atleast_1d(
        np.genfromtxt(data_path, delimiter=",", skip_header=1, dtype=str, usecols=1))
    if cut_values:
        print('load_csv_data : dropping uniform distribution values')
        # drop Pri_tau_phi(17), Pri_lep_phi(20), Pri_met_phi(22), Pri_jet_leading_Phi(27), Pri_jet_subleading_phi(30)
        # because of uniform distribution
        x = np.genfromtxt(data_path, delimiter=",", skip_header=1,
                          usecols=USE_COLS)
    else:
        x = np.genfromtxt(data_path, delimiter=",", skip_header=1)
    x = np.atleast_2d(x)

    # `np.int` was removed in NumPy 1.24; the builtin `int` is the documented replacement.
    ids = x[:, 0].astype(int)
    input_data = x[:, 2:]

    # convert class labels from strings to binary (-1,1)
    yb = np.ones(len(y))
    yb[y == 'b'] = -1

    # sub-sample
    if sub_sample:
        yb = yb[::50]
        input_data = input_data[::50]
        ids = ids[::50]

    return yb, input_data, ids

In [39]:
def load_data():
    """Load the raw training and test files with ``load_csv_data``.

    Returns the six values ``y_tr, x_tr, ids_tr, y_te, x_te, ids_te``.
    """
    print('Split_data_according_to_jet')
    print('Loading files...')
    train_parts = load_csv_data(TRAIN_PATH)
    test_parts = load_csv_data(TEST_PATH)
    return (*train_parts, *test_parts)

def load_headers():
    """Return the training-file header names of the feature columns in USE_COLS.

    The first two entries of ``USE_COLS`` (the id and prediction columns) are
    skipped, so the result lines up with the feature matrix columns.
    """
    with open(TRAIN_PATH) as train_file:
        header_row = next(csv.reader(train_file))

    # USE_COLS[2:] leaves out the ID and Predictions columns
    return [header_row[col] for col in USE_COLS[2:]]

In [40]:
# Load labels, features and ids of the raw train and test files once; later cells reuse them.
y_tr, x_tr, ids_tr, y_te, x_te, ids_te = load_data()

Split_data_according_to_jet
Loading files...
load_csv_data : dropping uniform distribution values
load_csv_data : dropping uniform distribution values


In [41]:
# Names of the feature columns kept by USE_COLS (id and prediction columns excluded).
headers = load_headers()
headers

['DER_mass_MMC',
 'DER_mass_transverse_met_lep',
 'DER_mass_vis',
 'DER_pt_h',
 'DER_deltaeta_jet_jet',
 'DER_mass_jet_jet',
 'DER_prodeta_jet_jet',
 'DER_deltar_tau_lep',
 'DER_pt_tot',
 'DER_sum_pt',
 'DER_pt_ratio_lep_tau',
 'DER_met_phi_centrality',
 'DER_lep_eta_centrality',
 'PRI_tau_pt',
 'PRI_tau_eta',
 'PRI_lep_pt',
 'PRI_lep_eta',
 'PRI_met',
 'PRI_met_sumet',
 'PRI_jet_num',
 'PRI_jet_leading_pt',
 'PRI_jet_leading_eta',
 'PRI_jet_subleading_pt',
 'PRI_jet_subleading_eta',
 'PRI_jet_all_pt']

In [42]:
def remove_outlier_in_DER_pt_h(y_tr, x_tr, ids_tr, jet):
    """Keep only the rows whose column 3 (DER_pt_h) is below the jet's threshold.

    Thresholds per jet number: 120 for jet 0 and 800 for jet 2 (chosen just
    above the largest non-outlier values, 117.707 and 734), and 999 for
    jets 1 and 3.

    Returns the filtered ``y_tr, x_tr, ids_tr``.
    """
    print("Removing outliers in Der_pt_h")
    thresholds = (120, 999, 800, 999)
    keep = x_tr[:, 3] < thresholds[jet]
    return y_tr[keep], x_tr[keep], ids_tr[keep]

In [43]:
def remove_all_NAN_columns(x_tr, x_te, headers_jet):
    """Drop the columns that are entirely -999 in ``x_tr``.

    The columns are selected from the training matrix only and the same
    positions are then removed from ``x_te`` and ``headers_jet``.

    Returns the reduced ``x_tr, x_te, headers_jet``.
    """
    print("Removing nan columns")
    # A column qualifies when every one of its training entries equals -999
    fully_missing = (x_tr == -999).all(axis=0)
    nan_cols = np.flatnonzero(fully_missing).tolist()

    return (
        np.delete(x_tr, nan_cols, axis=1),
        np.delete(x_te, nan_cols, axis=1),
        np.delete(headers_jet, nan_cols),
    )

In [44]:
def split_data_according_to_mass(x, y, ids):
    """Split rows by the value of column 0 (DER_mass_MMC).

    Rows whose column 0 equals -999 form the "invalid mass" group and rows
    whose column 0 is strictly positive form the "valid mass" group; any other
    row ends up in neither group.

    Returns ``x_invalid, x_valid, y_invalid, y_valid, ids_invalid, ids_valid``.
    """
    mass = x[:, 0]
    is_invalid = mass == -999
    is_valid = mass > 0

    x_invalid, y_invalid, ids_invalid = x[is_invalid], y[is_invalid], ids[is_invalid]
    x_valid, y_valid, ids_valid = x[is_valid], y[is_valid], ids[is_valid]

    return x_invalid, x_valid, y_invalid, y_valid, ids_invalid, ids_valid
    

In [45]:
def output_to_csv(x, y, ids, headers, jet, isTrain, isMassValid):
    """Write one jet/mass subset to ``./data/{train|test}_jet_<jet>_{valid|invalid}_mass.csv``.

    Args:
        x: 2-D feature rows. When ``isMassValid`` is false the caller must
            already have removed the 'DER_mass_MMC' column from ``x``.
        y: labels, each equal to -1 or 1.
        ids: event ids, written as integers.
        headers: feature names (list or array), still including
            'DER_mass_MMC'.
        jet: jet number used in the file name.
        isTrain: selects the ``train_`` or ``test_`` file prefix.
        isMassValid: selects the file suffix and whether 'DER_mass_MMC' is
            kept in the header row.

    Raises:
        Exception: if a label is neither -1 nor 1.
        ValueError: if a row's length differs from the number of feature names.
    """
    # Build the field names as plain Python strings. np.insert on a
    # fixed-width NumPy string array silently truncates 'Id'/'Prediction'
    # whenever every feature name is shorter than them.
    feature_names = [str(h) for h in headers]

    # Remove 'DER_mass_MMC' if mass is not valid
    if not isMassValid:
        feature_names = [h for h in feature_names if h != 'DER_mass_MMC']
    fieldnames = ['Id', 'Prediction'] + feature_names

    # Generate file name
    base = './data/train_' if isTrain else './data/test_'
    valid = '_valid_mass' if isMassValid else '_invalid_mass'
    file_name = base + 'jet_' + str(jet) + valid + '.csv'

    print("Outputing {}".format(file_name))

    # newline='' is what the csv module documents for files it writes to;
    # without it, platforms that translate '\n' emit an extra blank line per row.
    with open(file_name, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for id_, y_, x_ in zip(ids, y, x):
            if (y_ != -1 and y_ != 1):
                raise Exception('Prediction not -1 and 1!!!')
            if len(x_) != len(feature_names):
                raise ValueError(
                    'Row has {} values but {} feature names were given'.format(
                        len(x_), len(feature_names)))
            # Fresh dict per row so no value can leak from the previous row.
            # Labels: 1 -> 's', -1 -> 'b'
            row = {'Id': int(id_), 'Prediction': 's' if y_ == 1 else 'b'}
            for feature_name, x_value in zip(feature_names, x_):
                row[feature_name] = float(x_value)
            writer.writerow(row)


In [46]:
def split_data_according_to_jet_and_mass(y_tr, x_tr, ids_tr, y_te, x_te, ids_te, headers):
    """Split train and test data by jet number and mass validity and write CSVs.

    For each jet number 0..3 the rows with that PRI_jet_num are selected, the
    PRI_jet_num column is dropped, DER_pt_h outliers are removed from the
    training rows, all-(-999) columns are dropped, and the result is split by
    the validity of DER_mass_MMC. Four files are written per jet through
    ``output_to_csv`` (train/test x invalid/valid mass). Returns nothing.
    """
    # PRI_jet_num (24 -> 24 - 3 cols dropped before 24 - 2 cols (id, label) = col 19)
    col_jet = 19

    for jet in range(4):
        print("\n\nSplitting for jet {}".format(jet))

        # Row masks for this jet number, computed once per data set
        tr_rows = x_tr[:, col_jet] == jet
        te_rows = x_te[:, col_jet] == jet

        # TRAIN - rows of this jet without the PRI_jet_num column
        x_tr_jet = np.delete(x_tr[tr_rows], col_jet, axis=1)
        y_tr_jet = y_tr[tr_rows]
        ids_tr_jet = ids_tr[tr_rows]

        # TEST - same selection on the test set
        x_te_jet = np.delete(x_te[te_rows], col_jet, axis=1)
        y_te_jet = y_te[te_rows]
        ids_te_jet = ids_te[te_rows]

        # Header list without PRI_jet_num
        headers_jet = np.delete(headers, col_jet)

        # Outliers are removed from the training rows only
        y_tr_jet, x_tr_jet, ids_tr_jet = remove_outlier_in_DER_pt_h(
            y_tr_jet, x_tr_jet, ids_tr_jet, jet)

        # For jet 0 the last column (PRI_jet_all_pt) only contains 0 values
        if jet == 0:
            print("Deleted col Pri_jet_all_pt in set with jet_num = 0")
            x_tr_jet = np.delete(x_tr_jet, -1, axis=1)
            x_te_jet = np.delete(x_te_jet, -1, axis=1)
            headers_jet = np.delete(headers_jet, -1)

        # Drop the columns that hold nothing but -999 in the training rows
        x_tr_jet, x_te_jet, headers_jet = remove_all_NAN_columns(x_tr_jet, x_te_jet, headers_jet)

        # Split again on valid / invalid DER_mass_MMC
        (x_tr_bad, x_tr_ok, y_tr_bad, y_tr_ok,
         ids_tr_bad, ids_tr_ok) = split_data_according_to_mass(x_tr_jet, y_tr_jet, ids_tr_jet)
        (x_te_bad, x_te_ok, y_te_bad, y_te_ok,
         ids_te_bad, ids_te_ok) = split_data_according_to_mass(x_te_jet, y_te_jet, ids_te_jet)

        # 'DER_mass_MMC' (col 0) carries no information when the mass is invalid
        x_tr_bad = np.delete(x_tr_bad, 0, axis=1)
        x_te_bad = np.delete(x_te_bad, 0, axis=1)

        # Save into CSV: train first, then test; invalid mass before valid mass
        subsets = (
            (x_tr_bad, y_tr_bad, ids_tr_bad, True, False),
            (x_tr_ok, y_tr_ok, ids_tr_ok, True, True),
            (x_te_bad, y_te_bad, ids_te_bad, False, False),
            (x_te_ok, y_te_ok, ids_te_ok, False, True),
        )
        for x_part, y_part, ids_part, is_train, is_mass_valid in subsets:
            output_to_csv(x_part, y_part, ids_part, headers_jet, jet, is_train, is_mass_valid)


In [None]:
# Write the per-jet / per-mass CSV files under ./data (side effect only, nothing is returned).
split_data_according_to_jet_and_mass(y_tr, x_tr, ids_tr, y_te, x_te, ids_te, headers)


In [47]:
# Read the real files
def generate_processed_filenames(isTrain):
    """Return the eight processed-file paths for the train or the test set.

    The four valid-mass files (jets 0..3) come first, followed by the four
    invalid-mass files.
    """
    prefix = './data/train_' if isTrain else './data/test_'
    return [
        prefix + 'jet_' + str(jet) + mass_suffix + '.csv'
        for mass_suffix in ('_valid_mass', '_invalid_mass')
        for jet in range(4)
    ]

# Paths of the processed training files; the training loop zips over this list.
file_names = generate_processed_filenames(True)
file_names

['./data/train_jet_0_valid_mass.csv',
 './data/train_jet_1_valid_mass.csv',
 './data/train_jet_2_valid_mass.csv',
 './data/train_jet_3_valid_mass.csv',
 './data/train_jet_0_invalid_mass.csv',
 './data/train_jet_1_invalid_mass.csv',
 './data/train_jet_2_invalid_mass.csv',
 './data/train_jet_3_invalid_mass.csv']

In [48]:
def load_processed_data(isTrain):
    """Load all Train/Test processed data.

    Every file returned by ``generate_processed_filenames(isTrain)`` is read
    with ``load_csv_data(..., cut_values=False)``. The previous version
    stopped after the first four names, so the invalid-mass subsets were
    never loaded.

    Returns:
        Three lists (labels, feature matrices, ids), one entry per file and in
        the same order as the generated file names.
    """
    ys = []
    xs = []
    ids = []

    for file_name in generate_processed_filenames(isTrain):
        y, x, id_ = load_csv_data(file_name, cut_values=False)
        ys.append(y)
        xs.append(x)
        ids.append(id_)

    return ys, xs, ids

In [49]:
# Per-subset lists of training labels, feature matrices and ids read from the processed CSV files.
ys_train, xs_train, ids_train = load_processed_data(True)

In [50]:
def standardize(x, mean=None, std=None):
    """Standardize ``x`` column-wise and return ``(x_standardized, mean, std)``.

    Args:
        x: data set to standardize.
        mean, std: statistics to apply. When either is None, both are computed
            from ``x`` along axis 0.

    Returns:
        The standardized data followed by the mean and std that were used
        (returned unchanged, so they can be reused on another data set).

    Columns whose std is 0 (constant columns) are only centered: they are
    divided by 1 instead of 0, which would otherwise produce NaN/inf.
    """
    if mean is None or std is None:
        mean_x = np.mean(x, axis=0)
        std_x = np.std(x, axis=0)
    else:
        mean_x = mean
        std_x = std

    # Guard against division by zero for constant columns
    safe_std = np.where(np.asarray(std_x) == 0, 1.0, std_x)

    x = x - mean_x
    x = x / safe_std

    return x, mean_x, std_x

In [51]:
# Standardize every training subset independently and keep the per-subset
# mean/std next to the standardized features (same order as xs_train).
# NOTE: the loop variables (x_, x, mean_x, std_x) stay defined at module level
# after the loop; cross_validation_demo further down reads a global named `x`.
x_means = []
x_stds = []
x_standardized = []

for x_ in xs_train:
    x, mean_x, std_x = standardize(x_)
    x_standardized.append(x)
    x_means.append(mean_x)
    x_stds.append(std_x)

In [94]:
def generate_lambdas(start, end, deg):
    """Build a grid of candidate lambda values.

    * If ``start`` or ``end`` is a float, return ``deg`` evenly spaced values
      between them (``np.linspace``).
    * Otherwise return ``digit * 10**-exponent`` for every exponent in
      ``1..deg`` and every digit in ``start..end``, ordered by exponent first.
      Each generated value is also printed with 20 decimals.
    """
    if isinstance(start, float) or isinstance(end, float):
        return np.linspace(start, end, deg)

    lambdas = np.zeros(deg * (end - start + 1))
    pairs = ((exponent, digit)
             for exponent in range(1, deg + 1)
             for digit in range(start, end + 1))
    for position, (exponent, digit) in enumerate(pairs):
        scale = 10 ** int(-exponent)
        value = digit * scale
        lambdas[position] = value
        print(digit, " to the ", scale, " : ", '{0:.20f}'.format(value))
    return lambdas

# Demo: digits 1..9 at 12 decades (1e-1 .. 1e-12) -> 108 values; each one is also printed.
generate_lambdas(1, 9, 12)

1  to the  0.1  :  0.10000000000000000555
2  to the  0.1  :  0.20000000000000001110
3  to the  0.1  :  0.30000000000000004441
4  to the  0.1  :  0.40000000000000002220
5  to the  0.1  :  0.50000000000000000000
6  to the  0.1  :  0.60000000000000008882
7  to the  0.1  :  0.70000000000000006661
8  to the  0.1  :  0.80000000000000004441
9  to the  0.1  :  0.90000000000000002220
1  to the  0.01  :  0.01000000000000000021
2  to the  0.01  :  0.02000000000000000042
3  to the  0.01  :  0.02999999999999999889
4  to the  0.01  :  0.04000000000000000083
5  to the  0.01  :  0.05000000000000000278
6  to the  0.01  :  0.05999999999999999778
7  to the  0.01  :  0.07000000000000000666
8  to the  0.01  :  0.08000000000000000167
9  to the  0.01  :  0.08999999999999999667
1  to the  0.001  :  0.00100000000000000002
2  to the  0.001  :  0.00200000000000000004
3  to the  0.001  :  0.00300000000000000006
4  to the  0.001  :  0.00400000000000000008
5  to the  0.001  :  0.00500000000000000010
6  to the  0.00

array([1.e-01, 2.e-01, 3.e-01, 4.e-01, 5.e-01, 6.e-01, 7.e-01, 8.e-01,
       9.e-01, 1.e-02, 2.e-02, 3.e-02, 4.e-02, 5.e-02, 6.e-02, 7.e-02,
       8.e-02, 9.e-02, 1.e-03, 2.e-03, 3.e-03, 4.e-03, 5.e-03, 6.e-03,
       7.e-03, 8.e-03, 9.e-03, 1.e-04, 2.e-04, 3.e-04, 4.e-04, 5.e-04,
       6.e-04, 7.e-04, 8.e-04, 9.e-04, 1.e-05, 2.e-05, 3.e-05, 4.e-05,
       5.e-05, 6.e-05, 7.e-05, 8.e-05, 9.e-05, 1.e-06, 2.e-06, 3.e-06,
       4.e-06, 5.e-06, 6.e-06, 7.e-06, 8.e-06, 9.e-06, 1.e-07, 2.e-07,
       3.e-07, 4.e-07, 5.e-07, 6.e-07, 7.e-07, 8.e-07, 9.e-07, 1.e-08,
       2.e-08, 3.e-08, 4.e-08, 5.e-08, 6.e-08, 7.e-08, 8.e-08, 9.e-08,
       1.e-09, 2.e-09, 3.e-09, 4.e-09, 5.e-09, 6.e-09, 7.e-09, 8.e-09,
       9.e-09, 1.e-10, 2.e-10, 3.e-10, 4.e-10, 5.e-10, 6.e-10, 7.e-10,
       8.e-10, 9.e-10, 1.e-11, 2.e-11, 3.e-11, 4.e-11, 5.e-11, 6.e-11,
       7.e-11, 8.e-11, 9.e-11, 1.e-12, 2.e-12, 3.e-12, 4.e-12, 5.e-12,
       6.e-12, 7.e-12, 8.e-12, 9.e-12])

In [93]:
# Scratch check of a refined grid between 8e-12 and 9e-12.
# NOTE(review): the stored output below lists 21 values although 50 points are
# requested here, so it looks stale -- re-run the cell to refresh it.
np.linspace(0.00000000000800000000, 0.00000000000900000000, 50)

array([8.00e-12, 8.05e-12, 8.10e-12, 8.15e-12, 8.20e-12, 8.25e-12,
       8.30e-12, 8.35e-12, 8.40e-12, 8.45e-12, 8.50e-12, 8.55e-12,
       8.60e-12, 8.65e-12, 8.70e-12, 8.75e-12, 8.80e-12, 8.85e-12,
       8.90e-12, 8.95e-12, 9.00e-12])

In [5]:
# Initialize a weights dictionary for each jet
# NOTE(review): these start with the keys 0..3, while the training loop stores
# its results under the processed file name -- confirm which key is intended.
weights_jet = dict.fromkeys(range(4))
degrees_jet = dict.fromkeys(range(4))
lambdas_jet = dict.fromkeys(range(4))

# build w using ridge regression
k_fold = 10
degrees = np.arange(5, 13)
# generate_lambdas takes (start, end, deg): digits 1..9 at each of 13 decades
# (1e-1 .. 1e-13). The former one-argument call did not match that signature.
lambdas = generate_lambdas(1, 9, 13)
seed = 1

NameError: name 'np' is not defined

In [82]:
# Print every candidate lambda with 20 decimals, prefixed by its index in `lambdas`.
for i, l in enumerate(lambdas):
    print(i, '{0:.20f}'.format(l))

0 0.10000000000000000555
1 0.20000000000000001110
2 0.30000000000000004441
3 0.40000000000000002220
4 0.50000000000000000000
5 0.60000000000000008882
6 0.70000000000000006661
7 0.80000000000000004441
8 0.90000000000000002220
9 0.01000000000000000021
10 0.02000000000000000042
11 0.02999999999999999889
12 0.04000000000000000083
13 0.05000000000000000278
14 0.05999999999999999778
15 0.07000000000000000666
16 0.08000000000000000167
17 0.08999999999999999667
18 0.00100000000000000002
19 0.00200000000000000004
20 0.00300000000000000006
21 0.00400000000000000008
22 0.00500000000000000010
23 0.00600000000000000012
24 0.00700000000000000015
25 0.00800000000000000017
26 0.00900000000000000105
27 0.00010000000000000000
28 0.00020000000000000001
29 0.00030000000000000003
30 0.00040000000000000002
31 0.00050000000000000001
32 0.00060000000000000006
33 0.00069999999999999999
34 0.00080000000000000004
35 0.00090000000000000008
36 0.00001000000000000000
37 0.00002000000000000000
38 0.00003000000000000

In [55]:
def percentage_of_wrongness(y, x, w):
    """Return the fraction of predictions that differ from ``y``.

    The score ``x.dot(w)`` is turned into a label: -1 where it is <= 0 and 1
    where it is > 0. The result is a float between 0 and 1 (a fraction, not a
    value scaled to 100).
    """
    predicted = np.dot(x, w)
    non_positive = predicted <= 0
    positive = predicted > 0
    predicted[non_positive] = -1
    predicted[positive] = 1

    n_wrong = np.sum(predicted != y)
    n_total = y.shape[0]
    return float(n_wrong) / float(n_total)

In [99]:
def cross_validation(y, x, k_indices, k, lambda_, degree):
    """Run ridge regression for fold ``k`` and score it on both splits.

    Fold ``k`` of ``k_indices`` is held out as the test split and the other
    folds are merged into the training split. Both splits are expanded with
    ``build_poly(., degree)`` and the weights are fitted on the training split
    with ``ridge_regression``.

    Returns ``(loss_tr, loss_te, w)``, where both losses are the values
    returned by ``percentage_of_wrongness``.
    """
    # Row indices of the held-out fold and of all remaining folds
    held_out = k_indices[k]
    fold_ids = np.arange(k_indices.shape[0])
    training = k_indices[fold_ids != k].reshape(-1)

    y_te, y_tr = y[held_out], y[training]

    # Polynomial feature expansion of both splits
    tx_tr = build_poly(x[training], degree)
    tx_te = build_poly(x[held_out], degree)

    # Fit on the training split only
    w = ridge_regression(y_tr, tx_tr, lambda_)

    loss_tr = percentage_of_wrongness(y_tr, tx_tr, w)
    loss_te = percentage_of_wrongness(y_te, tx_te, w)
    return loss_tr, loss_te, w

def build_poly(x, degree):
    """Return the polynomial expansion of ``x`` for powers 0 up to ``degree``.

    The first column is all ones; it is followed by ``x**1``, ``x**2``, ...,
    ``x**degree``, each power contributing as many columns as ``x`` has.
    """
    bias = np.ones((len(x), 1))
    powers = [np.power(x, exponent) for exponent in range(1, degree + 1)]
    return np.column_stack([bias] + powers)

In [4]:
def finding_best_lambdas(y, tx, k_indices, lambdas, degree):
    """Pick the lambda with the lowest median test loss across the folds.

    For every candidate in ``lambdas``, ``cross_validation`` is run once per
    fold of ``k_indices`` and the median of the test losses is kept as that
    candidate's error.

    Args:
        y, tx: labels and features handed to ``cross_validation``.
        k_indices: fold indices; the number of folds is taken from its length
            instead of a module-level ``k_fold``.
        lambdas: candidate regularization values.
        degree: polynomial degree handed to ``cross_validation``.

    Returns:
        ``(ind_lambda_opt, best_lambda, least_error)``: position of the best
        candidate in ``lambdas``, its value, and its median test loss.
    """
    print("New turn of cv for degree {}".format(degree))
    n_folds = len(k_indices)
    errors = []
    for lambda_ in lambdas:
        fold_losses = []
        for k in range(n_folds):
            _, loss_te, _ = cross_validation(y, tx, k_indices, k, lambda_, degree)
            fold_losses.append(loss_te)
        errors.append(np.median(fold_losses))

    ind_lambda_opt = np.argmin(errors)
    best_lambda = lambdas[ind_lambda_opt]
    least_error = errors[ind_lambda_opt]
    print("For degree {deg}, te_loss={te_loss}, lambda={lambda_}".format(deg=degree, te_loss=least_error, lambda_=best_lambda))
    return ind_lambda_opt, best_lambda, least_error


In [None]:

# Train with ridge regression for each tx in x and save the weights
for tx, y, f in zip(x_standardized, ys_train, file_names):
    print("Training for {}".format(f))

    # split data in k fold
    k_indices = build_k_indices(y, k_fold, seed)

    # Exactly one (lambda, error) entry per degree, so that the argmin below
    # indexes `degrees` correctly.
    best_lambdas = []
    best_errors = []
    # vary degree
    for degree in degrees:
        # Coarse search on the full grid first
        ind_lambda_opt, best_lambda, best_error = finding_best_lambdas(y, tx, k_indices, lambdas, degree)
        profound_lambdas = lambdas.copy()

        # Then refine three times around the current optimum
        for i in range(3):
            print("For digit", i)

            # Neighbours of the optimum, clamped at both ends of the grid
            lower_idx = max(ind_lambda_opt - 1, 0)
            upper_idx = min(ind_lambda_opt + 1, len(profound_lambdas) - 1)
            # float() guarantees that generate_lambdas takes its linspace branch
            previous_lambda = float(profound_lambdas[lower_idx])
            next_lambda = float(profound_lambdas[upper_idx])
            # 21 points when both neighbours exist, 11 at an edge of the grid
            nb_points = 21 if upper_idx - lower_idx == 2 else 11
            profound_lambdas = generate_lambdas(previous_lambda, next_lambda, nb_points)

            print("Generated new lambdas with {0:.16f}".format(previous_lambda) + " and {0:.16f}\n".format(next_lambda))

            ind_lambda_opt, refined_lambda, refined_error = finding_best_lambdas(y, tx, k_indices, profound_lambdas, degree)

            # Keep the best candidate seen so far for this degree
            if refined_error < best_error:
                best_lambda, best_error = refined_lambda, refined_error

        best_lambdas.append(best_lambda)
        best_errors.append(best_error)

    # find the one having the least test error
    ind_best_degree = np.argmin(best_errors)

    best_degree = degrees[ind_best_degree]
    print("\nBest degree:", best_degree)
    best_lambda = best_lambdas[ind_best_degree]
    print("Best lambda:", best_lambda)

    # Fit and evaluate on the polynomial expansion, not on the raw features
    tx_extended = build_poly(tx, best_degree)
    w_star = ridge_regression(y, tx_extended, best_lambda)

    print("Wrong prediction : {} \n\n".format(percentage_of_wrongness(y, tx_extended, w_star)))

    # record the weights & degree
    weights_jet[f] = w_star
    degrees_jet[f] = best_degree
    lambdas_jet[f] = best_lambda

    

Training for ./data/train_jet_0_valid_mass.csv
New turn of cv for degree 8

For degree 8, te_loss=0.19198969910544864, lambda=1e-07


For digit 0
Generated new lambdas with 0.0000090000000000 and 0.0000002000000000


New turn of cv for degree 8

For degree 8, te_loss=0.19198969910544864, lambda=2e-07


For digit 1
Generated new lambdas with 0.0020000000000000 and 0.0040000000000000


New turn of cv for degree 8

For degree 8, te_loss=0.19280292762266196, lambda=0.0021


For digit 2
Generated new lambdas with 0.1000000000000000 and 0.3000000000000000


New turn of cv for degree 8


KeyboardInterrupt: 

In [None]:
from plots import cross_validation_visualization

def cross_validation_demo():
    """Plot the mean train/test loss of 4-fold cross-validation over 30 lambdas.

    Reads the module-level ``y`` and ``x`` as data set, uses degree 7 and
    lambdas log-spaced between 1e-4 and 1. The per-fold losses are whatever
    ``cross_validation`` returns; the ``rmse_*`` names are kept for the
    plotting helper.

    Returns:
        ``(rmse_tr, rmse_te)``: lists with the mean training and test loss
        for each lambda.
    """
    seed = 1
    degree = 7
    k_fold = 4
    lambdas = np.logspace(-4, 0, 30)
    # split data in k fold
    k_indices = build_k_indices(y, k_fold, seed)
    # define lists to store the loss of training data and test data
    rmse_tr = []
    rmse_te = []
    for lambda_ in lambdas:
        loss_tr = 0
        loss_te = 0
        for idx in range(k_fold):
            # cross_validation returns (loss_tr, loss_te, w); the weights are not needed here
            e_tr, e_te, _ = cross_validation(y, x, k_indices, idx, lambda_, degree)
            loss_tr += e_tr
            loss_te += e_te

        rmse_tr.append(loss_tr/k_fold)
        rmse_te.append(loss_te/k_fold)

    cross_validation_visualization(lambdas, rmse_tr, rmse_te)
    return rmse_tr, rmse_te

    

# Run the demo; a and b receive the per-lambda mean train and test losses it returns.
a, b = cross_validation_demo()