In [14]:
#from implementation import *
#from proj1_helpers import *
#from cross_validation import *
from data_preprocessing import *
import pandas as pd 
import csv

TRAIN_PATH = "./data/train.csv"
TEST_PATH = "./data/test.csv"
USE_COLS = (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 19, 21, 23, 24, 25, 26, 28, 29, 31)

def make_predictions(weight, degree, name):
    y_test, x_test, ids_test = load_csv_data("./data/test.csv", sub_sample=False)
    x_test, mean_x_test, std_x_test = standardize(x_test)
    tx_poly = build_poly(x_test, degree)
    y_pred = predict_labels(weight, tx_poly)
    create_csv_submission(ids_test, y_pred,  name +".csv")


def calculate_mse(e):
    """Calculate the mse for vector e."""
    return 1/2*np.mean(e**2)


def calculate_mae(e):
    """Calculate the mae for vector e."""
    return np.mean(np.abs(e))


def compute_loss(y, tx, w, mse=True):
    """Calculate the loss.
    """
    e = y - tx.dot(w)
    if mse:
        return calculate_mse(e)
    else:
        return calculate_mae(e)
    
def build_k_indices(y, k_fold, seed):
    """build k indices for k-fold."""
    num_row = y.shape[0]
    interval = int(num_row / k_fold)
    np.random.seed(seed)
    indices = np.random.permutation(num_row)
    k_indices = [indices[k * interval: (k + 1) * interval]
                 for k in range(k_fold)]
    return np.array(k_indices)

def ridge_regression(y, tx, lambda_):
    """implement ridge regression."""
    a = tx.T.dot(tx) + lambda_ * 2 * len(y) * np.identity(tx.shape[1])
    b = tx.T.dot(y)
    w_ridge = np.linalg.solve(a, b)
    return w_ridge

In [15]:
def load_csv_data(data_path, sub_sample=False, cut_values=True):
    """Loads data and returns y (class labels), tX (features) and ids (event ids)"""
    y = np.genfromtxt(data_path, delimiter=",", skip_header=1, dtype=str, usecols=1)
    if (cut_values):
        print('load_csv_data : dropping uniform distribution values')
        # drop Pri_tau_phi(17), Pri_lep_phi(20), Pri_met_phi(22), Pri_jet_leading_Phi(27), Pri_jet_subleading_phi(30)
        # because of uniform distribution
        x = np.genfromtxt(data_path, delimiter=",", skip_header=1, \
            usecols=USE_COLS)
    else:
        x = np.genfromtxt(data_path, delimiter=",", skip_header=1)
        
    ids = x[:, 0].astype(np.int)
    input_data = x[:, 2:]

    # convert class labels from strings to binary (-1,1)
    yb = np.ones(len(y))
    yb[np.where(y=='b')] = -1
    
    # sub-sample
    if sub_sample:
        yb = yb[::50]
        input_data = input_data[::50]
        ids = ids[::50]

    return yb, input_data, ids

In [16]:
def load_data():
    print('Split_data_according_to_jet')
    print('Loading files...')
    y_tr, x_tr, ids_tr = load_csv_data(TRAIN_PATH)
    y_te, x_te, ids_te = load_csv_data(TEST_PATH)
    return y_tr, x_tr, ids_tr, y_te, x_te, ids_te

def load_headers():
    """Load all the headers from the training file and drop the unnecessary ones"""
    with open(TRAIN_PATH) as train_file:
        reader = csv.reader(train_file)
        headers = next(reader)
    
    # Only use the columns in USE_COLS
    headers = [headers[i] for i in USE_COLS]
    # drop ID and Predictions cols
    headers = headers[2:]
    
    return headers

In [12]:
y_tr, x_tr, ids_tr, y_te, x_te, ids_te = load_data()

Split_data_according_to_jet
Loading files...
load_csv_data : dropping uniform distribution values
load_csv_data : dropping uniform distribution values


In [17]:
headers = load_headers()
headers

['DER_mass_MMC',
 'DER_mass_transverse_met_lep',
 'DER_mass_vis',
 'DER_pt_h',
 'DER_deltaeta_jet_jet',
 'DER_mass_jet_jet',
 'DER_prodeta_jet_jet',
 'DER_deltar_tau_lep',
 'DER_pt_tot',
 'DER_sum_pt',
 'DER_pt_ratio_lep_tau',
 'DER_met_phi_centrality',
 'DER_lep_eta_centrality',
 'PRI_tau_pt',
 'PRI_tau_eta',
 'PRI_lep_pt',
 'PRI_lep_eta',
 'PRI_met',
 'PRI_met_sumet',
 'PRI_jet_num',
 'PRI_jet_leading_pt',
 'PRI_jet_leading_eta',
 'PRI_jet_subleading_pt',
 'PRI_jet_subleading_eta',
 'PRI_jet_all_pt']

In [18]:
def remove_outlier_in_DER_pt_h(y_tr, x_tr, ids_tr, jet):
    print("Removing outliers in Der_pt_h")
    # Remove the outliers in DER_pt_h (col 3):
    #  JET 0: 2834.999 when the max value is 117.707 outside of outlier - threshold to 120 
    #  JET 2: 1053.807 when max value is 734 outside of outlier- Threshold to 800
    OUTLIERS = [120, 999, 800, 999]
    outlier = OUTLIERS[jet]
    tr_smaller_than_outlier = (x_tr[:, 3] < outlier)
    x_tr = x_tr[tr_smaller_than_outlier]
    y_tr = y_tr[tr_smaller_than_outlier]
    ids_tr = ids_tr[tr_smaller_than_outlier]
    return y_tr, x_tr, ids_tr

In [19]:
def remove_all_NAN_columns(x_tr, x_te, headers_jet):
    print("Removing nan columns")
    nan_cols = []
    # Find all columns with -999
    for col_idx in range(x_tr.shape[1]):
        col = x_tr[:, col_idx]
        nb_nan_in_col = len(x_tr[col == -999])
        # A column has all NaN if len of col = nb NaN values in col
        if (nb_nan_in_col == len(col)):
            nan_cols.append(col_idx)
    
    # Remove all nan columns
    x_tr_updated = np.delete(x_tr, nan_cols, axis=1)
    x_te_updated = np.delete(x_te, nan_cols, axis=1)
    headers_jet_updated = np.delete(headers_jet, nan_cols)
    
    return x_tr_updated, x_te_updated, headers_jet_updated    

In [20]:
def split_data_according_to_mass(x, y, ids):
        # Get all the rows idx with invalid mass (i.e. DER_mass_MMC = -999)
        invalid_mass_row_idx = x[:, 0] == -999
        valid_mass_row_idx = x[:, 0] > 0
        # Process for each data table
        x_invalid_mass = x[invalid_mass_row_idx]
        x_valid_mass = x[valid_mass_row_idx]
        y_invalid_mass = y[invalid_mass_row_idx]
        y_valid_mass = y[valid_mass_row_idx]
        ids_invalid_mass = ids[invalid_mass_row_idx]
        ids_valid_mass = ids[valid_mass_row_idx]
        
        return x_invalid_mass, x_valid_mass, y_invalid_mass, y_valid_mass, ids_invalid_mass, ids_valid_mass 
    

In [21]:
def output_to_csv(x, y, ids, headers, jet, isTrain, isMassValid):
    """
    Write data into new csv file
    """
    # Add 'Id' & 'Prediction' to headers
    headers = np.insert(headers, 0, ['Id', 'Prediction'])
    
    # Remove 'DER_mass_MMC' if mass is not valid
    if not isMassValid:
        headers = np.delete(headers, np.where(headers =='DER_mass_MMC'))
    
    # Generate file name
    base = './data/train_' if isTrain else './data/test_'
    valid = '_valid_mass' if isMassValid else '_invalid_mass'
    file_name = base + 'jet_' + str(jet) + valid + '.csv'
    
    print("Outputing {}".format(file_name))
        
    with open(file_name, 'w') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=headers)
        writer.writeheader()
        data_ = dict.fromkeys(headers)
        # Transform -1 and 1 into 's' and 'b'
        for id_, y_, x_ in zip(ids, y, x):
            data_['Id'] = int(id_)
            if (y_ != -1 and y_ !=1):
                raise Exception('Prediction not -1 and 1!!!')
            data_['Prediction'] = 's' if y_ == 1 else 'b'
            
            for idx, x_value in enumerate(x_):
                data_[headers[idx + 2]] = float(x_value)
            writer.writerow(data_)


In [22]:
def split_data_according_to_jet_and_mass(y_tr, x_tr, ids_tr, y_te, x_te, ids_te, headers): 
    for jet in range(4):
        print("\n\nSplitting for jet {}".format(jet))

        # PRI_jet_num (24 -> 24 - 3 cols dropped before 24 - 2 cols (id, label) = col 19)
        col_jet = 19
        
        # TRAIN - Get all the rows having Pri_jet_num = jet for TRAINING set and delete PRI_jet_num col
        x_tr_jet = x_tr[x_tr[:, col_jet] == jet]
        x_tr_jet = np.delete(x_tr_jet, col_jet, axis=1)
        # Delete PRI_jet_num in headers
        headers_jet = np.delete(headers, col_jet)

        # Using the row found in x_tr to select the rows in y and ids
        y_tr_jet = y_tr[x_tr[:, col_jet] == jet]
        ids_tr_jet = ids_tr[x_tr[:, col_jet] == jet]
        
        # TEST - Get all the rows having Pri_jet_num = jet for TEST set and delete PRI_jet_num col
        x_te_jet = x_te[x_te[:, col_jet] == jet]
        x_te_jet = np.delete(x_te_jet, col_jet, axis=1)
                
        # Using the row found in x_tr to select the rows in y and ids
        y_te_jet = y_te[x_te[:, col_jet] == jet]
        ids_te_jet = ids_te[x_te[:, col_jet] == jet]
            
        # Remove outliers
        y_tr_jet, x_tr_jet, ids_tr_jet = remove_outlier_in_DER_pt_h(y_tr_jet, x_tr_jet, ids_tr_jet, jet)

        # Remove col PRI_jet_all_pt from x because it only contains 0 values
        if jet == 0:
            print("Deleted col Pri_jet_all_pt in set with jet_num = 0")
            x_tr_jet = np.delete(x_tr_jet, -1, axis=1)
            x_te_jet = np.delete(x_te_jet, -1, axis=1)
            headers_jet = np.delete(headers_jet, -1)
        
        # Remove all the columns with only NaN values
        x_tr_jet, x_te_jet, headers_jet = remove_all_NAN_columns(x_tr_jet, x_te_jet, headers_jet)

        # Split the dataset again into valid/invalid values of DER_mass_MMC 
        # TRAIN
        x_tr_jet_invalid_mass, x_tr_jet_valid_mass, y_tr_jet_invalid_mass, y_tr_jet_valid_mass, ids_tr_jet_invalid_mass, ids_tr_jet_valid_mass = split_data_according_to_mass(x_tr_jet, y_tr_jet, ids_tr_jet)
        # TEST
        x_te_jet_invalid_mass, x_te_jet_valid_mass, y_te_jet_invalid_mass, y_te_jet_valid_mass, ids_te_jet_invalid_mass, ids_te_jet_valid_mass = split_data_according_to_mass(x_te_jet, y_te_jet, ids_te_jet)
        
        # Remove 'DER_mass_MMC' (col 0) if the mass is not valid
        x_tr_jet_invalid_mass = np.delete(x_tr_jet_invalid_mass, 0, axis=1)
        x_te_jet_invalid_mass = np.delete(x_te_jet_invalid_mass, 0, axis=1)
        
        # Save into CSV
        #x, y, ids, headers, jet, isTrain, isMassValid
        # TRAIN
        output_to_csv(x_tr_jet_invalid_mass, y_tr_jet_invalid_mass, ids_tr_jet_invalid_mass, headers_jet, jet, True, False)
        output_to_csv(x_tr_jet_valid_mass, y_tr_jet_valid_mass, ids_tr_jet_valid_mass, headers_jet, jet, True, True)

        # TEST
        output_to_csv(x_te_jet_invalid_mass, y_te_jet_invalid_mass, ids_te_jet_invalid_mass, headers_jet, jet, False, False)
        output_to_csv(x_te_jet_valid_mass, y_te_jet_valid_mass, ids_te_jet_valid_mass, headers_jet, jet, False, True)


In [574]:
split_data_according_to_jet_and_mass(y_tr, x_tr, ids_tr, y_te, x_te, ids_te, headers)




Splitting for jet 0
Removing outliers in Der_pt_h
Deleted col Pri_jet_all_pt in set with jet_num = 0
Removing nan columns
Outputing ./data/train_jet_0_invalid_mass.csv
Outputing ./data/train_jet_0_valid_mass.csv
Outputing ./data/test_jet_0_invalid_mass.csv
Outputing ./data/test_jet_0_valid_mass.csv


Splitting for jet 1
Removing outliers in Der_pt_h
Removing nan columns
Outputing ./data/train_jet_1_invalid_mass.csv
Outputing ./data/train_jet_1_valid_mass.csv
Outputing ./data/test_jet_1_invalid_mass.csv
Outputing ./data/test_jet_1_valid_mass.csv


Splitting for jet 2
Removing outliers in Der_pt_h
Removing nan columns
Outputing ./data/train_jet_2_invalid_mass.csv
Outputing ./data/train_jet_2_valid_mass.csv
Outputing ./data/test_jet_2_invalid_mass.csv
Outputing ./data/test_jet_2_valid_mass.csv


Splitting for jet 3
Removing outliers in Der_pt_h
Removing nan columns
Outputing ./data/train_jet_3_invalid_mass.csv
Outputing ./data/train_jet_3_valid_mass.csv
Outputing ./data/test_jet_3_inval

In [23]:
# Read the real files
def generate_processed_filenames(isTrain):
    file_names = []
    isMassValids = [True, False]
    jets = range(4)
    
    for isMassValid in isMassValids:
        for jet in jets:
            # Generate file name
            base = './data/train_' if isTrain else './data/test_'
            valid = '_valid_mass' if isMassValid else '_invalid_mass'
            file_name = base + 'jet_' + str(jet) + valid + '.csv'
            file_names.append(file_name)
                
    return file_names

file_names = generate_processed_filenames(True)
file_names

['./data/train_jet_0_valid_mass.csv',
 './data/train_jet_1_valid_mass.csv',
 './data/train_jet_2_valid_mass.csv',
 './data/train_jet_3_valid_mass.csv',
 './data/train_jet_0_invalid_mass.csv',
 './data/train_jet_1_invalid_mass.csv',
 './data/train_jet_2_invalid_mass.csv',
 './data/train_jet_3_invalid_mass.csv']

In [24]:
def load_processed_data(isTrain):
    """Load all Train/Test processed data"""
    file_names = generate_processed_filenames(isTrain)

    ys = []
    xs = []
    ids = []

    for i in range(4):
        y, x, id_ = load_csv_data(file_names[i], cut_values = False)
        ys.append(y)
        xs.append(x)
        ids.append(id_)
        
    return ys, xs, ids

In [25]:
ys_train, xs_train, ids_train = load_processed_data(True)

In [100]:
def standardize(x, mean=None, std=None):
    """Standardize the original data set."""
    if mean is None or std is None:
        mean_x = np.mean(x, axis = 0)
        std_x = np.std(x, axis = 0)
    else:
        mean_x = mean
        std_x = std
        
    x = x - mean_x
    x = x / std_x 

    return x, mean_x, std_x

# def standardize(x, mean_x=None, std_x=None):
#     """
#         Standardize the original data set.
#     """
#     if mean_x is None:
#         mean_x = np.mean(x, axis=0)
#     x = x - mean_x
#     if std_x is None:
#         std_x = np.std(x, axis=0)
#     x[:, std_x > 0] = x[:, std_x > 0] / std_x[std_x > 0]

#     tx = np.hstack((np.ones((x.shape[0], 1)), x))
#     return tx, mean_x, std_x

In [27]:
# Standandize xs
x_means = []
x_stds = []
x_standardized = []

for x_ in xs_train:
    x, mean_x, std_x = standardize(x_)
    x_standardized.append(x)
    x_means.append(mean_x)
    x_stds.append(std_x)

In [36]:
# Initialize a weights dictionary for each jet
weights_jet = dict.fromkeys(range(4))
degrees_jet = dict.fromkeys(range(4))
lambdas_jet = dict.fromkeys(range(4))

# build w using ridge regression
k_fold = 10
degrees = np.arange(3, 7)
lambdas = np.logspace(-10, -1, 60)
seed = 12

In [37]:
file_names

['./data/train_jet_0_valid_mass.csv',
 './data/train_jet_1_valid_mass.csv',
 './data/train_jet_2_valid_mass.csv',
 './data/train_jet_3_valid_mass.csv',
 './data/train_jet_0_invalid_mass.csv',
 './data/train_jet_1_invalid_mass.csv',
 './data/train_jet_2_invalid_mass.csv',
 './data/train_jet_3_invalid_mass.csv']

In [39]:
def cross_validation(y, x, k_indices, k, lambda_, degree):
    """return the loss of ridge regression."""
    # get k'th subgroup in test, others in train
    te_indice = k_indices[k]
    tr_indice = k_indices[~(np.arange(k_indices.shape[0]) == k)]
    tr_indice = tr_indice.reshape(-1)
    y_te = y[te_indice]
    y_tr = y[tr_indice]
    x_te = x[te_indice]
    x_tr = x[tr_indice]
    # form data with polynomial degree
    tx_tr = build_poly(x_tr, degree)
    tx_te = build_poly(x_te, degree)
    # ridge regression
    w = ridge_regression(y_tr, tx_tr, lambda_)
    # calculate the loss for train and test data
    loss_tr = np.sqrt(2 * compute_loss(y_tr, tx_tr, w))
    loss_te = np.sqrt(2 * compute_loss(y_te, tx_te, w))
    return loss_tr, loss_te, w

def build_poly(x, degree):
    """polynomial basis functions for input data x, for j=0 up to j=degree."""
    poly = np.ones((len(x), 1))
    for deg in range(1, degree+1):
        poly = np.c_[poly, np.power(x, deg)]
    return poly

In [42]:
def perc_wrong_pred(y, tx, w_star):
    """
        Return the percentage of wrong predictions (between 0 and 1)
    """

    pred = np.dot(tx, w_star)

    pred[pred > 0] = 1
    pred[pred <= 0] = -1

    right = np.sum(pred == y)
    wrong = len(pred) - right

    return float(wrong) / float(len(pred))

In [44]:

# Train with ridge regression for each tx in x and save the weights
for tx, y, f in zip(x_standardized, ys_train, file_names):
    print("Training for {}".format(f))
    
    # split data in k fold
    k_indices = build_k_indices(y, k_fold, seed)
    
    # for each degree, we compute the best lambdas and the associated rmse
    best_lambdas = []
    best_rmses = []
    best_w = []
    # vary degree
    for degree in degrees:
        # cross validation
        rmse_te = []
        w_te = []
        for lambda_ in lambdas:
            rmse_te_tmp = []
            w_te_temp = []
            for k in range(k_fold):
                _, loss_te, w = cross_validation(y, tx, k_indices, k, lambda_, degree)
                rmse_te_tmp.append(loss_te)
                w_te_temp.append(w)

            rmse_te.append(np.mean(rmse_te_tmp))
            # mean of all the ws ?????
            #w_te.append(np.mean(w_te_temp, axis=0))

        ind_lambda_opt = np.argmin(rmse_te)
        best_lambdas.append(lambdas[ind_lambda_opt])
        best_rmses.append(rmse_te[ind_lambda_opt])
        #best_w.append(w_te[ind_lambda_opt])
        print("For degree {deg}, te_loss={te_loss}, lambda={lambda_}".format(deg=degree, te_loss=rmse_te[ind_lambda_opt], lambda_=lambdas[ind_lambda_opt]))

    # find the one having the least test error
    ind_best_degree = np.argmin(best_rmses)

    #best_weights = best_w[ind_best_degree]
    #print("\n\n Best weights shape:", best_weights.shape)
    best_degree = degrees[ind_best_degree]
    print("\n\n Best degree:", best_degree)
    best_lambda = best_lambdas[ind_best_degree]
    print("Best lambda:", best_lambda, "\n\n")
    
    tx_extended = build_poly(tx, best_degree)
    w_star = ridge_regression(y, tx, best_lambda)
    
    print("Wrong prediction : {}", perc_wrong_pred(y, tx, w_star))



    # record the weights & degree
    #weights_jet[jet] = best_weights
    degrees_jet[f] = best_degree
    lambdas_jet[f] = best_lambda

    

Training for ./data/train_jet_0_valid_mass.csv
For degree 3, te_loss=0.7635206423901957, lambda=0.0014773776525985128
For degree 4, te_loss=0.7572822576260646, lambda=0.001039798418481492
For degree 5, te_loss=0.7539459171712302, lambda=1.5361749466718298e-05
For degree 6, te_loss=0.778244079494439, lambda=0.049535352089591804


 Best degree: 5
Best lambda: 1.5361749466718298e-05 


Wrong prediction : {} 0.272859098239575
Training for ./data/train_jet_1_valid_mass.csv
For degree 3, te_loss=0.8343966242615828, lambda=0.0014773776525985128
For degree 4, te_loss=0.830890556297893, lambda=0.002982471286216894
For degree 5, te_loss=0.8259011729532861, lambda=0.002982471286216894
For degree 6, te_loss=0.8345118021935469, lambda=0.00017957144943716409


 Best degree: 5
Best lambda: 0.002982471286216894 


Wrong prediction : {} 0.33327141264896687
Training for ./data/train_jet_2_valid_mass.csv
For degree 3, te_loss=0.7881063782221676, lambda=0.004237587160604063
For degree 4, te_loss=0.7772331

In [51]:
degrees_jet, lambdas_jet

degrees_final = {k: v for k, v in degrees_jet.items() if v is not None}
lambdas_final = {k: v for k, v in lambdas_jet.items() if v is not None}
degrees_final, lambdas_final

({'./data/train_jet_0_valid_mass.csv': 5,
  './data/train_jet_1_valid_mass.csv': 5,
  './data/train_jet_2_valid_mass.csv': 6,
  './data/train_jet_3_valid_mass.csv': 4},
 {'./data/train_jet_0_valid_mass.csv': 1.5361749466718298e-05,
  './data/train_jet_1_valid_mass.csv': 0.002982471286216894,
  './data/train_jet_2_valid_mass.csv': 0.006020894493336138,
  './data/train_jet_3_valid_mass.csv': 1.3664483492953244e-08})

In [48]:
ys_test, xs_test, ids_test = load_processed_data(False)

In [57]:
len(ids_test)

4

In [82]:
# Generate the weights
weights = []
for x, y, f in zip(x_standardized, ys_train, file_names):
    x_poly = build_poly(x, degrees_jet[f])
    w = ridge_regression(y, x_poly, lambdas_jet[f])
    print(len(w), degrees_jet[f])
    weights.append(w)

weights

76 5
91 5
145 6
97 4


[array([ 1.39732676e-01, -1.21615933e-02, -2.46224727e-01, -4.84678679e-01,
        -4.31864266e-02,  3.97199825e-01, -4.30686437e-02,  2.61800306e-01,
         6.78592734e-01, -2.40400803e-02,  1.13537588e+00,  2.59783796e-03,
        -7.42486433e-01,  1.56782507e-02, -2.78967573e-02,  3.30244083e-02,
        -1.07905700e-01,  4.11505045e-02, -3.52251116e-02, -1.01748410e-02,
         2.43856576e-02, -9.43890797e-03, -6.86501610e-03, -1.28483014e-01,
        -1.95410753e-01, -2.42422815e-01, -7.75303129e-03, -1.63729861e-02,
        -5.30673120e-02,  2.52148245e-02, -8.64859576e-03,  3.47634881e-02,
         4.03537833e-02,  2.78796424e-02,  3.03169887e-02, -2.19862556e-02,
         3.29721565e-02, -3.74319679e-03,  6.53059279e-02,  2.53969800e-01,
         2.40587681e-02, -1.29284390e-03,  4.13501428e-02, -9.77236550e-03,
        -1.25078663e-02,  2.32038042e-03, -3.74891876e-03, -1.00276556e-02,
        -3.16488512e-03, -1.60644645e-02,  8.36052376e-04, -2.78843553e-03,
         8.4

tx_poly = build_poly(x_test, degree)
    y_pred = predict_labels(weight, tx_poly)
    create_csv_submission(ids_test, y_pred,  name +".csv")

In [68]:
def predict_labels(weights, data):
    """Generates class predictions given weights, and a test data matrix"""
    y_pred = np.dot(data, weights)
    y_pred[np.where(y_pred <= 0)] = -1
    y_pred[np.where(y_pred > 0)] = 1

    return y_pred

def create_csv_submission(ids, y_pred, name):
    """
    Creates an output file in csv format for submission to kaggle
    Arguments: ids (event ids associated with each prediction)
               y_pred (predicted class labels)
               name (string name of .csv output file to be created)
    """
    with open(name, 'w') as csvfile:
        fieldnames = ['Id', 'Prediction']
        writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        for r1, r2 in zip(ids, y_pred):
            writer.writerow({'Id': int(r1), 'Prediction': int(r2)})

In [107]:
def predict(xs_test, ids_test, x_means, x_stds, degrees, weights):
    #_, xs_test, ids_test = load_processed_data(False)
    idx = 0
    ids = []
    y_preds = []
    for x, id_, mean, std, degree, weight in zip(xs_test, ids_test, x_means, x_stds, degrees, weights):
        x_std, _, _ = standardize(x, mean, std)
        x_expanded = build_poly(x_std, degree)
        y_pred = predict_labels(weight, x_expanded)
        ids = np.append(ids, id_)
        y_preds = np.append(y_preds, y_pred)
        idx = idx + 1
    return ids, y_preds

degrees = list(degrees_final.values())

idds, yys = predict(xs_test, ids_test, x_means, x_stds, degrees, weights)

create_csv_submission(idds, yys, "Desperation.csv")

In [108]:
len(idds)

481750

In [None]:
from plots import cross_validation_visualization

def cross_validation_demo():
    seed = 1
    degree = 7
    k_fold = 4
    lambdas = np.logspace(-4, 0, 30)
    # split data in k fold
    k_indices = build_k_indices(y, k_fold, seed)
    # define lists to store the loss of training data and test data
    rmse_tr = []
    rmse_te = []
    degree = 7
    for lambda_ in lambdas:
        loss_tr = 0
        loss_te = 0
        for idx in range(k_fold):
            e_tr, e_te = cross_validation(y, x, k_indices, idx, lambda_, degree)
            loss_tr += e_tr
            loss_te += e_te

        rmse_tr.append(loss_tr/k_fold)
        rmse_te.append(loss_te/k_fold)

    cross_validation_visualization(lambdas, rmse_tr, rmse_te)
    return rmse_tr, rmse_te

    

a, b = cross_validation_demo()