## Imports

In [1]:
# Python standard library
import csv
import json
import datetime
import time
import sys

# Scipy
import numpy as np
import sklearn.linear_model
import sklearn.model_selection
import sklearn.utils
import pandas as pd

import matplotlib.pyplot as plt

# Jupyter
import IPython.display

# Program-specific
sys.path.append('../modules')
import constants
import functions

## Get number of rows in data

In [2]:
# Locations of the training and test CSV files, both rooted at the project
# base directory defined in the local `constants` module.
train_file_path = '{}/data/train/train_all_filled.csv'.format(constants.base_file_path)
test_file_path = '{}/data/test/raw/test_reformatted.csv'.format(constants.base_file_path)

In [3]:
# Count the data rows in the training file: every line is tallied as it is
# streamed, and one is subtracted for the header line (the file is later
# parsed with header = 0).
with open(train_file_path, 'r') as train_file_handle:
    line_count = sum(1 for _ in train_file_handle)
N_train_all = line_count - 1
print(N_train_all)

12832


## K-Folds cross-validation for C

In [None]:
# --- Cross-validation setup ---
# K is the number of folds; `kfolds` yields the train/validation index splits.
K = 10
kfolds = sklearn.model_selection.KFold(n_splits=K)

# Candidate values of C: a geometric grid 1e-5 * 1.5**n for n = 0..6.
# `errors` holds one accumulated validation loss per candidate.
Cs = [1e-5 * (1.5 ** exponent) for exponent in range(7)]
errors = [0] * len(Cs)

print(Cs)


# --- Mini-batch setup ---
# num_passes: how many times each training fold is swept;
# batch_size: number of rows handed to the model per partial fit.
num_passes = 5
batch_size = 500

In [None]:
# Cross-validate every candidate C with K-fold splits. For each fold a fresh
# SGD classifier is trained out-of-core: the training rows are visited in
# shuffled mini-batches, and each batch is read from the CSV on demand so the
# whole training set never has to be held in memory. The mean validation loss
# over the K folds is stored in errors[i].
#
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss' instead
# of 'log' -- confirm against the installed version before upgrading.
t0 = time.time()


# Hyper parameter loop
for i, C in enumerate(Cs):

    # Benchmarking: clear the previous iteration's output and report progress.
    IPython.display.clear_output()

    t = time.time()
    print('i = ', str(i), '/', str(len(Cs)), '\t\tC = ', C)
    print('dt = ', str((t - t0)/60.))

    # Start from zero so that re-running this cell does not add the new
    # fold losses on top of totals left over from a previous run.
    errors[i] = 0

    # K-folds loop
    for j, (train_indices, valid_indices) in enumerate(kfolds.split(np.empty((N_train_all)))):
        print('\tj = ', str(j), '/', K)

        # Create the model trained on this fold
        model = sklearn.linear_model.SGDClassifier(loss= 'log', alpha = 1./C)

        # Ceiling division: exactly enough batches to cover the fold, with no
        # empty trailing batch when the fold size is a multiple of batch_size.
        num_batches = (len(train_indices) + batch_size - 1) // batch_size

        # Mini-batch loop
        for k in range(num_passes):
            print('\t\tk = ', str(k), '/', num_passes)

            # Visit the training rows in a new random order on every pass.
            np.random.shuffle(train_indices)

            for l in range(num_batches):
                print('\t\t\tl = ', str(l), '/', str(num_batches))

                # Slicing past the end simply yields the shorter final batch,
                # so no special case is needed for it.
                batch_train_indices = train_indices[l*batch_size:(l+1)*batch_size]

                # A set makes the membership test below constant-time instead
                # of a linear scan of the index array for every CSV row.
                batch_rows = set(batch_train_indices.tolist())

                # Load in the training data for this particular batch
                # (row + 1 because line 0 of the file is the header).
                df_train = pd.read_csv(train_file_path, skiprows = [row + 1 for row in range(N_train_all) if row not in batch_rows], sep = ',', header = 0)

                # Fit model
                model.partial_fit(df_train[constants.inputs], df_train[constants.output], classes = [0, 1])

        # Load the validation data
        valid_rows = set(valid_indices.tolist())
        df_valid = pd.read_csv(train_file_path, skiprows = [row + 1 for row in range(N_train_all) if row not in valid_rows], sep = ',', header = 0)

        # Test model
        # NOTE(review): the prediction cell further down keeps only column 1 of
        # predict_proba; confirm functions.LogLoss expects the full array here.
        predictions = model.predict_proba(df_valid[constants.inputs])

        # Get errors (each fold contributes 1/K of the mean)
        errors[i] += functions.LogLoss(predictions, df_valid[constants.output])/K


    print(Cs[i], errors[i])
                                                     

                                                     
        
        
        
        

In [None]:
# Preview the first five rows of the most recently loaded training batch.
df_train.head(n=5)

In [None]:
# Sanity check: list every column the model expects and flag any that the
# loaded training frame does not contain.
# The loop variable is not called `input`, because that would shadow the
# builtin of the same name for the rest of the kernel session.
available_columns = set(df_train.columns.values)
for column_name in constants.inputs:
    print(column_name)
    if column_name not in available_columns:
        print('\t^ missing from df_train.columns')

In [None]:
# Show the column labels of the current training frame.
column_labels = df_train.columns.values
print(column_labels)

In [None]:
# Print the model-input columns of the current training frame as plain text.
input_columns_frame = df_train[constants.inputs]
print(input_columns_frame)

In [None]:
# Plot the cross-validated log loss against C on a logarithmic x axis.
ax = plt.gca()
ax.semilogx(Cs, errors, marker='o')
ax.set_xlabel('C')
ax.set_ylabel('Log loss')
ax.grid()
plt.show()

In [None]:
# Raw accumulated validation losses, one entry per candidate in Cs.
print(errors)

In [None]:
# Candidate C values, in the same order as `errors`.
print(Cs)

In [None]:
# Side-by-side listing: each candidate C next to its accumulated loss.
for candidate, loss_value in zip(Cs, errors):
    print(candidate, '\t\t', loss_value)

## Make predictions

In [None]:
# Load the training set through the project helper (aug and mix are passed
# as False and True respectively) and record how many rows it has.
df_train = functions.LoadTrainData(aug=False, mix=True)
N_train_all = df_train.shape[0]




In [None]:
# Rows whose inc_angle is the literal string 'na' get the value 0 instead.
inc_angle_is_na = df_train['inc_angle'] == 'na'
df_train.loc[inc_angle_is_na, 'inc_angle'] = 0

##### Train model

In [None]:
# Mini-batch settings for the final fit: number of sweeps over the training
# data, and number of rows per batch.
num_passes, batch_size = 20, 500

In [None]:
# Train the final classifier on the whole in-memory training frame, sweeping
# it num_passes times in shuffled mini-batches of batch_size rows.
#
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss' instead
# of 'log' -- confirm against the installed version before upgrading.
t0 = time.time()

# Value of C used for the final model; alpha is set to its reciprocal.
C = 7.6e-5

# Positional row indices, reshuffled in place at the start of each pass.
train_indices = list(range(N_train_all))


# Create the model trained on the full training set
model = sklearn.linear_model.SGDClassifier(loss= 'log', alpha = 1./C, n_jobs = -1)

# Column selection does not change between batches, so do it once.
train_inputs = df_train[constants.inputs]
train_outputs = df_train[constants.output]

# Ceiling division: exactly enough batches to cover the data, with no empty
# trailing batch when the row count is a multiple of batch_size.
num_batches = (len(train_indices) + batch_size - 1) // batch_size


# Mini-batch loop
for i in range(num_passes):
    print('i = ', str(i), '/', num_passes)

    np.random.shuffle(train_indices)

    for j in range(num_batches):
        print('\tj = ', str(j), '/', str(num_batches))

        # Slicing past the end simply yields the shorter final batch.
        batch_train_indices = train_indices[j*batch_size:(j+1)*batch_size]

        # Fit model on this batch only (indexing with train_indices here would
        # feed the entire dataset to every partial_fit call).
        model.partial_fit(train_inputs.iloc[batch_train_indices], train_outputs.iloc[batch_train_indices], classes = [0, 1])

##### Make predictions

In [None]:
# Read the test set, indexed by its 'id' column.
df_test = pd.read_csv(test_file_path, sep=',', header=0, index_col='id')

# Keep only column 1 of the predicted probabilities, which corresponds to
# the second entry of classes = [0, 1] given to partial_fit.
class_probabilities = model.predict_proba(df_test[constants.inputs])
predictions = class_probabilities[:, 1]

In [None]:
# First five rows of the training inputs, for comparison with the test inputs.
df_train[constants.inputs].head(n=5)

In [None]:
# First five rows of the test inputs, for comparison with the training inputs.
df_test[constants.inputs].head(n=5)

## Write to file

In [None]:
# Write the predictions to a timestamped submission file with the columns
# 'id' and 'is_iceberg'.
#
# The clock is read once so the date and time parts of the file name always
# belong to the same instant; the result has the form YYYY-MM-DD_HH-MM-SS.
# NOTE(review): the file name carries no '.csv' extension -- confirm that is
# intended.
timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
output_file_path = '../data/submissions/submission_' + timestamp

# newline='' leaves line endings to the csv module, as its documentation
# requires; without it some platforms emit a blank line after every row.
with open(output_file_path, 'w', newline='') as output_file_handle:
    file_writer = csv.writer(output_file_handle, delimiter = ',')

    file_writer.writerow(['id', 'is_iceberg'])

    # One row per test sample: its id (the frame index) and its prediction.
    file_writer.writerows(zip(df_test.index.values, predictions))
        