## imports

In [63]:
import os
import numpy as np
import pandas as pd

import env
from utils import KaggleCameraDataset, one_hot_decision_function, unhot

## useful routines

In [30]:
def get_proba(proba_path, n_classes=10):
    """
    Load per-class probabilities from a CSV file into a 2-D array.

    Parameters
    ----------
    proba_path : str
        Path to a CSV file whose header contains the columns
        '0', '1', ..., str(n_classes - 1). Any other columns are ignored.
    n_classes : int, optional
        Number of class columns to read (default 10).

    Returns
    -------
    np.ndarray of shape (n_rows, n_classes)
        The selected columns, in class order.

    Raises
    ------
    KeyError
        If one of the expected class columns is missing from the file.
    """
    class_columns = [str(i) for i in range(n_classes)]
    # `DataFrame.as_matrix` was removed in pandas 1.0; selecting the columns
    # explicitly and taking `.values` works on both old and new pandas.
    return pd.read_csv(proba_path)[class_columns].values

In [33]:
# Sanity check: load one model's probabilities and show only the
# top-left 3x3 corner instead of dumping the whole array.
P = get_proba('../models/proba1.csv')
P[:3, :3]

array([[  2.30427057e-01,   4.84325022e-01,   5.16182883e-03],
       [  4.63270843e-02,   1.69466971e-03,   1.10656278e-04],
       [  1.59370884e-01,   1.33133292e-01,   2.71781767e-03]])

## #1 average the predictions of several models (using raw probabilities)

In [55]:
def avg_proba(*proba_paths, **kwargs):
    """
    Average the probability matrices stored in several CSV files.

    Parameters
    ----------
    *proba_paths : str
        Paths of the probability CSV files; each is loaded with `get_proba`.
    weights : sequence of float, optional (keyword only)
        Passed to `np.average` as `weights`, one entry per path.
        When omitted (or None) the result is the unweighted mean.

    Returns
    -------
    np.ndarray
        Element-wise (weighted) average taken over the loaded matrices.
    """
    model_weights = kwargs.get('weights', None)
    matrices = []
    for path in proba_paths:
        matrices.append(get_proba(path))
    # axis=0 averages across files, leaving the per-file matrix shape intact.
    return np.average(matrices, axis=0, weights=model_weights)

In [61]:
# Blend the probabilities of two models; with equal weights this is a plain mean.
Q = avg_proba('../models/proba1.csv', 
              '../models/proba_best.csv', 
               weights=[1., 1.])
Q

array([[  1.30430266e-01,   4.46921155e-01,   3.23994139e-02, ...,
          1.20126526e-03,   1.05716349e-02,   6.87616952e-02],
       [  3.24848127e-02,   8.78940028e-02,   4.32265330e-03, ...,
          8.78389790e-04,   6.59955412e-01,   4.10077199e-02],
       [  1.20631762e-01,   1.52186640e-01,   1.57862445e-03, ...,
          1.04656043e-03,   1.19606800e-01,   5.12661040e-01],
       ..., 
       [  1.16247602e-01,   4.08802994e-01,   5.17931608e-02, ...,
          3.80601780e-02,   1.16218922e-02,   1.64976966e-01],
       [  4.96196985e-01,   4.23220114e-03,   2.48291728e-03, ...,
          3.88299840e-01,   1.79937498e-02,   6.91680338e-02],
       [  2.15672876e-01,   3.66537021e-02,   2.02466367e-03, ...,
          1.86312740e-05,   3.95222452e-02,   1.85410858e-02]])

## #2 convert probabilities to predictions (submission file)

In [64]:
def proba_to_subm(proba, subm_path):
    """
    Convert class probabilities into a submission CSV.

    The file has two columns, `fname` and `camera`, and is written
    without an index column.

    Parameters
    ----------
    proba : (2640, 10) np.ndarray
        Per-class scores, one row per test image.
        NOTE(review): rows are presumably in the same order as
        `test_dataset.X` -- confirm against the code that produced `proba`.
    subm_path : str
        Path of the CSV file to write.
    """
    test_dataset = KaggleCameraDataset('../data/', train=False, lazy=True)
    # Keep only the file-name component of every test image path.
    fnames = [os.path.basename(fname) for fname in test_dataset.X]
    # NOTE(review): `one_hot_decision_function` / `unhot` come from `utils`;
    # together they presumably yield one class index per row -- confirm.
    index_pred = unhot(one_hot_decision_function(proba))
    # The label table does not depend on the row, so fetch it once
    # rather than once per prediction.
    labels = KaggleCameraDataset.target_labels()
    cameras = [labels[int(c)] for c in index_pred]
    df = pd.DataFrame({'fname': fnames, 'camera': cameras},
                      columns=['fname', 'camera'])
    df.to_csv(subm_path, index=False)

In [65]:
# Write the blended predictions to `subm.csv` in the current working directory.
proba_to_subm(Q, 'subm.csv')

## #3 how much the predictions of two submissions differ

In [28]:
def get_diff(subm_path1, subm_path2):
    """
    Fraction of rows whose `camera` value differs between two submissions.

    Rows are compared by position, so both files must list the same
    images in the same order.

    Parameters
    ----------
    subm_path1, subm_path2 : str
        Paths to submission CSV files that contain a `camera` column.

    Returns
    -------
    float
        Number of differing rows divided by the total number of rows
        (0.0 when both files have no rows).

    Raises
    ------
    ValueError
        Raised by pandas when the two files do not have the same number of rows.
    """
    cameras1 = pd.read_csv(subm_path1)['camera']
    cameras2 = pd.read_csv(subm_path2)['camera']
    differs = cameras1 != cameras2
    if differs.empty:
        # No rows to compare; avoid the NaN that `.mean()` would give.
        return 0.
    # Use the actual row count instead of a hard-coded 2640.
    return differs.mean()

In [29]:
# Share of test rows on which the two saved submissions predict a different camera.
get_diff('../models/submission.csv', '../models/submission_best.csv')

0.4693181818181818