In [12]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import os
from glob import glob
from tqdm import tqdm
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import explained_variance_score

In [13]:
def get_labels():
    """Return the tag names together with name->index and index->name lookups.

    Returns
    -------
    tuple
        ``(labels, l_map, inv_map)``: ``labels`` is the list of 17 tag names,
        ``l_map`` maps each name to its position in that list, and ``inv_map``
        maps each position back to its name.
    """
    # The position of a name in this list is the index stored in both lookups,
    # so the order must not change.
    labels = [
        'slash_burn', 'clear', 'blooming', 'primary',
        'cloudy', 'conventional_mine', 'water', 'haze',
        'cultivation', 'partly_cloudy', 'artisinal_mine', 'habitation',
        'bare_ground', 'blow_down', 'agriculture', 'road',
        'selective_logging',
    ]

    name_to_index = dict(zip(labels, range(len(labels))))
    index_to_name = dict(enumerate(labels))
    return labels, name_to_index, index_to_name

# Build the label tables once at module level: dfsub_to_np reads label_map and
# create_submission reads inv_label_map as globals.
labels, label_map, inv_label_map = get_labels()
print(labels)

def dfsub_to_np(df_sub):
    """Convert a submission frame's space-separated ``tags`` into a 0/1 matrix.

    Parameters
    ----------
    df_sub : pandas.DataFrame
        Frame with a ``tags`` column; each entry is a space-separated string of
        label names that are keys of the module-level ``label_map``.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(len(df_sub), len(label_map))`` holding a 1 at
        ``[row, label_map[tag]]`` for every tag listed in that row.

    Raises
    ------
    KeyError
        If a tag is not a key of ``label_map``.
    """
    sub = np.zeros((len(df_sub), len(label_map)), dtype=int)
    for i, tags in tqdm(enumerate(df_sub.tags), total=len(df_sub), miniters=1000):
        # An empty tags field is read back by pd.read_csv as NaN (a float),
        # which has no .split(); such a row simply stays all-zero.
        if not isinstance(tags, str):
            continue
        # split() with no argument ignores repeated/leading/trailing spaces,
        # which split(' ') would turn into '' and then a KeyError.
        for t in tags.split():
            sub[i][label_map[t]] = 1
    return sub

def vote(preds):
    """Majority vote along the first axis of stacked 0/1 predictions.

    Parameters
    ----------
    preds : numpy.ndarray
        Predictions stacked along axis 0 (one slice per voter).

    Returns
    -------
    numpy.ndarray
        Integer array with axis 0 removed: 1 where the mean over voters is
        strictly greater than 0.5, otherwise 0 (so an exact tie yields 0).
    """
    fraction_positive = preds.mean(axis=0)
    is_majority = fraction_positive > 0.5
    return is_majority.astype(int)

def to_tagging(one_hot_data, inv_label_map):
    """Turn a 0/1 label matrix back into space-separated tag strings.

    Parameters
    ----------
    one_hot_data : sequence of rows, shape (n_samples, n_labels)
        Label indicators; a label is emitted where the entry ``== 1``.
    inv_label_map : dict
        Maps each column index ``0 .. len(inv_label_map) - 1`` to a label name.

    Returns
    -------
    pandas.DataFrame
        A single ``tags`` column indexed ``0 .. n_samples - 1``. Each cell holds
        the row's label names sorted alphabetically and joined by one space
        (an empty string when no label is set).
    """
    # The number of labels comes from the mapping rather than a fixed constant.
    n_labels = len(inv_label_map)

    # Collect every string first and build the frame once. Writing cell by cell
    # with res['tags'][j] = ... is chained assignment: pandas warns about it and
    # it does not write through when Copy-on-Write is active.
    tag_strings = [
        ' '.join(sorted(inv_label_map[i] for i in range(n_labels) if row[i] == 1))
        for row in one_hot_data
    ]
    return pd.DataFrame({'tags': tag_strings}, index=range(len(one_hot_data)))

def get_test_df():
    """Read ``sample_submission_v2.csv`` from the ``inputs`` directory.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV, exactly as returned by ``pd.read_csv``.
    """
    sample_path = os.path.join('inputs', 'sample_submission_v2.csv')
    return pd.read_csv(sample_path)

def create_submission(pred, sufix):
    """Write ``pred`` as a submission CSV under ``submissions/ensemble_vote_test``.

    Parameters
    ----------
    pred : sequence of rows, shape (n_samples, n_labels)
        0/1 label matrix; rows are written in order against the rows of the
        frame returned by ``get_test_df``.
    sufix : str
        Appended to ``'submission_'`` to form the file name; ``'.csv'`` is added.

    Raises
    ------
    ValueError
        If ``pred`` does not have one row per row of the sample submission.
    """
    print("Reading submission dataframe")
    df_submission = get_test_df()

    # A row-count mismatch would otherwise be index-aligned silently and leave
    # NaN or dropped tags in the written file.
    if len(pred) != len(df_submission):
        raise ValueError(
            "pred has %d rows but the sample submission has %d"
            % (len(pred), len(df_submission))
        )

    print("Creating tags")
    p_tags = to_tagging(pred, inv_label_map)
    # Item assignment always writes the column; attribute assignment
    # (df.tags = ...) only does so when the column already exists.
    df_submission['tags'] = p_tags['tags'].values

    submission_dir = os.path.join('submissions', 'ensemble_vote_test')
    # exist_ok avoids the check-then-create race of a separate os.path.exists().
    os.makedirs(submission_dir, exist_ok=True)

    file_path = os.path.join(submission_dir, 'submission_' + sufix + '.csv')
    print("Saving submission to file")
    df_submission.to_csv(file_path, index=False)
    print("%s created" % file_path)

['slash_burn', 'clear', 'blooming', 'primary', 'cloudy', 'conventional_mine', 'water', 'haze', 'cultivation', 'partly_cloudy', 'artisinal_mine', 'habitation', 'bare_ground', 'blow_down', 'agriculture', 'road', 'selective_logging']


In [14]:
# Load every submission CSV in the folder and convert each one to a 0/1 matrix.
submissions_dir = os.path.join('best_submissions', 'top2_submissions')
# glob() returns matches in arbitrary, OS-dependent order; sort them so the
# positional indices used later (df_corr's i/j, vote_ens) are reproducible.
sub_files = sorted(glob(os.path.join(submissions_dir, '*.csv')))
if not sub_files:
    # Fail here with a clear message instead of an obscure indexing error later.
    raise FileNotFoundError("No submission CSV files found in %s" % submissions_dir)
submissions = []
for s_f in sub_files:
    df_s = pd.read_csv(s_f)
    sub = dfsub_to_np(df_s)
    submissions.append(sub)
# One matrix per file, stacked along a new first axis.
submissions = np.array(submissions)
submissions.shape

61191it [00:00, 449940.41it/s]
61191it [00:00, 440229.26it/s]
61191it [00:00, 443549.41it/s]


(3, 61191, 17)

In [15]:
def mean_matthews_corrcoef(p1, p2):
    """Average the per-column Matthews correlation between two label matrices.

    Parameters
    ----------
    p1, p2 : numpy.ndarray, shape (n_samples, n_labels)
        Label matrices; column ``i`` of ``p1`` is compared with column ``i`` of
        ``p2`` using ``matthews_corrcoef``. The number of columns is taken from
        ``p1``.

    Returns
    -------
    float
        Unweighted mean of the per-column coefficients.

    Raises
    ------
    ValueError
        If ``p1`` has no columns.
    """
    # Derive the label count from the data rather than a fixed constant.
    n_labels = p1.shape[1]
    if n_labels == 0:
        raise ValueError("p1 has no label columns to compare")

    mean_mcc = 0
    for i in range(n_labels):
        mean_mcc += matthews_corrcoef(p1[:, i], p2[:, i])
    return mean_mcc / n_labels

In [16]:
# Mean per-label MCC for every unordered pair (i < j) of loaded submissions.
mcc_list = []
ij = []
n_submissions = len(submissions)
for i in tqdm(range(n_submissions)):
    # Start at i + 1 so each pair is visited once and self-pairs are skipped.
    for j in range(i + 1, n_submissions):
        mcc = mean_matthews_corrcoef(submissions[i], submissions[j])
        mcc_list.append(mcc)
        ij.append([i, j])
# reshape(-1, 2) keeps the array two-dimensional even when there are no pairs
# (fewer than two submissions), so the column slices below cannot fail.
ij = np.array(ij, dtype=int).reshape(-1, 2)
df_corr = pd.DataFrame(
    {'i': ij[:, 0], 'j': ij[:, 1], 'mcc': mcc_list},
    columns=['i', 'j', 'mcc'],
)
# Ascending sort: the least correlated pairs are listed first.
df_corr.sort_values(by='mcc')

100%|██████████| 3/3 [00:00<00:00,  8.47it/s]


Unnamed: 0,i,j,mcc
2,1,2,0.947169
1,0,2,0.950918
0,0,1,0.977665


In [17]:
def get_mcc(i, j):
    """Look up the stored mean MCC for the submission pair ``i`` and ``j``.

    The lookup is done against the module-level ``df_corr`` with the smaller
    argument matched against column ``i`` and the larger against column ``j``,
    so the argument order does not matter.

    Returns
    -------
    The ``mcc`` value of the first matching row.

    Raises
    ------
    IndexError
        If no row of ``df_corr`` matches the pair.
    """
    lo, hi = (i, j) if i < j else (j, i)
    pair_mask = (df_corr.i == lo) & (df_corr.j == hi)
    return df_corr[pair_mask].mcc.values[0]

def find_best_candidate(init_ens):
    """Pick the submission least correlated, on average, with an ensemble.

    Parameters
    ----------
    init_ens : sequence of int
        Indices into the module-level ``submissions`` that are already part of
        the ensemble.

    Returns
    -------
    int
        Index of the submission outside ``init_ens`` whose mean ``get_mcc``
        against the members of ``init_ens`` is lowest (the first such index on
        ties), or ``-1`` when every submission is already in ``init_ens``.

    Raises
    ------
    ValueError
        If ``init_ens`` is empty, since there is nothing to correlate against.
    """
    if len(init_ens) == 0:
        raise ValueError("init_ens must contain at least one submission index")

    # Start from +inf rather than 1: MCC can be exactly 1.0, and a sentinel of 1
    # with a strict '<' would reject such a candidate and wrongly return -1.
    min_mcc = float('inf')
    best_candidate = -1
    for i in np.arange(len(submissions)):
        if i in init_ens:
            continue
        # Average once per candidate, after all pairwise values are collected.
        avgmcc = np.mean([get_mcc(i, member) for member in init_ens])
        if avgmcc < min_mcc:
            best_candidate = i
            min_mcc = avgmcc
    return best_candidate

In [18]:
# Display the loaded file paths: a file's position in this list is the
# submission index used in df_corr's i/j columns and in vote_ens.
sub_files

['best_submissions\\top2_submissions\\submission__vote_179_193_201.csv',
 'best_submissions\\top2_submissions\\submission__vote_201_195_198.csv',
 'best_submissions\\top2_submissions\\submission__vote_split_3_5_7.csv']

In [19]:
# Majority-vote over the selected submissions, then print each member's mean
# per-label MCC against the voted result.
vote_ens = [0, 1, 2]
ensemble_members = submissions[vote_ens]
vote_pred = vote(ensemble_members)
for i in vote_ens:
    member_agreement = mean_matthews_corrcoef(submissions[i], vote_pred)
    print(member_agreement)

0.991830937393
0.985864938793
0.959063505694


In [20]:
# Write the voted predictions; with this suffix the file name becomes
# 'submission__vote_last.csv'.
create_submission(vote_pred, '_vote_last')

Reading submission dataframe
Creating tags


100%|██████████| 61191/61191 [00:18<00:00, 3247.25it/s]


Saving submission to file
submissions\ensemble_vote_test\submission__vote_last.csv created
