In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
%matplotlib inline

## Submission results

In [None]:
# Submission score per experiment. Insertion order matters: the plotting cell
# draws one bar per entry in this order and colours them by group.
results = {
    # Baseline model
    "Baseline": 0.0229,
    "Baseline + augment": 0.0278,

    # Variants that still use the baseline model
    "HOG as 4th channel": 0.0346,
    "Clustered images": 0.0202,
    "Canny edge detection": 0.0247,

    # Three models using location
    "Three models for location": 0.0280,

    # Structured pipeline
    "Sorted train/val split": 0.0416,
    "Blur-Noise": 0.0396,
    "Canny edge detection 2": 0.0304,

    # Structured pipeline and concatenation
    "Images + location": 0.0357,
    "Images + HOG": 0.0304,
    "Images + location + HOG": 0.0265,
    "Images + location + HOG + Canny": 0.0178,
}

# Reference scores drawn as vertical lines in the results plot.
comparisons = {
    "Zindi benchmark": 0.0387,
    "Randomized": 0.0228,
    "Zindi top score": 0.0752,
}

In [None]:
# Bar colours: one per entry of `results`, in the same order, grouped by approach.
color = (
    ['coral'] * 5            # baseline model and its variants
    + ['gold']               # three models for location
    + ['yellowgreen'] * 3    # structured pipeline
    + ['lightblue'] * 4      # structured pipeline + concatenation
)

plt.rcParams.update({'font.size': 14})

fig, ax = plt.subplots(figsize=(18, 10))

# One horizontal bar per experiment, with the experiment name as tick label.
bar_positions = range(len(results))
hbars = ax.barh(bar_positions, list(results.values()), align='center', color=color)
ax.set_yticks(bar_positions)
ax.set_yticklabels(list(results.keys()))

# Vertical reference lines for the comparison scores: (key, line colour, caption).
reference_lines = [
    ('Zindi benchmark', 'red', 'Zindi Benchmark'),
    ('Randomized', 'black', 'Randomized'),
    ('Zindi top score', 'darkgreen', 'Zindi Top Score'),
]
for key, line_color, _ in reference_lines:
    ax.axvline(x=comparisons[key], color=line_color)

# Print each bar's score next to it, rounded to three decimals.
ax.bar_label(hbars, fmt='%.3f', padding=3)

# Caption every reference line slightly to its right, just above the last bar.
caption_y = len(results) - .5
for key, _, caption in reference_lines:
    ax.text(comparisons[key] + 0.0002, caption_y, caption)

ax.set_xlim(right=0.09)
ax.set_xlabel('top 5 mean average precision (mAP5)')

# Manual legend: the colours encode the approach group, not individual bars.
legend_elements = [
    Patch(facecolor='lightblue', label='Sorted + Concat Models'),
    Patch(facecolor='yellowgreen', label='Sorted'),
    Patch(facecolor='gold', label='3 Models'),
    Patch(facecolor='coral', label='Baseline'),
]
ax.legend(handles=legend_elements, loc='lower right')

plt.show()
# Optional export of the figure:
#fig.savefig('results_barplot.png', bbox_inches='tight', dpi=300)

In [None]:
# The benchmark score is stored in `comparisons`; `results` only holds our own
# experiments and has no 'Zindi benchmark' key.
comparisons['Zindi benchmark']

## EDA of submission files (turtle IDs)

In [None]:
# Submission file: the cells below read its prediction1..prediction5 columns.
df = pd.read_csv(filepath_or_buffer='../data/submission.csv')
# Training table: the cells below read its turtle_id column.
train = pd.read_csv(filepath_or_buffer='../data/train_corrected.csv')

In [None]:
# Display the submission table as loaded from ../data/submission.csv.
df

In [None]:
# Number of distinct values in each column of the submission table.
df.nunique()

In [None]:
# How often each turtle id occurs in each of the five prediction columns
# (prediction1 .. prediction5); one value_counts Series per column.
p1, p2, p3, p4, p5 = (
    df[f'prediction{rank}'].value_counts() for rank in range(1, 6)
)

In [None]:
# Stack the five per-column count Series end to end (row-wise, the default
# axis). A turtle id predicted in several columns appears once per column here.
p_full = pd.concat(
    [p1, p2, p3, p4, p5],
)

In [None]:
# Number of distinct values in the stacked counts.
# NOTE(review): the values of p_full are occurrence counts and the turtle ids
# are in its index, so this counts distinct count values, not distinct ids —
# confirm that is what was intended.
p_full.nunique()

In [None]:
# Turn the stacked counts into a two-column frame (turtle_id, count).
# The rename map covers the value column being labelled 0; if that column is
# already called 'count' the extra entry is simply ignored.
# NOTE: this rebinds `df`, which previously held the submission table, so the
# cells above cannot be re-run without reloading the CSV first.
df = (
    p_full
    .reset_index()
    .rename(columns={'index': 'turtle_id', 0: 'count'})
)
df

In [None]:
# Number of distinct turtle ids that occur in any of the prediction columns.
df['turtle_id'].nunique()

In [None]:
# Total occurrences per turtle id across all prediction columns,
# listed from least to most frequently predicted.
(
    df
    .groupby('turtle_id')
    .sum()
    .sort_values(by='count')
)

In [None]:
# Number of rows per turtle id in the training table (most frequent first).
train['turtle_id'].value_counts()

In [None]:
# Total number of times each turtle id was predicted, one row per id.
# groupby sorts by the group key, so the rows come out ordered by turtle_id.
df_id = df.groupby('turtle_id')['count'].sum().reset_index()

# Number of training rows per turtle id, as columns (turtle_id, count).
# The index and value names produced by value_counts() differ between pandas
# versions, so both are set explicitly here instead of renaming the defaults
# (renaming 'turtle_id' -> 'count' yields two 'count' columns on pandas >= 2.0).
train_id = (
    train['turtle_id']
    .value_counts()
    .rename_axis('turtle_id')
    .reset_index(name='count')
    .sort_values(by='turtle_id')
)

In [None]:
# Display the per-turtle prediction counts.
df_id

In [None]:
# Display the per-turtle training counts.
train_id

In [None]:
# Put prediction counts and training counts side by side, matched on turtle_id.
# The outer join keeps ids present in only one of the two tables; their missing
# count is NaN. The two 'count' columns become count_pred and count_train.
pred_train = pd.merge(
    df_id,
    train_id,
    on='turtle_id',
    how='outer',
    suffixes=('_pred', '_train'),
)
pred_train

In [None]:
# The 20 turtle ids that were predicted most often.
(
    pred_train
    .sort_values(by='count_pred', ascending=False)
    .head(20)
)

In [None]:
# The last 20 rows when ordered by prediction count, descending. Ids with no
# prediction count (NaN after the outer merge) sort last and would show up here.
(
    pred_train
    .sort_values(by='count_pred', ascending=False)
    .tail(20)
)

In [None]:
# The 20 turtle ids with the most rows in the training table.
(
    pred_train
    .sort_values(by='count_train', ascending=False)
    .head(20)
)

In [None]:
# The last 20 rows when ordered by training count, descending. Ids with no
# training count (NaN after the outer merge) sort last and would show up here.
(
    pred_train
    .sort_values(by='count_train', ascending=False)
    .tail(20)
)