In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
# Only one time run if you do not have fastai in the current working directory or have not symlinked yet.
!ln -s ../fastai/fastai/ fastai 

In [2]:
from fastai.imports import *
from fastai.transforms import *
from fastai.conv_learner import *
from fastai.model import *
from fastai.dataset import *
from fastai.sgdr import *
from fastai.plots import *
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import numpy as np
import os

In [3]:
from sklearn.metrics import f1_score

def f1(preds, targs):
    return f1_score(targs, np.argmax(preds, axis=1), average='micro')
    
metrics = [f1]

In [5]:
PATH= "data/"
sz = 300
bs = 64
# labels_csv = f'{PATH}labels_sample.csv'
labels_csv = f'{PATH}labels.csv'
labels = pd.read_csv(labels_csv)
SEED = 2018

In [6]:
model = resnet50

In [7]:
def get_data(sz, val_idxs, test_name=None):
    tfms = tfms_from_model(model, sz, aug_tfms=transforms_top_down, max_zoom=1.1)
    return ImageClassifierData.from_csv(PATH, 'train_all', labels_csv, tfms=tfms,
                    suffix='', val_idxs=val_idxs, test_name=test_name)

In [8]:
skf = StratifiedKFold(n_splits=5, shuffle=False, random_state=SEED)
for fold, (b_id, v_id) in enumerate(skf.split(labels, labels['species'])):
    print(f'Fold: {fold+1}')
    data = get_data(sz, v_id, test_name='test')
    torch.manual_seed(SEED)
    learn = ConvLearner.pretrained(model, data, precompute=True, metrics=metrics)
    learn.precompute = False
    lr = 0.1
    learn.fit(lr, 3, cycle_len=1, cycle_mult=2)
    lrs = np.array([lr/100,lr/10,lr])
    learn.unfreeze()
    learn.fit(lrs, 3, cycle_len=1, cycle_mult=2)
    if fold == 0:
        val_log_preds = learn.TTA()
        val_preds_df = pd.DataFrame(val_log_preds[0][0], columns=data.classes)
        val_preds_df['file'] = np.vectorize(lambda x: os.path.basename(x))(data.val_ds.fnames)
        val_preds_df['species'] = data.val_ds.y
        train_preds_df = val_preds_df.copy()
        test_log_preds = learn.TTA(is_test=True)[0][0]
    else:
        test_log_preds = test_log_preds + learn.TTA(is_test=True)[0][0]
        val_log_preds = learn.TTA()
        val_preds_df = pd.DataFrame(val_log_preds[0][0], columns=data.classes)
        val_preds_df['file'] = np.vectorize(lambda x: os.path.basename(x))(data.val_ds.fnames)
        val_preds_df['species'] = data.val_ds.y
        train_preds_df = pd.concat((train_preds_df, val_preds_df), ignore_index=True)

Fold: 1
100%|██████████| 60/60 [01:06<00:00,  1.11s/it]
100%|██████████| 15/15 [00:17<00:00,  1.18s/it]


epoch      trn_loss   val_loss   f1                       
    0      1.208291   0.671543   0.769931  
    1      1.040046   0.558023   0.824375                 
    2      0.75204    0.430501   0.857986                  
    3      0.730463   0.443853   0.855694                  
    4      0.620886   0.426372   0.865278                  
    5      0.498035   0.322977   0.896528                  
    6      0.424227   0.314085   0.897569                  



epoch      trn_loss   val_loss   f1                        
    0      0.4925     0.216315   0.923889  
    1      0.341247   0.22705    0.912361                  
    2      0.198099   0.142984   0.955139                  
    3      0.187044   0.311119   0.895417                  
    4      0.150891   0.147363   0.954028                  
    5      0.111539   0.126027   0.966528                  
    6      0.084473   0.118437   0.965625                   

Fold: 2                                      
100%|██████████| 60/60 [01:05<00:00,  1.10s/it]
100%|██████████| 15/15 [00:18<00:00,  1.22s/it]


epoch      trn_loss   val_loss   f1                       
    0      1.235509   0.551195   0.812384  
    1      1.009985   0.530252   0.846836                 
    2      0.735725   0.383554   0.860494                  
    3      0.711683   0.435541   0.850733                  
    4      0.633401   0.343247   0.894213                  
    5      0.485709   0.278677   0.911265                  
    6      0.416501   0.278112   0.903974                  



epoch      trn_loss   val_loss   f1                        
    0      0.521161   0.229223   0.917515  
    1      0.337625   0.185547   0.929244                  
    2      0.210141   0.136496   0.948958                  
    3      0.218943   0.153732   0.944406                  
    4      0.164565   0.142105   0.957099                  
    5      0.120594   0.105216   0.968364                  
    6      0.086347   0.105973   0.969406                   

Fold: 3                                      
100%|██████████| 60/60 [01:06<00:00,  1.10s/it]
100%|██████████| 15/15 [00:17<00:00,  1.18s/it]


epoch      trn_loss   val_loss   f1                       
    0      1.225543   0.667517   0.760122  
    1      0.985529   0.79485    0.781211                  
    2      0.729135   0.480476   0.838895                  
    3      0.651501   0.466219   0.837854                  
    4      0.585864   0.392133   0.862461                  
    5      0.486367   0.341214   0.882252                  
    6      0.421973   0.331837   0.878086                  



epoch      trn_loss   val_loss   f1                        
    0      0.537532   0.22364    0.915586  
    1      0.35654    0.151728   0.944359                  
    2      0.216635   0.131649   0.953734                  
    3      0.197696   0.155999   0.944143                  
    4      0.165544   0.144377   0.940193                  
    5      0.124663   0.091286   0.965193                  
    6      0.093348   0.081824   0.970401                   

Fold: 4                                      
100%|██████████| 60/60 [01:05<00:00,  1.09s/it]
100%|██████████| 15/15 [00:17<00:00,  1.17s/it]


epoch      trn_loss   val_loss   f1                       
    0      1.250048   0.634108   0.794231  
    1      1.021005   0.652025   0.799279                 
    2      0.757653   0.398442   0.854407                  
    3      0.708578   0.427881   0.856651                  
    4      0.566276   0.423703   0.853846                  
    5      0.476203   0.304372   0.898798                  
    6      0.41175    0.293824   0.90609                   



epoch      trn_loss   val_loss   f1                        
    0      0.570264   0.266986   0.90609   
    1      0.349557   0.158299   0.948478                  
    2      0.215635   0.115672   0.966426                  
    3      0.230171   0.255438   0.926362                  
    4      0.179876   0.115127   0.96851                   
    5      0.124585   0.11021    0.966667                  
    6      0.102423   0.103124   0.96875                    

Fold: 5                                      
100%|██████████| 60/60 [01:05<00:00,  1.09s/it]
100%|██████████| 15/15 [00:18<00:00,  1.20s/it]


epoch      trn_loss   val_loss   f1                       
    0      1.252671   0.625764   0.784865  
    1      0.995649   0.620473   0.8221                    
    2      0.739849   0.446328   0.851797                  
    3      0.733677   0.566443   0.817136                  
    4      0.628434   0.415779   0.851266                  
    5      0.48505    0.323743   0.891136                  
    6      0.404668   0.324692   0.880188                  



epoch      trn_loss   val_loss   f1                        
    0      0.571332   0.240129   0.91248   
    1      0.386225   0.212108   0.923938                  
    2      0.250082   0.135734   0.951552                  
    3      0.206861   0.183572   0.93462                   
    4      0.163319   0.118332   0.965094                  
    5      0.130128   0.097027   0.967443                  
    6      0.09329    0.09883    0.965094                   

                                             

In [10]:
f1(np.exp(train_preds_df[data.classes].values), train_preds_df['species'])

0.9677894736842105

In [31]:
test_preds_df = pd.DataFrame(np.exp(test_log_preds/5), columns=data.classes)
test_preds_df['file'] = np.vectorize(lambda x: os.path.basename(x))(data.test_ds.fnames)

In [32]:
test_preds_df['species'] = [data.classes[i].replace('_', ' ') for i in np.argmax(test_preds_df[data.classes].values, axis=1)]

In [33]:
test_preds_df.head()

Unnamed: 0,Black-grass,Charlock,Cleavers,Common_Chickweed,Common_wheat,Fat_Hen,Loose_Silky-bent,Maize,Scentless_Mayweed,Shepherds_Purse,Small-flowered_Cranesbill,Sugar_beet,file,species
0,1.640355e-06,1.7514e-07,7.370861e-06,3.287016e-06,1.608192e-05,0.9996596,6.870133e-07,2.517068e-06,5.710407e-08,2.562196e-07,8.622136e-06,0.0001751617,12625488b.png,Fat Hen
1,3.622723e-07,1.305211e-08,3.470237e-07,1.316741e-07,1.330197e-06,0.9999396,3.130357e-07,7.326044e-08,1.174057e-08,6.038574e-08,2.486706e-06,4.568395e-06,c832e4302.png,Fat Hen
2,0.5954057,1.50711e-05,0.0008142756,0.0001240898,0.0007762262,0.000334803,0.3026007,2.263331e-05,0.0002644156,1.921733e-05,0.0002076544,9.889746e-05,e82017baa.png,Black-grass
3,6.4425e-08,4.145404e-07,2.409246e-05,1.981121e-05,6.293905e-09,2.17539e-08,8.150451e-07,4.776295e-08,0.9991668,0.0002179849,2.365305e-07,8.5825e-07,c10ccbd82.png,Scentless Mayweed
4,1.522028e-08,5.120585e-07,1.669442e-06,1.296239e-06,1.183461e-09,9.362329e-08,2.379516e-08,1.013752e-07,6.85403e-08,2.723275e-05,0.9999272,1.024319e-09,8b27bfd2b.png,Small-flowered Cranesbill


In [25]:
test_preds_df[['file', 'species']].to_csv('submit_kfold.csv', index=False)

In [26]:
from IPython.display import FileLink

In [27]:
FileLink('submit_kfold.csv')