In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
from fastai.conv_learner import *
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.metrics import log_loss

import pandas as pd

In [3]:
PATH = "data/"
labels_df = pd.read_csv(f"{PATH}labels.csv")

In [4]:
models_dict = {"resnet34": [resnet34, 299]}

In [5]:
model_name = "resnet34"
nfold = 5

In [6]:
f_model = models_dict[model_name][0]
tfms = tfms_from_model(f_model, models_dict[model_name][1])
bm = BasicModel(f_model(True).cuda(), name=model_name)
folds = list(StratifiedKFold(n_splits=nfold, shuffle=True, random_state=2018).split(labels_df, labels_df['breed']))
for fold_idx, (build_index, val_index) in enumerate(folds):
    print(f"Fold: {fold_idx+1}")
    data = ImageClassifierData.from_csv(PATH, folder="train", csv_fname=f"{PATH}labels.csv", val_idxs=val_index,
                                        tfms=tfms, suffix=".jpg", test_name="test")
    learn = ConvLearner(data, bm)
    val_ids = [x.split("/")[1].split(".")[0] for x in data.val_dl.dataset.fnames]
    print("Val preds")
    val_preds, val_y = learn.TTA()
    val_df = pd.DataFrame(val_preds[0], columns=[model_name+f'_{i}' for i in range(val_preds[0].shape[1])])
    val_df['id'] = val_ids
    val_df['breed'] = [data.classes[x] for x in val_y]
    print("Test preds")
    test_preds, test_y = learn.TTA(is_test=True)
    if fold_idx == 0:
        train_imagenet_preds_df = val_df
        test_imagenet_preds = test_preds[0]
    else:
        train_imagenet_preds_df = pd.concat((train_imagenet_preds_df, val_df), ignore_index=True)
        test_imagenet_preds = test_imagenet_preds+test_preds[0]
test_ids = [x.split("/")[1].split(".")[0] for x in data.test_dl.dataset.fnames]
test_imagenet_preds_df = pd.DataFrame(test_imagenet_preds/nfold, columns=[model_name+f'_{i}' for i in range(test_imagenet_preds.shape[1])])
test_imagenet_preds_df['id'] = test_ids

Fold: 1
Val preds
Test preds                                   
Fold: 2                                      
Val preds
Test preds                                   
Fold: 3                                      
Val preds
Test preds                                   
Fold: 4                                      
Val preds
Test preds                                   
Fold: 5                                      
Val preds
Test preds                                   
                                             

In [7]:
train_imagenet_preds_df.shape, test_imagenet_preds_df.shape

((10222, 1002), (10357, 1001))

In [9]:
cols_for_model = np.setdiff1d(train_imagenet_preds_df.columns, ['id', 'breed'])
target_enc = LabelEncoder().fit(train_imagenet_preds_df['breed'])
y = target_enc.transform(train_imagenet_preds_df['breed'])

In [10]:
lr = LogisticRegression(multi_class='multinomial', solver='lbfgs', random_state=2018)

In [11]:
cross_val_score(lr, train_imagenet_preds_df[cols_for_model], y, cv=5, scoring='neg_log_loss')

array([-0.8965 , -0.97263, -0.86646, -0.87123, -1.03841])

In [12]:
train_imagenet_preds_df.head()

Unnamed: 0,resnet34_0,resnet34_1,resnet34_2,resnet34_3,resnet34_4,resnet34_5,resnet34_6,resnet34_7,resnet34_8,resnet34_9,...,resnet34_992,resnet34_993,resnet34_994,resnet34_995,resnet34_996,resnet34_997,resnet34_998,resnet34_999,id,breed
0,-0.070379,-1.07608,-0.009911,-0.337891,1.501714,1.016293,-3.892714,-2.150835,-1.124098,-1.062657,...,-1.850504,-0.176857,-0.349097,0.525262,4.404806,2.229181,1.119235,1.655074,0097c6242c6f3071762d9f85c3ef1b2f,bedlington_terrier
1,-4.109834,1.862212,-2.63879,-3.052189,-3.084376,-2.389941,-3.924857,-3.730031,-2.870955,-4.085822,...,-0.614067,-3.907355,-3.986597,0.338468,0.622629,-1.512924,1.547756,3.018553,0100f55e4f0fe28f2c0465d3fc4b9897,golden_retriever
2,-2.082167,-2.613133,-0.390231,-1.741671,-2.456761,-3.437425,-3.569693,-0.294346,-2.450206,1.969398,...,-2.192272,-2.836021,-1.387077,-3.307569,-0.911991,-1.979435,2.805132,1.905234,013c43f40f0fb13fa25ac0c2a70fd48f,lakeland_terrier
3,-0.480021,-1.02604,-0.329162,-0.494634,-1.020971,-0.748823,-2.104345,-1.900351,-3.337512,-2.217547,...,-2.72506,0.244166,-0.584584,0.866015,0.085541,-1.085953,2.041765,1.654655,015b363b062f602e7ec04ce28e640d05,walker_hound
4,-0.325908,1.281235,-0.516538,-0.750131,-1.060617,-0.411316,-1.993077,0.072701,0.717892,-2.709718,...,0.145767,1.324569,-0.525506,-0.984244,1.074997,1.534111,1.397253,1.104915,01e2245b46eb747260ff80f1c892daef,brittany_spaniel


In [13]:
from sklearn.model_selection import train_test_split

In [15]:
b, v, y_b, y_v = train_test_split(train_imagenet_preds_df[cols_for_model], y, test_size=0.2, stratify=y, random_state=2018)

In [18]:
lr = LogisticRegression(multi_class='multinomial', solver='lbfgs', random_state=2018)

In [19]:
lr.fit(b, y_b)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=2018, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [27]:
log_loss(y_v, lr.predict_proba(v).clip(0.001, 0.999))

0.849637563631287

In [22]:
from sklearn.metrics import accuracy_score

In [23]:
accuracy_score(y_v, lr.predict(v))

0.8283618581907091

In [59]:
from sklearn.decomposition import PCA
pca = PCA(n_components=10)
t_pca = pca.fit_transform(train_imagenet_preds_df.loc[:, cols_for_model])

In [28]:
import xgboost as xgb

In [60]:
# dtrain = xgb.DMatrix(train_imagenet_preds_df.loc[:, cols_for_model[:10]], y)
dtrain = xgb.DMatrix(t_pca, y)

In [67]:
params = {'objective': 'multi:softprob',
          'booster': 'gbtree',
          'eval_metric': 'mlogloss',
          'nthread': 4,
          'num_class': 120,
          'silent': 1,
          'max_depth': 6,
          'subsample': 0.8,
          'min_child_weight': 1,
          "colsample_bytree": 0.9,
          'eta': 0.1,
          'verbose_eval': True,
          'seed': 0}

In [34]:
train_imagenet_preds_df.to_csv('train_resnet34.csv', index=False)
test_imagenet_preds_df.to_csv('test_resnet34.csv', index=False)

In [35]:
from IPython.display import FileLink

In [36]:
FileLink('train_resnet34.csv')

In [37]:
FileLink('test_resnet34.csv')

In [64]:
clf_xgb_cv = xgb.cv(params, dtrain, num_boost_round=1000, early_stopping_rounds=30, nfold=5, verbose_eval=True)

[0]	train-mlogloss:4.00163+0.00812278	test-mlogloss:4.16673+0.0196928
[1]	train-mlogloss:3.47353+0.00811968	test-mlogloss:3.76275+0.0254611
[2]	train-mlogloss:3.09452+0.00578946	test-mlogloss:3.48495+0.0256306
[3]	train-mlogloss:2.79809+0.00588376	test-mlogloss:3.27116+0.0262844
[4]	train-mlogloss:2.56101+0.00588586	test-mlogloss:3.09932+0.0244681
[5]	train-mlogloss:2.36281+0.00557564	test-mlogloss:2.95497+0.0251979
[6]	train-mlogloss:2.19296+0.00618565	test-mlogloss:2.83186+0.0244697


KeyboardInterrupt: 