In [2]:
import json
from tqdm import tqdm

import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

from sklearn.model_selection import *
from sklearn.metrics import *
from sklearn.neighbors import BallTree, NearestNeighbors

from catboost import CatBoostClassifier

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
[x for x in train.columns if 'nd_mean' not in x]

['id', 'area', '.geo', 'crop']

# geo features

In [5]:
def preprocess_geo(x):
    
    x = json.loads(x)
    
    lat, lon = 0, 0
    
    if x['type'] == 'Polygon':
        x = x['coordinates']
        lon = np.mean([np.mean([y[0] for y in xx]) for xx in x])
        lat = np.mean([np.mean([y[1] for y in xx]) for xx in x])
    
    elif x['type'] == 'GeometryCollection':
        x = x['geometries']
        lat, lon = 0, 0
        for x_ in x:
            x_ = x_['coordinates']
            lon += np.mean([y[0] for y in x_]) / len(x)
            lat += np.mean([y[1] for y in x_]) / len(x)
            
    return (lon, lat)

In [6]:
train['.geo'] = train['.geo'].apply(lambda x: preprocess_geo(x))
train['lon'] = train['.geo'].apply(lambda x: x[0])
train['lat'] = train['.geo'].apply(lambda x: x[1])
train = train.drop('.geo', axis=1)

In [7]:
test['.geo'] = test['.geo'].apply(lambda x: preprocess_geo(x))
test['lon'] = test['.geo'].apply(lambda x: x[0])
test['lat'] = test['.geo'].apply(lambda x: x[1])
test = test.drop('.geo', axis=1)

In [8]:
# tree = BallTree(train[['lon', 'lat']], leaf_size=2) 


# def geo_features(temp):
    
#     dist, ind = tree.query(temp[['lon', 'lat']], k=51)
#     dist, ind = dist[:, 1:], ind[:, 1:]
#     dist *= 107
    
#     nearest_fields_amount = []

#     for i in tqdm(range(temp.shape[0])):
#         dist_temp_res = np.zeros((9))
#         temp_dist = dist[i]
#         for j in range(1, 10):
#             dist_temp_res[j-1] += len(temp_dist[temp_dist < j])
#         nearest_fields_amount.append(dist_temp_res)
        
#     labs = []

#     for i in tqdm(range(temp.shape[0])):
#         labs.append(train.iloc[ind[i]].crop.values)
        
#     labs_count = []

#     for i in tqdm(range(temp.shape[0])):
#         lc_temp = np.zeros((train.crop.nunique()))
#         temp_dist = dist[i]
#         temp_labs = train.iloc[ind[i]].crop.values
#         for j in range(train.crop.nunique()):
#             lc_temp[j] += len(temp_labs[temp_labs == j])
#         labs_count.append(lc_temp)
        
#     train_nearest_dist = pd.DataFrame(nearest_fields_amount, 
#                                       columns=['nearest_fields_amount_{}'.format(x) for x in range(len(nearest_fields_amount[0]))])
    
#     train_nearest = pd.DataFrame([x[:10] for x in labs], 
#                                  columns=['nearest_labels_{}'.format(x) for x in range(10)], dtype=int)
    
#     train_nearest_count = pd.DataFrame(labs_count, 
#                                        columns=['nearest_label_{}_count'.format(x) for x in range(train.crop.nunique())])
    
#     train_nearest_count['most_frequent_label'] = train_nearest_count.apply(lambda x: np.where(x == x.max())[0][0], axis=1)
    
#     return pd.concat([train_nearest, train_nearest_count, train_nearest_dist], axis=1)

In [9]:
# train = pd.concat([train, geo_features(train)], axis=1)
# test = pd.concat([test, geo_features(test)], axis=1)

# ndvi features

In [10]:
ndvi_columns = np.sort([x for x in train.columns if 'nd_mean' in x])

In [11]:
len(ndvi_columns)

70

In [12]:
# for col in ndvi_columns:
#     train.loc[train[col] == 0, col] = np.nan
    
# for col in ndvi_columns:
#     test.loc[test[col] == 0, col] = np.nan

In [13]:
# X_train, X_val, y_train, y_val = train_test_split(train[ndvi_columns],
#                                                   train['crop'],
#                                                   test_size=0.15, 
#                                                   stratify=train['crop'],
#                                                   random_state=17)

In [14]:
# knn_svoe = NearestNeighbors(n_neighbors=1, metric='cosine')
# knn_svoe.fit(X_train, y_train)

In [15]:
# dist, idxs = knn_svoe.kneighbors(X_val, return_distance=True)

In [16]:
# recall_score(y_val, [x[0] for x in y_train.values[idxs]], average='macro')
# 0.8946843773187154

In [17]:
# recall_score(y_val, [np.argmax(np.bincount(x)) for x in y_train.values[idxs]], average='macro')

In [18]:
# for i in range(2, 12):
#     knn_svoe = NearestNeighbors(n_neighbors=i, metric='cosine')
#     knn_svoe.fit(train[ndvi_columns], train['crop'])
#     dist, idxs = knn_svoe.kneighbors(train[ndvi_columns], return_distance=True)
#     dist, idxs = dist[:, 1:], idxs[:, 1:]
#     rec_score = recall_score(train['crop'], [np.argmax(np.bincount(x)) for x in train.crop.values[idxs]], average='macro')
#     print(i, rec_score)
    
    
#     dist, idxs = knn_svoe.kneighbors(train[ndvi_columns], return_distance=True)
#     dist, idxs = dist[:, 1:], idxs[:, 1:]
#     train[f'most_freq_label_cosine_{i}'] = [np.argmax(np.bincount(x)) for x in train.crop.values[idxs]]
    
#     dist, idxs = knn_svoe.kneighbors(test[ndvi_columns], return_distance=True)
#     dist, idxs = dist[:, 1:], idxs[:, 1:]
#     test[f'most_freq_label_cosine_{i}'] = [np.argmax(np.bincount(x)) for x in train.crop.values[idxs]]

In [109]:
# def fix_ndvi(ndvi):

#     max_previous = ndvi[0]
    
#     for i in range(1, 48):
        
#         if ndvi[i] > max_previous:
#             max_previous = ndvi[i]
            
#         else:
#             max_next = max_previous
#             for j in range(i, 48):
#                 if ndvi[j] > max_next:
#                     max_next = ndvi[j]
#             ndvi[i] = (max_next + max_previous) / 2
            
#     # –––––––––––––––––––––––––––––––––––––––––––––
    
#     max_previous = ndvi[-1]
    
#     for i in range(len(ndvi)-1, 47, -1):
        
#         if ndvi[i] > max_previous:
#             max_previous = ndvi[i]
            
#         else:
#             max_next = max_previous
#             for j in range(i, 47, -1):
#                 if ndvi[j] > max_next:
#                     max_next = ndvi[j]
#             ndvi[i] = (max_next + max_previous) / 2
            
#     return ndvi

In [110]:
# plt.plot(ndvi)

In [111]:
# train_fix_ndvi = train[ndvi_columns].copy()
# train_fix_ndvi[ndvi_columns] = train_fix_ndvi[ndvi_columns].apply(lambda x: fix_ndvi(x), axis=1)

In [112]:
# test_fix_ndvi = test[ndvi_columns].copy()
# test_fix_ndvi[ndvi_columns] = test_fix_ndvi[ndvi_columns].apply(lambda x: fix_ndvi(x), axis=1)

In [103]:
train_fix_ndvi = train[ndvi_columns].copy()
for col in ndvi_columns:
    train_fix_ndvi.loc[train_fix_ndvi[col] == 0, col] = np.nan
    
test_fix_ndvi = test[ndvi_columns].copy()
for col in ndvi_columns:
    test_fix_ndvi.loc[test_fix_ndvi[col] == 0, col] = np.nan

In [93]:
train_fix_ndvi = train[ndvi_columns].copy()
for col in ndvi_columns:
    train_fix_ndvi.loc[train_fix_ndvi[col] < 0.1, col] = np.nan
    
test_fix_ndvi = test[ndvi_columns].copy()
for col in ndvi_columns:
    test_fix_ndvi.loc[test_fix_ndvi[col] < 0.1, col] = np.nan

In [120]:
def ndvi_features(train):

    train_agg = pd.DataFrame()
        
    ndvi_train = train[ndvi_columns].copy()

    # # rolling window mean
    # for j in range(2, 11):
    #     train[[f'rolling_{j}_mean_{i}' for i in range(len(ndvi_columns)-(j-1))]] = train[ndvi_columns].rolling(axis=1, window=j).mean().values[:, (j-1):]

    # rolling window sum
    for j in range(2, 7):
        train_agg[[f'rolling_{j}_sum_{i}' for i in range(len(ndvi_columns)-(j-1))]] = train[ndvi_columns].rolling(axis=1, window=j).sum().values[:, (j-1):]

    # # rolling window min
    # for j in range(2, 11):
    #     train_agg[[f'rolling_{j}_min_{i}' for i in range(len(ndvi_columns)-(j-1))]] = train[ndvi_columns].rolling(axis=1, window=j).min().values[:, (j-1):]

    # rolling window max
    for j in range(2, 7):
        train_agg[[f'rolling_{j}_max_{i}' for i in range(len(ndvi_columns)-(j-1))]] = train[ndvi_columns].rolling(axis=1, window=j).max().values[:, (j-1):]

    # # rolling window std
    # for j in range(2, 11):
    #     train_agg[[f'rolling_{j}_std_{i}' for i in range(len(ndvi_columns)-(j-1))]] = train[ndvi_columns].rolling(axis=1, window=j).std().values[:, (j-1):]

    # # rolling window median
    # for j in range(2, 11):
    #     train_agg[[f'rolling_{j}_median_{i}' for i in range(len(ndvi_columns)-(j-1))]] = train[ndvi_columns].rolling(axis=1, window=j).median().values[:, (j-1):]

    # # rolling window quantile
    # for j in range(2, 11):
    #     for k in range(1, 10, 1): 
    #         train_agg[[f'rolling_{j}_{k}_quatile_{i}' for i in range(len(ndvi_columns)-(j-1))]] = train[ndvi_columns].rolling(axis=1, window=j).quantile(k/10, axis=1).values[:, (j-1):]

    # speed
    for j in range(1, 11, 3): 
        for i in range(j, len(ndvi_columns), j):
            train_agg['speed_{}_{}'.format(j, ndvi_columns[i])] = ndvi_train[ndvi_columns[i]] - ndvi_train[ndvi_columns[i-j]]
    
    # train_agg['std_'] = ndvi_train.std(axis=1) 
    # train_agg['mean_'] = ndvi_train.mean(axis=1) 
    # train_agg['median_'] = ndvi_train.median(axis=1) 
    # train_agg['min_'] = ndvi_train.min(axis=1) 
    # train_agg['max_'] = ndvi_train.max(axis=1)
    # train_agg['sum_'] = ndvi_train.sum(axis=1)
        
    # print('Base done!')
    
    # for i in range(1, 10, 2): 
    #     train_agg['quant_{}'.format(i)] = ndvi_train.quantile(i/10, axis=1)
        
    # print('Quantile done!')
        
    # def max_day(x):
    #     try:
    #         return np.where(x == x.max())[0][0]
    #     except:
    #         return np.nan
    # def min_day(x):
    #     try:
    #         return np.where(x == x.min())[0][0]
    #     except:
    #         return np.nan
    
    # maxd = ndvi_train.apply(lambda x: max_day(x), axis=1)
    # mindl = ndvi_train.apply(lambda x: min_day(x[:len(ndvi_columns)//2]), axis=1)
    # mindr = ndvi_train.apply(lambda x: min_day(x[len(ndvi_columns)//2:]), axis=1)
    
    # train_agg['max_day'] = maxd
    # train_agg['min_day_left'] = mindl
    # train_agg['min_day_right'] = mindr
        
    # print('Days done!')
        
    # train_agg['dur_left'] = np.abs(maxd - mindl)
    # train_agg['dur_right'] = np.abs(maxd - mindr)
        
    # print('Dur done!')
            
    # for i in range(1, 10, 2):
        
    #     def day_more_quant(x):
    #         return len(np.where(x.values > np.nanquantile(x.values, i/10))[0])
    #     def mean_day_more_quant(x):
    #         return np.mean(np.where(x.values > np.nanquantile(x.values, i/10))[0])
        
    #     train_agg['more_quant_{}'.format(i)] = ndvi_train.apply(lambda x: day_more_quant(x), axis=1)
    #     train_agg['mean_more_quant_{}'.format(i)] = ndvi_train.apply(lambda x: mean_day_more_quant(x), axis=1)

    return train_agg

train_agg = ndvi_features(train)
test_agg = ndvi_features(test)

# train_fix_ndvi_agg = ndvi_features(train_fix_ndvi)
# test_fix_ndvi_agg = ndvi_features(test_fix_ndvi)

  self[col] = igetitem(value, i)
  train_agg['speed_{}_{}'.format(j, ndvi_columns[i])] = ndvi_train[ndvi_columns[i]] - ndvi_train[ndvi_columns[i-j]]


In [121]:
train_fix_ndvi.columns = ['fix_'+x for x in train_fix_ndvi.columns]
test_fix_ndvi.columns = ['fix_'+x for x in test_fix_ndvi.columns]

In [122]:
# train_fix_ndvi_agg.columns = ['fix_'+x for x in train_fix_ndvi_agg.columns]
# test_fix_ndvi_agg.columns = ['fix_'+x for x in test_fix_ndvi_agg.columns]

train_fix_ndvi.columns = ['fix_'+x for x in train_fix_ndvi.columns]
test_fix_ndvi.columns = ['fix_'+x for x in test_fix_ndvi.columns]# model

In [123]:
# cat_features = [f'nearest_labels_{i}' for i in range(10)]
# cat_features.append('most_frequent_label')
# for i in range(2, 12):
#     cat_features.append(f'most_freq_label_cosine_{i}')

In [124]:
X_train, X_val, y_train, y_val = train_test_split(pd.concat([
                                                            train.drop(['id', 'crop'], axis=1),
                                                            # train_agg,
                                                            train_fix_ndvi,
                                                            # train_fix_ndvi_agg
                                                            ], axis=1),
                                                  # .drop(ndvi_columns, axis=1), 
                                                  train['crop'],
                                                  test_size=0.1, stratify=train['crop'],
                                                  random_state=17)

In [125]:
X_train.shape

(4347, 914)

In [126]:
model = CatBoostClassifier(
    verbose=100,
    iterations=5000,
    early_stopping_rounds=500,
    loss_function='MultiClassOneVsAll',
    auto_class_weights='Balanced',
    # cat_features=cat_features
)

In [127]:
model.fit(X_train, y_train, eval_set=(X_val, y_val))

0:	learn: 0.6745421	test: 0.6745054	best: 0.6745054 (0)	total: 657ms	remaining: 54m 44s
100:	learn: 0.1384574	test: 0.1427514	best: 0.1427514 (100)	total: 45.9s	remaining: 37m 4s
200:	learn: 0.0757548	test: 0.0818781	best: 0.0818781 (200)	total: 1m 29s	remaining: 35m 45s
300:	learn: 0.0547456	test: 0.0619738	best: 0.0619738 (300)	total: 2m 15s	remaining: 35m 17s
400:	learn: 0.0437239	test: 0.0521526	best: 0.0521526 (400)	total: 3m	remaining: 34m 31s
500:	learn: 0.0380506	test: 0.0476270	best: 0.0476270 (500)	total: 3m 44s	remaining: 33m 38s
600:	learn: 0.0339517	test: 0.0447696	best: 0.0447696 (600)	total: 4m 29s	remaining: 32m 50s
700:	learn: 0.0308186	test: 0.0428340	best: 0.0428340 (700)	total: 5m 13s	remaining: 32m 4s
800:	learn: 0.0283770	test: 0.0412507	best: 0.0412507 (800)	total: 5m 56s	remaining: 31m 9s
900:	learn: 0.0264580	test: 0.0398770	best: 0.0398770 (900)	total: 6m 39s	remaining: 30m 16s
1000:	learn: 0.0248504	test: 0.0389238	best: 0.0389238 (1000)	total: 7m 22s	remaini

<catboost.core.CatBoostClassifier at 0x1b76af1f0>

In [128]:
recall_score(y_val, model.predict(X_val), average='macro')

0.9631836240045196

In [129]:
fip = pd.DataFrame()
fip['f'] = model.feature_names_
fip['v'] = model.get_feature_importance()
fip = fip.sort_values('v', ascending=False).reset_index(drop=True)

In [130]:
fip.shape

(914, 2)

In [131]:
fip.head(10)

Unnamed: 0,f,v
0,rolling_5_max_37,4.692148
1,rolling_4_max_18,1.905504
2,lat,1.566117
3,speed_1_nd_mean_2021-06-28,1.533139
4,fix_fix_nd_mean_2021-08-01,1.402458
5,rolling_3_sum_0,1.392803
6,lon,1.273159
7,rolling_2_max_40,1.238008
8,rolling_6_max_16,1.145351
9,rolling_2_max_56,1.054498


In [132]:
models = []

for i in range(100):
    X_train, X_val, y_train, y_val = train_test_split(pd.concat([
                                                            train.drop(['id', 'crop'], axis=1),
                                                            # train_agg,
                                                            train_fix_ndvi,
                                                            # train_fix_ndvi_agg
                                                            ], axis=1),
                                                  # .drop(ndvi_columns, axis=1), 
                                                  train['crop'],
                                                  test_size=0.1, stratify=train['crop'],
                                                  random_state=int(17*i))
    model = CatBoostClassifier(
        verbose=200,
        iterations=5000,
        early_stopping_rounds=500,
        loss_function='MultiClassOneVsAll',
        auto_class_weights='Balanced',
        # cat_features=cat_features
    )
    model.fit(X_train, y_train, eval_set=(X_val, y_val))
    
    if recall_score(y_val, model.predict(X_val), average='macro') > 0.977:
        models.append(model)
        
    print(recall_score(y_val, model.predict(X_val), average='macro'))

0:	learn: 0.6750540	test: 0.6751018	best: 0.6751018 (0)	total: 120ms	remaining: 10m
200:	learn: 0.0836917	test: 0.0904375	best: 0.0904375 (200)	total: 17.4s	remaining: 6m 55s
400:	learn: 0.0466732	test: 0.0560377	best: 0.0560377 (400)	total: 34s	remaining: 6m 30s
600:	learn: 0.0342697	test: 0.0463867	best: 0.0463867 (600)	total: 49.5s	remaining: 6m 2s
800:	learn: 0.0279509	test: 0.0424093	best: 0.0424093 (800)	total: 1m 4s	remaining: 5m 38s
1000:	learn: 0.0240028	test: 0.0402443	best: 0.0402443 (1000)	total: 1m 19s	remaining: 5m 18s
1200:	learn: 0.0209486	test: 0.0384879	best: 0.0384845 (1199)	total: 1m 35s	remaining: 5m
1400:	learn: 0.0184527	test: 0.0372544	best: 0.0372544 (1400)	total: 1m 51s	remaining: 4m 45s
1600:	learn: 0.0166035	test: 0.0364008	best: 0.0364008 (1600)	total: 2m 7s	remaining: 4m 30s
1800:	learn: 0.0149170	test: 0.0356856	best: 0.0356856 (1800)	total: 2m 22s	remaining: 4m 12s
2000:	learn: 0.0135134	test: 0.0351327	best: 0.0351233 (1999)	total: 2m 36s	remaining: 3m 

KeyboardInterrupt: 

In [133]:
len(models)

15

In [134]:
preds = []

for model in models:

    preds.append(model.predict_proba(pd.concat([test, 
                                                # test_agg, 
                                                test_fix_ndvi, 
                                                # test_fix_ndvi_agg
                                               ], axis=1)[model.feature_names_]))

preds = np.mean(preds, axis=0)

In [135]:
preds_binary = [np.where(x == x.max())[0][0] for x in preds]

In [136]:
test['crop'] = preds_binary

In [137]:
test[['id', 'crop']].to_csv('sub_12.csv', index=False)

# test

In [91]:
test['crop'] = model.predict(pd.concat([test, 
                                        # test_agg, 
                                        test_fix_ndvi, 
                                        # test_fix_ndvi_agg
                                       ], axis=1)[model.feature_names_])

In [92]:
test[['id', 'crop']].to_csv('sub_11_nanndvi.csv', index=False)

# skf

In [51]:
X = pd.concat([train.drop(['id', 'crop'], axis=1), 
               # train_agg, 
               # train_fix_ndvi, train_fix_ndvi_agg
              ], axis=1)
y = train['crop']

In [52]:
# X = X[fip[fip.v > 0.05].f.values]
# cat_features = [x for x in cat_features if x in X.columns]

In [53]:
100/10

10.0

In [54]:
models = []

for train_index, val_index in StratifiedKFold(n_splits=5, shuffle=True, random_state=17).split(X, y):
    
    X_train = X.iloc[train_index]
    X_val = X.iloc[val_index]
    y_train = y.iloc[train_index]
    y_val = y.iloc[val_index]
    
    model = CatBoostClassifier(
        verbose=1000,
        iterations=5000,
        early_stopping_rounds=500,
        loss_function='MultiClassOneVsAll',
        auto_class_weights='Balanced',
        class_names=[0, 1, 2, 3, 4, 5, 6],
        # cat_features=cat_features
    )
    
    model.fit(X_train, y_train, eval_set=(X_val, y_val))
    
    print(recall_score(y_val, model.predict(X_val), average='macro'))
    
    models.append(model)

0:	learn: 0.6747011	test: 0.6747853	best: 0.6747853 (0)	total: 54.9ms	remaining: 4m 34s
1000:	learn: 0.0251014	test: 0.0384527	best: 0.0384527 (1000)	total: 38.3s	remaining: 2m 33s
2000:	learn: 0.0138605	test: 0.0324835	best: 0.0324835 (2000)	total: 1m 15s	remaining: 1m 53s
3000:	learn: 0.0088285	test: 0.0300996	best: 0.0300967 (2998)	total: 1m 50s	remaining: 1m 13s
4000:	learn: 0.0062061	test: 0.0290249	best: 0.0290173 (3982)	total: 2m 26s	remaining: 36.5s
4999:	learn: 0.0046413	test: 0.0283716	best: 0.0283660 (4964)	total: 3m 2s	remaining: 0us

bestTest = 0.02836595005
bestIteration = 4964

Shrink model to first 4965 iterations.
0.9693019132249582
0:	learn: 0.6753994	test: 0.6758702	best: 0.6758702 (0)	total: 42.1ms	remaining: 3m 30s
1000:	learn: 0.0246524	test: 0.0376076	best: 0.0376076 (1000)	total: 34.6s	remaining: 2m 18s
2000:	learn: 0.0135443	test: 0.0312230	best: 0.0312230 (2000)	total: 1m 9s	remaining: 1m 43s
3000:	learn: 0.0085987	test: 0.0283882	best: 0.0283847 (2997)	total:

In [59]:
preds = []

for model in models:

    preds.append(model.predict_proba(pd.concat([test, 
                                                # test_agg, 
                                                # test_fix_ndvi, test_fix_ndvi_agg
                                               ], axis=1)[model.feature_names_]))

preds = np.mean(preds, axis=0)

In [60]:
preds_binary = [np.where(x == x.max())[0][0] for x in preds]

In [61]:
test['crop'] = preds_binary

In [58]:
test[['id', 'crop']].to_csv('sub_10_skf_deffeatures.csv', index=False)

# skf on best features (random quantity)

In [220]:
fip_values = []
for model in models:
    fip_values.append(model.get_feature_importance())
fip_values = np.mean(fip_values, axis=0)

fip = pd.DataFrame()
fip['f'] = model.feature_names_
fip['v'] = fip_values
fip = fip.sort_values('v', ascending=False).reset_index(drop=True)

In [244]:
ths = [0.0, 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.2, 0.25]

In [245]:
models2 = []
i = 0

for train_index, val_index in StratifiedKFold(n_splits=10, shuffle=True, random_state=17).split(X, y):
    
    features = fip.loc[fip.v > ths[i], 'f'].values
    i += 1
    
    X_train = X[features].iloc[train_index]
    X_val = X[features].iloc[val_index]
    y_train = y.iloc[train_index]
    y_val = y.iloc[val_index]
    
    model = CatBoostClassifier(
        verbose=200,
        iterations=2000,
        early_stopping_rounds=500,
        loss_function='MultiClassOneVsAll',
        auto_class_weights='Balanced',
        class_names=[0, 1, 2, 3, 4, 5, 6]
    )
    
    model.fit(X_train, y_train, eval_set=(X_val, y_val))
    
    print(recall_score(y_val, model.predict(X_val), average='macro'))
    
    models2.append(model)

0:	learn: 0.6751575	test: 0.6751879	best: 0.6751879 (0)	total: 136ms	remaining: 4m 32s
200:	learn: 0.0821651	test: 0.0857267	best: 0.0857267 (200)	total: 21.6s	remaining: 3m 13s
400:	learn: 0.0456246	test: 0.0519329	best: 0.0519329 (400)	total: 39.2s	remaining: 2m 36s
600:	learn: 0.0349557	test: 0.0438038	best: 0.0438038 (600)	total: 58.7s	remaining: 2m 16s
800:	learn: 0.0292658	test: 0.0401763	best: 0.0401763 (800)	total: 1m 19s	remaining: 1m 58s
1000:	learn: 0.0252475	test: 0.0374239	best: 0.0374239 (1000)	total: 1m 39s	remaining: 1m 38s
1200:	learn: 0.0223050	test: 0.0355731	best: 0.0355731 (1200)	total: 1m 58s	remaining: 1m 18s
1400:	learn: 0.0199500	test: 0.0342222	best: 0.0342222 (1400)	total: 2m 16s	remaining: 58.5s
1600:	learn: 0.0179469	test: 0.0330418	best: 0.0330418 (1600)	total: 2m 33s	remaining: 38.2s
1800:	learn: 0.0162998	test: 0.0323264	best: 0.0323165 (1794)	total: 2m 50s	remaining: 18.8s
1999:	learn: 0.0148476	test: 0.0316808	best: 0.0316808 (1999)	total: 3m 6s	remain

In [246]:
preds = []

for model in models2:
    preds.append(model.predict_proba(pd.concat([test, test_agg], axis=1)[model.feature_names_]))
    
preds = np.mean(preds, axis=0)

In [247]:
preds_binary = [np.where(x == x.max())[0][0] for x in preds]

In [248]:
test['crop'] = preds_binary

In [249]:
test[['id', 'crop']].to_csv('sub_4_skf_bestfeatures.csv', index=False)