In [None]:
from preprocess import *
from utils import *
from reports import *
from train import *

In [None]:
# Load the training inputs. Besides the band table, the ID table and the band
# list, `read` returns `le`, the object later passed to the models as `encoder`.
train_bands, train_id, bands, le = read(
    'hyper_bands_train.csv',
    'species_id_train.csv',
    'hyper_bands.csv',
)

# Load the test inputs, reusing the `le` returned by `read` above.
test_bands, test_id = read_test(
    'hyper_bands_test.csv',
    'task3_ecodse.csv',
    le,
)

In [None]:
# Build the held-out evaluation inputs from the test tables and the band list.
# The upper-case names keep these apart from the lower-case train/calib/test
# splits created further down in the notebook.
# NOTE(review): presumably X_TEST/y_TEST are feature/label arrays and TEST_cr the
# crown IDs — confirm against `prepare_testing_data`.
X_TEST, y_TEST, TEST_cr, TEST_id = prepare_testing_data(test_bands, test_id, bands)

## Random Forest

In [None]:
# Hyper-parameters for the random forest, handed to the project's `RF` wrapper
# together with the encoder `le`.
params = {
    'max_depth': 100,
    'min_impurity_decrease': 0.001,
    'min_samples_split': 2,
    'n_estimators': 1000,
}
RF_clf = RF(params=params, encoder=le)

In [None]:
# Fit the random-forest wrapper on the full training tables; the wrapper receives
# the raw band/ID frames plus the band list rather than pre-split arrays.
RF_clf.fit(train_bands, train_id, bands)

In [None]:
# Predict class probabilities for the held-out test features and pass them to
# `real_report` along with the test IDs, the underlying model, TEST_cr and `le`.
# Note: `probs` is reused by the CatBoost and XGBoost cells below, so it always
# holds the output of whichever of these cells ran last.
probs = RF_clf.predict_proba(X_TEST)
real_report(TEST_id, probs, RF_clf.model, TEST_cr, le)

## CatBoost

In [None]:
# Tuned CatBoost hyper-parameters.
params = {
    'n_estimators': 74,
    'learning_rate': 0.08048020964881546,
    'depth': 8,
    'subsample': 0.434457987840736,
    'colsample_bylevel': 0.43083256379219936,
    'min_data_in_leaf': 18,
}
# Pass the tuned values to the wrapper, as the RF and XGB cells do. Previously
# `params` was defined here but never handed to `CB`, so the tuned values were
# not used by this model.
# NOTE(review): assumes `CB` accepts `params=` like `RF` and `XGB` — confirm.
CB_clf = CB(params=params, encoder=le)

In [None]:
# Fit the CatBoost wrapper on the same training tables and band list used for
# the other models.
CB_clf.fit(train_bands, train_id, bands)

In [None]:
# Predict class probabilities for the held-out test features with the CatBoost
# wrapper and report them; this overwrites the `probs` of the previous model.
probs = CB_clf.predict_proba(X_TEST)
real_report(TEST_id, probs, CB_clf.model, TEST_cr, le)

## XGBoost

In [None]:
# Hyper-parameters for XGBoost, handed to the project's `XGB` wrapper together
# with the encoder `le`.
# NOTE(review): `eta` is above 1.0, while the XGBoost documentation gives its
# range as [0, 1] — confirm this tuned value is intended.
params = {
    'colsample_bylevel': 0.396288472513028,
    'eta': 1.0235936668221488,
    'gamma': 5.582677202898291,
    'max_depth': 9,
    'n_estimators': 1331,
    'subsample': 0.4203918921079012,
}
XGB_clf = XGB(params=params, encoder=le)

In [None]:
# Fit the XGBoost wrapper on the same training tables and band list used for
# the other models.
XGB_clf.fit(train_bands, train_id, bands)

In [None]:
# Display the wrapper's `calibrated` attribute as the cell output.
# NOTE(review): presumably the calibrated classifier produced during `fit` — confirm.
XGB_clf.calibrated

In [None]:
# Predict class probabilities for the held-out test features with the XGBoost
# wrapper and report them; this overwrites the `probs` of the previous model.
probs = XGB_clf.predict_proba(X_TEST)
real_report(TEST_id, probs, XGB_clf.model, TEST_cr, le)

In [None]:
# Step-by-step construction of the train / calibration / test arrays and the
# cross-validation splits from the raw training tables.
#
# The originally loaded `train_id` and `test_id` frames are deliberately NOT
# rebound here: derived objects get their own names (`species_labels`,
# `train_split_id`, `test_split_id`). That keeps this cell idempotent and keeps
# the full tables available to later cells that pass `train_id` on.

# Pipeline settings.
thresh = 3          # outlier threshold passed to `drop_outliers`
n_components = 20   # number of components passed to `tansform`
calib_size = 0.25   # calibration fraction passed to `split` / `resample`
test_size = 0.25    # test fraction passed to `split` / `resample`
n_samples = 400     # sample count passed to `resample`

# Keep only the crown / species columns of the label table.
species_labels = train_id[['crown_id', 'species_id']]

# Features: every column of the band table except `chm`.
X = train_bands.drop(columns=['chm'])
# Labels: one row per band-table row, joined on the crown ID.
y = pd.merge(train_bands[['crown_id']], species_labels, on='crown_id', how='inner')

# drop bad bands
X = drop_bands(X, bands)

# drop outliers
X, y = drop_outliers(X, y, thresh)

# PCA transform (`tansform` is spelled as exported by the project module)
X = tansform(X, n_components)
# Re-attach the crown IDs as the first column, aligned on the label index.
X = pd.concat(
    [y[['crown_id']], pd.DataFrame(index=y.index, data=X)], axis=1
)
X = X.reset_index(drop=True)

# train-calibrate-test split based on crown IDs
train_df, calib_df, test_df,\
    train_split_id, calib_id, test_split_id = split(X, y, species_labels, calib_size, test_size)

# resample
X_train, y_train, X_calib,\
    y_calib, X_test, y_test = resample(train_df, calib_df, test_df, n_samples, calib_size, test_size)

# prepare splits for cross-validation (must run while the X_* objects are still
# DataFrames, i.e. before the conversion to arrays below)
cv = cv_split(
    pd.concat([X_train, X_calib, X_test], axis=0),
    pd.concat([train_split_id, calib_id, test_split_id], axis=0),
    5,
)

# Convert to plain arrays: column 0 of each X frame is the crown ID, the rest are
# features; column 1 of each y frame is used as the label vector.
test_cr = X_test.iloc[:, 0].values
X_train, y_train = X_train.iloc[:, 1:].values, y_train.iloc[:, 1].values
X_calib, y_calib = X_calib.iloc[:, 1:].values, y_calib.iloc[:, 1].values
X_test, y_test = X_test.iloc[:, 1:].values, y_test.iloc[:, 1].values

In [None]:
# Stack the three splits back into one feature matrix (row-wise) and one label
# vector, in train / calibration / test order, for cross-validation.
X_cv = np.concatenate((X_train, X_calib, X_test), axis=0)
y_cv = np.concatenate((y_train, y_calib, y_test), axis=0)

In [None]:
# Mean cross-validated accuracy of the XGBoost model over the prepared `cv`
# splits, rounded to four decimals and shown as the cell output.
(
    cross_val_score(XGB_clf.model, X_cv, y_cv, cv=cv, scoring='accuracy')
    .mean()
    .round(4)
)

In [None]:
# Per-model accumulators for the sweep below: one mean accuracy and one standard
# deviation is appended per tested number of components.
rf_accuracy, rf_std = [], []
cb_accuracy, cb_std = [], []
xgb_accuracy, xgb_std = [], []
from tqdm import tqdm

In [None]:
# Cross-validated accuracy of each model as a function of `n_components`.
#
# The accumulators are (re)initialised in this cell so that re-running it starts
# from empty lists instead of appending a second sweep to the previous results.
rf_accuracy, rf_std = [], []
cb_accuracy, cb_std = [], []
xgb_accuracy, xgb_std = [], []

# (estimator, mean-accuracy list, std list) for every model in the sweep.
sweep = (
    (RF_clf.model, rf_accuracy, rf_std),
    (CB_clf.model, cb_accuracy, cb_std),
    (XGB_clf.model, xgb_accuracy, xgb_std),
)

for n_components in tqdm(np.arange(10, 100, 10)):  # 10, 20, ..., 90
    # Rebuild the data for this number of components; only the training arrays
    # and the matching CV splits are used below.
    X_train, y_train, X_calib, y_calib, X_test, y_test,\
        test_cr, test_id, train_cv = prepare_training_data(train_bands, train_id, bands, n_components=n_components)

    for model, accuracies, stds in sweep:
        res = cross_val_score(model, X_train, y_train, cv=train_cv, scoring='accuracy')
        accuracies.append(np.mean(res))
        stds.append(np.std(res))