In [1]:
from base_layer_utils import BaseLayerDataRepo, BaseLayerResultsRepo, ModelName
from base_layer_utils import SklearnBLE
from base_layer_utils import compute_layer2_oof
import pandas as pd
import numpy as np
import time, re, gc
from sklearn.metrics import roc_auc_score

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
PATH = '~/data/toxic/data/'

# Read the train and test CSVs from the shared data directory.
train, test = (pd.read_csv(PATH + csv_name) for csv_name in ('train.csv', 'test.csv'))

# Report the shape of each frame, train first.
print(train.shape, test.shape, sep='\n')

label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

(159571, 8)
(153164, 2)


# stacking

In [3]:
def combine_layer_oof_per_label(layer1_oof_dict, label):
    """
    Util method for stacking: join every array stored under ``label`` column-wise.

    Parameters
    ----------
    layer1_oof_dict : mapping
        Maps a label name to a sequence of arrays. The arrays are
        concatenated along axis 1, so they must agree on every other
        dimension.
    label : hashable
        Key whose arrays should be combined.

    Returns
    -------
    numpy.ndarray or None
        ``None`` when the sequence is empty; the single array itself (not a
        copy) when there is exactly one; otherwise a new array holding all
        entries concatenated along axis 1, in sequence order.
    """
    data_list = layer1_oof_dict[label]
    n_arrays = len(data_list)
    if n_arrays == 0:
        return None
    if n_arrays == 1:
        return data_list[0]
    # A single concatenate call copies each array once; growing the result
    # inside a loop re-copies the accumulated columns on every iteration.
    return np.concatenate([data_list[i] for i in range(n_arrays)], axis=1)

In [4]:
# Load the previously saved results repo from disk.
# IMPORTANT: keep load_from_file=True, otherwise the saved repo will be overwritten.
base_layer_results_repo = BaseLayerResultsRepo(load_from_file=True, filepath='obj/WithPreprocessedFile/')

load from file


In [5]:
# List the score stored for each result in the repo; the return value is kept in `scores`.
scores = base_layer_results_repo.show_scores()

0.9888	ModelName.NBLOGREG_tfidf_word_(1, 1)_30000_1_1.0
0.9777	ModelName.LOGREG_tfidf_word_(1, 1)_30000_1_1.0
0.9666	ModelName.LOGREG_PERLABEL_tfidf_word_(1, 1)_30000_1_1.0


# Now we construct a logreg model and a LightGBM model at layer 2, each using a different set of layer1 model_data

In [6]:
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier

# Layer-1 results hand-picked as inputs for the LightGBM stacker.
selected = ['ModelName.NBLOGREG_tfidf_word_(1, 1)_30000_1_1.0',
            'ModelName.LOGREG_PERLABEL_tfidf_word_(1, 1)_30000_1_1.0']

# One wrapped estimator class per layer-2 model.
model_pool = {
    ModelName.LOGREG: SklearnBLE(LogisticRegression),
    ModelName.LGB: SklearnBLE(LGBMClassifier),
}

# Each layer-2 model gets its own selection of layer-1 results.
# NOTE(review): threshold=0.95 presumably keeps results scoring at least 0.95 -
# confirm against BaseLayerResultsRepo.get_results.
layer2_inputs = {
    ModelName.LOGREG: base_layer_results_repo.get_results(threshold=0.95),
    ModelName.LGB: base_layer_results_repo.get_results(chosen_ones=selected),
}

In [7]:
# Display the layer-2 model pool (ModelName -> SklearnBLE wrapper).
model_pool

{<ModelName.LGB: 5>: <base_layer_utils.SklearnBLE at 0x7f09b5b69828>,
 <ModelName.LOGREG: 9>: <base_layer_utils.SklearnBLE at 0x7f09aed825f8>}

In [8]:
# `gc` is already imported in the first cell, so it is not re-imported here.
# Run a collection before computing the layer-2 OOF predictions.
gc.collect()

0

In [9]:
# Fit every layer-2 model in model_pool on its selected layer-1 results.
(layer2_est_preds,
 layer2_oof_train,
 layer2_oof_test,
 layer2_model_data_list) = compute_layer2_oof(
    model_pool, layer2_inputs, train, label_cols,
    4, 1001,  # NOTE(review): presumably fold count and random seed - confirm against compute_layer2_oof
)

Generating Layer2 model ModelName.LOGREG OOF
Generating Layer2 model ModelName.LGB OOF


### Sanity-check the layer 2 model_data before adding it to a repo

In [10]:
len(layer2_oof_train['toxic'])  # number of layer-2 model_data entries just created for 'toxic'

2

In [11]:
# Shape of the first layer-2 OOF train array stored for the 'toxic' label.
layer2_oof_train['toxic'][0].shape

(159571, 1)

In [14]:
list(layer2_est_preds)  # names of the layer-2 model_data entries just created

['ModelName.LGB_layer2', 'ModelName.LOGREG_layer2']

In [15]:
# Shape of the first layer-2 prediction array, without building a key list first.
next(iter(layer2_est_preds.values())).shape

(153164, 6)

### NOTE: you can add the layer2 model_data to the base repo, or create a separate data repo and save it there.
### Here we save it to the base repo, because at layer 3 you might want to build a model using model_data from both layer1 and layer2.

In [16]:
# Add the layer-2 OOF train/test arrays, test predictions and model data to the base repo.
base_layer_results_repo.add(layer2_oof_train, layer2_oof_test, layer2_est_preds, layer2_model_data_list)

In [17]:
# List the scores again; binding the return value to `_` keeps it from being echoed as cell output.
_ = base_layer_results_repo.show_scores()

0.9888	ModelName.NBLOGREG_tfidf_word_(1, 1)_30000_1_1.0
0.9777	ModelName.LOGREG_tfidf_word_(1, 1)_30000_1_1.0
0.9666	ModelName.LOGREG_PERLABEL_tfidf_word_(1, 1)_30000_1_1.0
0	ModelName.LGB_layer2
0	ModelName.LOGREG_layer2


In [18]:
# Placeholder scores for the two layer-2 entries - fake values, not real metrics.
for model_key, fake_score in (('ModelName.LGB_layer2', 0.09911),
                              ('ModelName.LOGREG_layer2', 0.09922)):
    base_layer_results_repo.add_score(model_key, fake_score)

ModelName.LGB_layer2 already existed in the repo. score: 0 update to 0.09911
ModelName.LOGREG_layer2 already existed in the repo. score: 0 update to 0.09922


In [19]:
# List the scores once more to confirm the placeholder layer-2 scores were recorded.
_ = base_layer_results_repo.show_scores()

0.9888	ModelName.NBLOGREG_tfidf_word_(1, 1)_30000_1_1.0
0.9777	ModelName.LOGREG_tfidf_word_(1, 1)_30000_1_1.0
0.9666	ModelName.LOGREG_PERLABEL_tfidf_word_(1, 1)_30000_1_1.0
0.09922	ModelName.LOGREG_layer2
0.09911	ModelName.LGB_layer2


In [20]:
# Save the repo, which now also holds the layer-2 entries and their scores.
base_layer_results_repo.save()

In [21]:
def write_predictions_to_file(base_layer_est_preds):
    """
    Write one submission CSV per entry of ``base_layer_est_preds``.

    Each file starts from ``sample_submission.csv`` under ``PATH``, has its
    ``label_cols`` columns replaced by the entry's predictions, and is saved
    as ``./BaseEstPreds/<key>_<unix timestamp>.csv``. The timestamp used for
    each file is printed as it is written.

    Parameters
    ----------
    base_layer_est_preds : dict
        Maps a name (used in the output file name) to predictions that can be
        assigned to the ``label_cols`` columns of the sample submission.

    Returns
    -------
    None
    """
    import os  # local import: only this function needs it

    if not base_layer_est_preds:
        # Nothing to write, so the template is not read either.
        return

    out_dir = './BaseEstPreds/'
    # to_csv does not create missing directories, so make sure the target exists.
    os.makedirs(out_dir, exist_ok=True)

    # The template is the same for every entry: read it once, copy it per file.
    template = pd.read_csv(PATH + 'sample_submission.csv')
    for key in base_layer_est_preds:
        submission = template.copy()
        submission[label_cols] = base_layer_est_preds[key]
        sub_id = int(time.time())
        print(sub_id)
        submission.to_csv(out_dir + key + '_' + str(sub_id) + '.csv', index=False)

In [22]:
# Write one submission file per layer-2 model.
write_predictions_to_file(layer2_est_preds)

1522100333
1522100334


# Now build a logreg at stacknet layer 3

In [23]:
# Names of the layer-2 results that feed the layer-3 model.
selected_for_layer3 = ['ModelName.LOGREG_layer2',
                       'ModelName.LGB_layer2']

In [24]:
from sklearn.linear_model import LogisticRegression

# A single logistic-regression stacker at layer 3.
layer3_model_pool = {ModelName.LOGREG: SklearnBLE(LogisticRegression)}

# The layer-2 results were added to base_layer_results_repo, so they are fetched from it.
layer3_inputs = {
    ModelName.LOGREG: base_layer_results_repo.get_results(chosen_ones=selected_for_layer3),
}

In [25]:
# compute_layer2_oof is reused for layer 3, this time fed with the selected layer-2 results.
(layer3_est_preds,
 layer3_oof_train,
 layer3_oof_test,
 layer3_model_data_list) = compute_layer2_oof(
    layer3_model_pool, layer3_inputs, train, label_cols,
    4, 1001,  # NOTE(review): presumably fold count and random seed - confirm against compute_layer2_oof
)

Generating Layer2 model ModelName.LOGREG OOF


# if layer3 is the last layer, then just use write_predictions_to_file to convert the layer3_est_preds to a prediction file and submit it, like this:

In [26]:
# Write one submission file per layer-3 model.
write_predictions_to_file(layer3_est_preds)

1522101283


# Otherwise, you have layer3_oof_train and layer3_oof_test available for stacking even higher layers

# GOOD LUCK!!!!!

In [28]:
# Sign-off message marking the end of the walkthrough.
print('good luck')

good luck
