## Binary structure classification used in tree building

1. Create train and test sets; save the negative samples of file ``filename.rs3`` as ``filename.json.neg``.
2. Train models, save the best one.

Output:
 - ``data/*.neg``
 - ``models/structure_predictor/*``

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import glob
import pandas as pd
import pickle
from utils.file_reading import read_edus, read_gold, read_negative, read_annotation

In [None]:
class RandomNegativeGenerator(object):
    """Produce negative (unrelated) snippet pairs for a single annotated document.

    Candidates are adjacent EDUs, gold snippets paired with the EDU next to them,
    and adjacent gold snippets. Any pair that occurs in the gold corpus (in either
    order) is excluded.
    """

    def __call__(self, edus, corpus, annot_text, filename=None):
        """Build the table of negative samples.

        :param list edus: EDU strings of the document (adjacent items are treated as neighbours)
        :param pandas.DataFrame corpus: gold pairs with ``snippet_x`` / ``snippet_y`` columns
        :param str annot_text: full text of the document
        :param str filename: value for the ``filename`` column; when omitted, the
            module-level variable ``filename`` is used (legacy behaviour)
        :return: DataFrame with columns filename, snippet_x, snippet_y, relation, loc_x, loc_y
        :raises NameError: if ``filename`` is neither passed nor defined at module level
        """
        if filename is None:
            # Historically this method silently read the notebook's global loop
            # variable; keep that working for existing callers, but fail clearly.
            try:
                filename = globals()['filename']
            except KeyError:
                raise NameError(
                    "name 'filename' is not defined; pass it via the `filename` argument"
                ) from None

        new_set = self.create_training_set(edus, corpus)
        result = [(filename, left, right, relation) for left, right, relation in new_set]

        tmp = pd.DataFrame(result, columns=['filename', 'snippet_x', 'snippet_y', 'relation'])

        def place_locations(row):
            # loc_x is the offset of snippet_x in the text (-1 when not found);
            # loc_y is counted from the end of snippet_x, not from the text start.
            row['loc_x'] = annot_text.find(row.snippet_x)
            row['loc_y'] = annot_text[row['loc_x'] + len(row.snippet_x):].find(row.snippet_y)
            return row

        return tmp.apply(place_locations, axis=1)

    def __name__(self):
        """Return the generator's name; callers compare it to choose a processing branch."""
        return 'RandomNegativeGenerator'

    def create_training_set(self, edus, gold):
        """Collect unique ``(left, right, False)`` tuples for pairs absent from ``gold``.

        :param list edus: EDU strings of the document
        :param pandas.DataFrame gold: gold pairs with ``snippet_x`` / ``snippet_y`` columns
        :return: list of unique tuples (order is not defined, it comes from a set)
        """
        training_set = []

        # Remember, for every gold pair whose both sides map onto EDUs, which EDU
        # positions each side covers.
        snippet_cache = []
        for e in gold.index:
            snippet_x = gold.loc[e, 'snippet_x']
            cache_x = self.extract_snippet_ids(snippet_x, edus)

            snippet_y = gold.loc[e, 'snippet_y']
            cache_y = self.extract_snippet_ids(snippet_y, edus)

            if cache_x and cache_y:
                snippet_cache.append((cache_x, snippet_x))
                snippet_cache.append((cache_y, snippet_y))

        # (1) neighbouring EDUs that are not a gold pair
        for i in range(len(edus) - 1):
            if not self.check_snippet_pair_in_dataset(gold, edus[i], edus[i + 1]):
                training_set.append((edus[i], edus[i + 1], False))

        # (2) every gold snippet against the EDU right before / right after it
        for i in gold.index:
            training_set += self.extract_negative_samples_for_snippet(gold, edus, gold.loc[i, 'snippet_x'])
            training_set += self.extract_negative_samples_for_snippet(gold, edus, gold.loc[i, 'snippet_y'])

        # (3) gold snippets that touch each other (last EDU of one + 1 == first EDU of the other)
        for i, (cache_i, snippet_i) in enumerate(snippet_cache):
            for cache_j, snippet_j in snippet_cache[i:]:
                if cache_i[-1] + 1 == cache_j[0]:
                    if not self.check_snippet_pair_in_dataset(gold, snippet_i, snippet_j):
                        training_set.append((snippet_i, snippet_j, False))

                if cache_j[-1] + 1 == cache_i[0]:
                    if not self.check_snippet_pair_in_dataset(gold, snippet_j, snippet_i):
                        training_set.append((snippet_j, snippet_i, False))

        return list(set(training_set))

    def extract_snippet_ids(self, snippet, edus):
        """Return positions of the EDUs whose text is a substring of ``snippet``."""
        return [edu_nm for edu_nm, edu in enumerate(edus) if (edu in snippet)]

    def check_snippet_pair_in_dataset(self, dataset, snippet_left, snippet_right):
        """Return True if the pair occurs in ``dataset`` in either orientation."""
        as_given = (dataset.snippet_x == snippet_left) & (dataset.snippet_y == snippet_right)
        swapped = (dataset.snippet_y == snippet_left) & (dataset.snippet_x == snippet_right)
        return bool(as_given.any() or swapped.any())

    def extract_negative_samples_for_snippet(self, gold, edus, snippet):
        """Pair ``snippet`` with the EDU just before and just after it, skipping gold pairs.

        :return: list of ``(left, right, False)`` tuples; empty if the snippet matches no EDU
        """
        training_set = []

        snippet_ids = self.extract_snippet_ids(snippet, edus)

        if not snippet_ids:
            return []

        if snippet_ids[0] > 0:
            previous_edu = edus[snippet_ids[0] - 1]
            if not self.check_snippet_pair_in_dataset(gold, snippet, previous_edu):
                training_set.append((previous_edu, snippet, False))

        if snippet_ids[-1] < len(edus) - 1:
            next_edu = edus[snippet_ids[-1] + 1]
            if not self.check_snippet_pair_in_dataset(gold, snippet, next_edu):
                training_set.append((snippet, next_edu, False))

        return training_set

In [None]:
import pandas as pd


class RSTTreePredictor:
    """Bundle of a features processor, a relation predictor and an optional label predictor."""

    def __init__(self, features_processor, relation_predictor, label_predictor):
        """
        :param features_processor: object stored as-is for use by subclasses/callers
        :param relation_predictor: object stored as-is for use by subclasses/callers
        :param label_predictor: fitted classifier exposing ``classes_`` and ``predict``,
            or a falsy value when no labelling model is available
        """
        self.features_processor = features_processor
        self.relation_predictor = relation_predictor
        self.label_predictor = label_predictor
        # Always define `labels` so that reading it never raises AttributeError
        # when the predictor is absent.
        self.labels = self.label_predictor.classes_ if self.label_predictor else None
        self.genre = None

    def predict_label(self, features):
        """Return the label predictor's output, or ``'relation'`` when there is no predictor."""
        if not self.label_predictor:
            return 'relation'

        return self.label_predictor.predict(features)


class GoldTreePredictor(RSTTreePredictor):
    """Oracle predictor that answers every query by looking the pair up in a gold corpus."""

    def __init__(self, corpus):
        """
        :param pandas.DataFrame corpus: gold pairs; the methods below read the columns
            ``snippet_x``, ``snippet_y``, ``category_id`` and ``order``
        """
        RSTTreePredictor.__init__(self, None, None, None)
        self.corpus = corpus

    def extract_features(self, *args):
        """Return ``[left_text, right_text]`` for the two nodes passed as the first two arguments."""
        left_node, right_node = args[0], args[1]
        return [left_node.text, right_node.text]

    def initialize_features(self, *args):
        """Return ``(text, next_text)`` tuples for every adjacent pair in the node list ``args[0]``."""
        nodes = args[0]
        return [(current.text, following.text) for current, following in zip(nodes, nodes[1:])]

    def predict_pair_proba(self, features):
        """Return 1.0 if the pair is in the corpus in either orientation, else 0.0."""
        left_snippet, right_snippet = features
        x_col, y_col = self.corpus.snippet_x, self.corpus.snippet_y

        as_given = ((x_col == left_snippet) & (y_col == right_snippet)).sum(axis=0) != 0
        swapped = ((y_col == left_snippet) & (x_col == right_snippet)).sum(axis=0) != 0
        return float(as_given or swapped)

    def predict_label(self, features):
        """Return the ``category_id`` of the first (left, right) match, or ``'no_relation'``."""
        left_snippet, right_snippet = features
        same_order = (self.corpus.snippet_x == left_snippet) & (self.corpus.snippet_y == right_snippet)
        found = self.corpus[same_order].category_id.values
        return found[0] if found.size != 0 else 'no_relation'

    def predict_nuclearity(self, features):
        """Return the ``order`` value of the first (left, right) match, or ``'_'``."""
        left_snippet, right_snippet = features
        same_order = (self.corpus.snippet_x == left_snippet) & (self.corpus.snippet_y == right_snippet)
        found = self.corpus[same_order].order.values
        return found[0] if found.size != 0 else '_'

In [None]:
class DiscourseUnit:
    """Node of a discourse tree: either a leaf span of the text or a merge of two units."""

    def __init__(self, id, left=None, right=None, text='', start=None, end=None,
                 orig_text=None, relation=None, nuclearity=None, proba=1.):
        """
        :param int id:
        :param DiscourseUnit left:
        :param DiscourseUnit right:
        :param str text: (optional) text of a leaf; used only when ``left`` is not
            given and ``orig_text`` is None
        :param int start: start position in original text
        :param int end: end position in original text
        :param str orig_text: (optional) original text; a leaf takes ``orig_text[start:end]``
        :param string relation: {the relation between left and right components | 'elementary' | 'root'}
        :param string nuclearity: {'NS' | 'SN' | 'NN'}
        :param float proba: predicted probability of the relation occurrence
            (stored on the instance as a string)
        """
        self.id = id
        self.left = left
        self.right = right
        self.relation = relation
        self.nuclearity = nuclearity
        self.proba = str(proba)
        self.start = start
        self.end = end

        if self.left:
            # A merged unit spans from the start of its left child to the end of
            # its right child; its text is the children's texts joined by a space.
            self.start = left.start
            self.end = right.end
            self.text = ' '.join([self.left.text, self.right.text])
        elif orig_text is not None:
            self.text = orig_text[self.start:self.end].strip()
        else:
            # Without the original text there is nothing to slice: fall back to the
            # explicitly supplied text instead of failing on None[start:end].
            self.text = text.strip()

    def __str__(self):
        return f"id: {self.id}\ntext: {self.text}\nrelation: {self.relation}\nleft: {self.left.text if self.left else None}\nright: {self.right.text if self.right else None}\nstart: {self.start}\nend: {self.end}"


In [None]:
import sys
import numpy as np


class GreedyNegativeGenerator:
    """ Inversed greedy parser based on gold tree predictor.

    Adjacent nodes are merged greedily using scores from a GoldTreePredictor; the
    pairs that appear next to a freshly merged node and score 0 are collected as
    negative samples.
    """
    def __init__(self):
        # Merging continues only while some adjacent pair scores above this value.
        self.forest_threshold = 0.01
    
    def __call__(self, edus, corpus, annot_text):
        """
        :param list edus: DiscourseUnit leaves; ``id``, ``start``, ``end`` and ``text`` are read
        :param corpus: gold pairs, passed to GoldTreePredictor
        :param str annot_text: full text of the document, sliced by node offsets
        :return: list of DiscourseUnit objects built for zero-score pairs
        """
        def to_merge(scores):
            # position of the best-scoring adjacent pair
            return np.argmax(np.array(scores))
        
        negative_nodes = []
        
        self.tree_predictor = GoldTreePredictor(corpus)
        nodes = edus        
        max_id = edus[-1].id

        # initialize scores
        features = self.tree_predictor.initialize_features(nodes)
        scores = list(map(self.tree_predictor.predict_pair_proba, features))
        # NOTE(review): `relations` and `nuclearities` are not read again in this method.
        relations = list(map(self.tree_predictor.predict_label, features))
        nuclearities = list(map(self.tree_predictor.predict_nuclearity, features))

        while len(nodes) > 2 and any([score > self.forest_threshold for score in scores]):
            # select two nodes to merge
            j = to_merge(scores)  # position of the pair in list
            
            # make the new node by merging node[j] + node[j+1]
            # NOTE(review): DiscourseUnit as defined in this file builds the text of a
            # merged node from its children, so the `text=` argument appears unused.
            temp = DiscourseUnit(
                id=max_id + 1,
                left=nodes[j],
                right=nodes[j + 1],
                relation=self.tree_predictor.predict_label(features[j]),
                nuclearity=self.tree_predictor.predict_nuclearity(features[j]),
                proba=scores[j],
                text=annot_text[nodes[j].start:nodes[j + 1].end].strip()
            )
            
            max_id += 1

            # modify the node list
            nodes = nodes[:j] + [temp] + nodes[j + 2:]

            # modify the scores list
            if j == 0:
                # the merged node is first in the list: only a right neighbour to re-score
                features_right = self.tree_predictor.extract_features(nodes[j], nodes[j + 1])
                predicted = self.tree_predictor.predict_pair_proba(features_right)

                scores = [predicted] + scores[j + 2:]
                features = [features_right] + features[j + 2:]
                
                if predicted == 0:
                    relation = self.tree_predictor.predict_label(features_right)
                    # NOTE(review): GoldTreePredictor.predict_label (as defined in this file)
                    # returns a corpus category_id or 'no_relation', so this guard presumably
                    # only passes if the corpus has a 'relation' category — confirm whether
                    # 'no_relation' was intended; the other branches below have no such guard.
                    if relation == 'relation':
                        negative_nodes.append(
                            DiscourseUnit(
                                id=None,
                                left=nodes[j],
                                right=nodes[j + 1],
                                relation=relation,
                                nuclearity=self.tree_predictor.predict_nuclearity(features_right),
                                proba=predicted,
                                text=annot_text[nodes[j].start:nodes[j + 1].end].strip()
                        ))

            elif j + 1 < len(nodes):
                # the merged node has neighbours on both sides: re-score both pairs
                features_left = self.tree_predictor.extract_features(nodes[j - 1], nodes[j])
                predicted_left = self.tree_predictor.predict_pair_proba(features_left)
                if predicted_left == 0:
                    relation = self.tree_predictor.predict_label(features_left)
                    # NOTE(review): same 'relation' guard as in the j == 0 branch — confirm intent.
                    if relation == 'relation':
                        negative_nodes.append(
                            DiscourseUnit(
                                id=None,
                                left=nodes[j - 1],
                                right=nodes[j],
                                relation=relation,
                                nuclearity=self.tree_predictor.predict_nuclearity(features_left),
                                proba=predicted_left,
                                text=annot_text[nodes[j - 1].start:nodes[j].end].strip()
                        ))

                features_right = self.tree_predictor.extract_features(nodes[j], nodes[j + 1])
                predicted_right = self.tree_predictor.predict_pair_proba(features_right)
                if predicted_right == 0:
                    negative_nodes.append(
                        DiscourseUnit(
                            id=None,
                            left=nodes[j],
                            right=nodes[j + 1],
                            relation=self.tree_predictor.predict_label(features_right),
                            nuclearity=self.tree_predictor.predict_nuclearity(features_right),
                            proba=predicted_right,
                            text=annot_text[nodes[j].start:nodes[j + 1].end].strip()
                    ))

                # the two old entries around position j are replaced by the two new scores
                scores = scores[:j - 1] + [predicted_left] + [predicted_right] + scores[j + 2:]
                features = features[:j - 1] + [features_left] + [features_right] + features[j + 2:]

            else:
                # the merged node is last in the list: only a left neighbour to re-score
                features_left = self.tree_predictor.extract_features(nodes[j - 1], nodes[j])
                predicted = self.tree_predictor.predict_pair_proba(features_left)
                if predicted == 0:
                    negative_nodes.append(
                        DiscourseUnit(
                            id=None,
                            left=nodes[j - 1],
                            right=nodes[j],
                            relation=self.tree_predictor.predict_label(features_left),
                            nuclearity=self.tree_predictor.predict_nuclearity(features_left),
                            proba=predicted,
                            text=annot_text[nodes[j - 1].start:nodes[j].end].strip()
                    ))
                    
                scores = scores[:j - 1] + [predicted]
                features = features[:j - 1] + [features_left]

        # NOTE(review): the root built here only rebinds the local `nodes`; it is not
        # part of the returned value.
        if len(scores) == 1 and scores[0] > self.forest_threshold:
            root = DiscourseUnit(
                id=max_id + 1,
                left=nodes[0],
                right=nodes[1],
                relation='root',
                proba=scores[0]
            )
            nodes = [root]

        return negative_nodes
    
    def __name__(self):
        """Return the generator's name; the sampling cells compare it to choose a branch."""
        return 'GreedyNegativeGenerator'

### Make negative samples, save them

In [None]:
from tqdm import tqdm_notebook as tqdm
from utils.evaluation import extr_pairs, extr_pairs_forest

# Pick the negative sampling strategy; swap the comment to use the greedy generator.
gen = RandomNegativeGenerator()
#gen = GreedyNegativeGenerator()

generator_name = gen.__name__()

for filename in tqdm(glob.glob('./data/*.json')):
    # The stripped path stays in the global `filename`, which
    # RandomNegativeGenerator.__call__ reads when it fills its `filename` column.
    filename = filename.replace('.json', '')
    df = read_gold(filename)
    edus = read_edus(filename)
    annot = read_annotation(filename)
    text = annot['text']

    if generator_name == 'RandomNegativeGenerator':
        tmp = gen(edus, df, text)

    elif generator_name == 'GreedyNegativeGenerator':
        # Wrap every EDU string into a leaf DiscourseUnit, searching for each EDU
        # from the end of the previous one.
        _edus = []
        last_end = 0
        for edu_id, edu in enumerate(edus):
            start = len(text[:last_end]) + text[last_end:].find(edu)
            end = start + len(edu)
            _edus.append(DiscourseUnit(
                id=edu_id,
                left=None,
                right=None,
                relation='edu',
                start=start,
                end=end,
                orig_text=text,
                proba=1.
            ))
            last_end = end

        pairs = extr_pairs_forest(gen(_edus, df, text))
        tmp = pd.DataFrame(pairs, columns=['snippet_x', 'snippet_y', 'category_id'])
        tmp = tmp[tmp.category_id == 'no_relation']

    tmp.to_json(filename + '.json.neg')

In [None]:
from tqdm import tqdm_notebook as tqdm
from utils.evaluation import extr_pairs, extr_pairs_forest

# Same procedure as for the whole corpus, restricted to the news1_16* files.
gen = RandomNegativeGenerator()
#gen = GreedyNegativeGenerator()

generator_name = gen.__name__()

for filename in tqdm(glob.glob('./data/news1_16*.json')):
    # The stripped path stays in the global `filename`, which
    # RandomNegativeGenerator.__call__ reads when it fills its `filename` column.
    filename = filename.replace('.json', '')
    df = read_gold(filename)
    edus = read_edus(filename)
    annot = read_annotation(filename)
    text = annot['text']

    if generator_name == 'RandomNegativeGenerator':
        tmp = gen(edus, df, text)

    elif generator_name == 'GreedyNegativeGenerator':
        # Turn the EDU strings into leaf DiscourseUnit objects with text offsets;
        # each EDU is searched for starting at the end of the previous one.
        _edus = []
        last_end = 0
        for edu_id, edu in enumerate(edus):
            start = len(text[:last_end]) + text[last_end:].find(edu)
            end = start + len(edu)
            leaf = DiscourseUnit(
                id=edu_id,
                left=None,
                right=None,
                relation='edu',
                start=start,
                end=end,
                orig_text=text,
                proba=1.
            )
            _edus.append(leaf)
            last_end = end

        forest = gen(_edus, df, text)
        tmp = pd.DataFrame(extr_pairs_forest(forest), columns=['snippet_x', 'snippet_y', 'category_id'])
        tmp = tmp[tmp.category_id == 'no_relation']

    tmp.to_json(filename + '.json.neg')

### Make a directory

In [None]:
import os

# Directory that receives the pickled encoders, scaler and model.
model_path = 'models/structure_predictor'
# Unlike the `! mkdir` shell escape, this also creates missing parent directories,
# does not fail when the directory already exists, and works on any platform.
os.makedirs(model_path, exist_ok=True)

### Extract features

In [None]:
%%time
from utils.features_processor_default import FeaturesProcessor

features_processor = FeaturesProcessor(model_dir_path='models', verbose=False)

#### Try on the sample

In [None]:
import json

# Smoke test: run the features processor on the negative samples of one document.
filename = 'data/news1_16'
edus = read_edus(filename)
gold = read_gold(filename)
annot = read_annotation(filename)
negatives = read_negative(filename)
# loc_y is removed before the samples are handed to the features processor
negatives = negatives.drop(columns=['loc_y'])

# The annotation layers are passed positionally, in this order.
%time result = features_processor(negatives, \
                            annot['text'],\
                            annot['tokens'],\
                            annot['sentences'],\
                            annot['lemma'],\
                            annot['morph'],\
                            annot['postag'],\
                            annot['syntax_dep_tree'])

#### Extract features from negative examples

In [None]:
! rm data/news2_6.json.neg

In [None]:
# Annotation layers handed to the features processor, in positional order.
annotation_layers = ('text', 'tokens', 'sentences', 'lemma', 'morph', 'postag', 'syntax_dep_tree')

for filename in tqdm(glob.glob("data/*.json.neg")):
    filename = filename.replace('.json.neg', '')

    # Load the negatives without loc_y and keep only rows where both snippets are non-empty.
    df = read_negative(filename).drop(columns=['loc_y'])
    both_non_empty = (df.snippet_x.str.len() > 0) & (df.snippet_y.str.len() > 0)
    df = df[both_non_empty]
    annotation = read_annotation(filename)

    try:
        result = features_processor(df, *(annotation[layer] for layer in annotation_layers))
        result.to_pickle(filename + '.neg.features')
    except IndexError:
        # Report the file and move on to the next one.
        print('INDEX ERROR ::: FILENAME :::', filename)

#### Make train set 

In [None]:
from utils.train_test_split import split_data

train, test = split_data('./data')

In [None]:
random_state = 42

# Gold pairs are labelled 1 and generated negatives 0; per file the gold frame
# is added first, then the negative one.
labelled_frames = []
for file in tqdm(train):
    base_name = file.replace('.edus', '')
    for reader, label in ((read_gold, 1), (read_negative, 0)):
        frame = reader(base_name, features=True)
        frame['relation'] = label
        labelled_frames.append(frame)

# Shuffle all rows reproducibly and renumber them.
shuffled = pd.concat(labelled_frames).sample(frac=1, random_state=random_state)
train_samples = shuffled.reset_index(drop=True)

In [None]:
train_samples.shape

In [None]:
train_samples.relation.value_counts()

In [None]:
# Columns holding a single distinct value in the training data.
constants = [c for c in train_samples.columns if len(set(train_samples[c])) == 1]
# Textual / bookkeeping columns that are not model inputs.
to_drop = ['snippet_x', 'snippet_y', 'category_id', 'snippet_x_tmp', 'snippet_y_tmp', 'filename', 'order', 'postags_x', 'postags_y']
train_samples = train_samples.drop(columns=constants)
# Store the full list of dropped columns; the context manager guarantees the
# file is flushed and closed.
with open(os.path.join(model_path, 'drop_columns.pkl'), 'wb') as f:
    pickle.dump(constants + to_drop, f)

In [None]:
# Separate the binary target from the feature columns.
TARGET = 'relation'
y_train = train_samples[TARGET].to_frame()
features_only = train_samples.drop(columns=[TARGET])
X_train = features_only.drop(columns=to_drop)

In [None]:
X_train.shape

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Columns with object dtype are treated as categorical.
categorical_feature_mask = X_train.dtypes==object
categorical_cols = X_train.columns[categorical_feature_mask].tolist()

# NOTE(review): one LabelEncoder instance is re-fitted on every column, so after
# this line `le` presumably keeps only the classes of the last column — confirm
# that whatever loads label_encoder.pkl expects that.
le = LabelEncoder()
X_train[categorical_cols] = X_train[categorical_cols].apply(lambda col: le.fit_transform(col))

# One-hot encode the integer codes; unknown categories are ignored at transform time.
# NOTE(review): `sparse=` and `get_feature_names` depend on the installed
# scikit-learn version — verify against the pinned version.
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
X_ohe = ohe.fit_transform(X_train[categorical_cols].values)
X_ohe = pd.DataFrame(X_ohe, X_train.index, columns=ohe.get_feature_names(categorical_cols))

# Attach the one-hot columns (prefixed with 'cat_') and remove the original ones.
# NOTE(review): no matching encoding of X_test is visible in this notebook — confirm.
X_train = X_train.join(
   pd.DataFrame(X_ohe, X_train.index).add_prefix('cat_'), how='right'
).drop(columns=categorical_cols)

X_train.shape

# Persist the categorical column list and both encoders; each file is opened in a
# context manager so it is flushed and closed right after writing.
for artifact, pkl_name in ((categorical_cols, 'categorical_cols.pkl'),
                           (le, 'label_encoder.pkl'),
                           (ohe, 'one_hot_encoder.pkl')):
    with open(os.path.join(model_path, pkl_name), 'wb') as f:
        pickle.dump(artifact, f)

#### Make test set

In [None]:
random_state = 42

# Same labelling as for the train set: gold pairs get 1, generated negatives get 0.
labelled_frames = []
for file in tqdm(test):
    base_name = file.replace('.edus', '')
    for reader, label in ((read_gold, 1), (read_negative, 0)):
        frame = reader(base_name, features=True)
        frame['relation'] = label
        labelled_frames.append(frame)

# Shuffle all rows reproducibly and renumber them.
shuffled = pd.concat(labelled_frames).sample(frac=1, random_state=random_state)
test_samples = shuffled.reset_index(drop=True)

In [None]:
# Separate the target and remove the same columns that were removed from the train set.
TARGET = 'relation'
y_test = test_samples[TARGET].to_frame()
columns_to_remove = to_drop + ['category_id'] + constants
X_test = test_samples.drop(columns=[TARGET]).drop(columns=columns_to_remove)

In [None]:
X_train.shape, X_test.shape

### Classifiers training 

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler, RobustScaler

# Fit the scaler on the training features only, then apply it to both sets.
std_scaler = MinMaxScaler()
std_scaler.fit(X_train.values)

scaled_train = std_scaler.transform(X_train.values)
X_train = pd.DataFrame(scaled_train, index=X_train.index, columns=X_train.columns)

scaled_test = std_scaler.transform(X_test.values)
X_test = pd.DataFrame(scaled_test, index=X_test.index, columns=X_test.columns)

# Keep the fitted scaler next to the model.
scaler_path = os.path.join(model_path, 'scaler.pkl')
with open(scaler_path, 'wb') as scaler_file:
    pickle.dump(std_scaler, scaler_file)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline


# Class-balanced logistic regression baseline.
logreg_params = dict(solver='lbfgs', C=0.0005, n_jobs=4, class_weight='balanced', random_state=random_state)
model = LogisticRegression(**logreg_params)
model.fit(X_train, y_train)

In [None]:
from sklearn import metrics


# Score the current `model` on the test set.
predicted = model.predict(X_test)
for tag, scorer in (('pr:', metrics.precision_score),
                    ('re:', metrics.recall_score),
                    ('f1:', metrics.f1_score)):
    print(tag, scorer(y_test, predicted))
print()
print(metrics.classification_report(y_test, predicted))

In [None]:
from sklearn.svm import LinearSVC

# Class-balanced linear SVM.
svc_params = dict(random_state=random_state, C=0.01, class_weight='balanced')
svc = LinearSVC(**svc_params)
svc.fit(X_train, y_train)

In [None]:
from sklearn import metrics


# Make the SVM the current `model` and score it on the test set.
model = svc
predicted = model.predict(X_test)
for tag, scorer in (('pr:', metrics.precision_score),
                    ('re:', metrics.recall_score),
                    ('f1:', metrics.f1_score)):
    print(tag, scorer(y_test, predicted))
print()
print(metrics.classification_report(y_test, predicted))

In [None]:
# Persist the SVM and the scaler; context managers make sure both files are
# flushed and closed.
with open(os.path.join(model_path, 'model.pkl'), 'wb') as f:
    pickle.dump(svc, f)
with open(os.path.join(model_path, 'scaler.pkl'), 'wb') as f:
    pickle.dump(std_scaler, f)

In [None]:
# Imported here as well: the only other import of `lgb` sits in a later cell, so a
# top-to-bottom run would otherwise stop with a NameError.
import lightgbm.sklearn as lgb

# LightGBM settings for the binary "relation present" classifier.
lgbm_param_bin =  {
    'tree_learner': 'feature',
    'task': 'train',
    'random_state': random_state,
    'metric': 'binary_logloss',
    'feature_fraction': 0.8,
    'boosting_type': 'dart',
    'application': 'binary',
    'num_iterations': 300,
    'max_depth' : 5,
    'is_unbalance' : True,
    'n_estimators' : 300,
    'colsample_bytree' : 0.8
}
model = lgb.LGBMClassifier(**lgbm_param_bin)
model.fit(X_train, y_train)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel

# Imported here as well: the only other import of `lgb` sits in a later cell, so a
# top-to-bottom run would otherwise stop with a NameError.
import lightgbm.sklearn as lgb

# LightGBM settings for the binary "relation present" classifier.
lgbm_param_bin =  {
    'tree_learner': 'feature',
    'task': 'train',
    'random_state': random_state,
    'metric': 'binary_logloss',
    'feature_fraction': 0.8,
    'boosting_type': 'dart',
    'application': 'binary',
    'num_iterations': 300,
    'max_depth' : 5,
    'is_unbalance' : True,
    'n_estimators' : 300,
    'colsample_bytree' : 0.8
}
classifier = lgb.LGBMClassifier(**lgbm_param_bin)
# L1-penalised logistic regression selects the features passed on to LightGBM.
feature_selector = SelectFromModel(LogisticRegression(class_weight='balanced', solver='liblinear', penalty='l1'))
model = Pipeline([('feature_selector', feature_selector), 
                   ('classifier', classifier)])
model.fit(X_train, y_train)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
import lightgbm.sklearn as lgb

random_state = 41

# LightGBM settings for the larger (600 iterations, depth 6) variant.
lgbm_param_bin = dict(
    tree_learner='feature',
    task='train',
    random_state=random_state,
    metric='binary_logloss',
    feature_fraction=0.8,
    boosting_type='dart',
    application='binary',
    num_iterations=600,
    max_depth=6,
    is_unbalance=True,
    n_estimators=600,
    colsample_bytree=0.8,
)
classifier = lgb.LGBMClassifier(**lgbm_param_bin)

# Single pipeline: L1-based feature selection followed by the LightGBM classifier.
selector_estimator = LogisticRegression(class_weight='balanced', solver='liblinear', penalty='l1')
feature_selector = SelectFromModel(selector_estimator)
pipeline_steps = [('feature_selector', feature_selector), ('classifier', classifier)]
model_single = Pipeline(pipeline_steps)

# Bag three copies of the pipeline over bootstrap samples and 80% of the features.
bagging_params = dict(
    base_estimator=model_single,
    n_estimators=3,
    max_samples=1.0,
    max_features=0.8,
    bootstrap=True,
    random_state=random_state,
)
model = BaggingClassifier(**bagging_params)


#model.fit(X_train, y_train)

In [None]:
from sklearn import metrics

# Fit the bare LightGBM classifier and print its report and confusion matrix.
classifier.fit(X_train, y_train)
predicted = classifier.predict(X_test)
for summary in (metrics.classification_report, metrics.confusion_matrix):
    print(summary(y_test, predicted))

In [None]:
import numpy as np

# Importances of the fitted classifier and the index order that sorts them ascending.
fi = np.array(classifier.feature_importances_)
sorted_idx = fi.argsort()
nonzero_total = np.count_nonzero(fi)
print(nonzero_total)

In [None]:
pd.set_option('display.max_rows', 150)
#start, finish = 0, 2000
# Apply the same descending order to names and values. Previously only the
# importances were reversed, so each name was paired with another feature's value.
descending_idx = sorted_idx[::-1]
dd = pd.DataFrame({'Feature': np.array(X_test.keys())[descending_idx], 'Importance': fi[descending_idx]})
dd = dd[dd['Importance'] > 0]

In [None]:
dd[dd.Feature.str[-2:] != '_y']

In [None]:
from sklearn import metrics

# Refit the LightGBM classifier and print its report and confusion matrix.
classifier.fit(X_train, y_train)
predicted = classifier.predict(X_test)
for summary in (metrics.classification_report, metrics.confusion_matrix):
    print(summary(y_test, predicted))

In [None]:
file_path = 'predictor_relation_presence_classifier.pkl'
with open(file_path, 'wb') as f:
    pickle.dump(classifier, f)

In [None]:
y_train.relation.value_counts()

In [None]:
from catboost import CatBoostClassifier, Pool

# CatBoost settings: shallow trees, many iterations, positive class up-weighted, GPU device 0.
catboost_params = dict(
    one_hot_max_size=5,
    learning_rate=.03,
    iterations=25000,
    scale_pos_weight=3.,
    depth=2,
    score_function='SolarL2',
    random_state=random_state,
    task_type='GPU',
    devices='0',
)
model = CatBoostClassifier(**catboost_params)

# The test set is passed as the evaluation pool; targets are passed as floats.
evaluation_pool = Pool(X_test, y_test.astype(float))
model.fit(X_train,
          y_train.astype(float),
          eval_set=evaluation_pool,
          verbose=False,
          plot=True)

In [None]:
# Score the current `model` on the test set.
predicted = model.predict(X_test)
for tag, scorer in (('pr:', metrics.precision_score),
                    ('re:', metrics.recall_score),
                    ('f1:', metrics.f1_score)):
    print(tag, scorer(y_test, predicted))
print()
print(metrics.classification_report(y_test, predicted))

In [None]:
probas = model.predict_proba(X_test)

In [None]:
probas

In [None]:
predicted[predicted == 0.].shape

In [None]:
predicted[predicted == 1.].shape

In [None]:
# Recompute the predictions of the current `model` and print the same metrics.
predicted = model.predict(X_test)
for tag, scorer in (('pr:', metrics.precision_score),
                    ('re:', metrics.recall_score),
                    ('f1:', metrics.f1_score)):
    print(tag, scorer(y_test, predicted))
print()
print(metrics.classification_report(y_test, predicted))

In [None]:
# Persist the final model and the scaler; context managers make sure both files
# are flushed and closed.
with open(os.path.join(model_path, 'model.pkl'), 'wb') as f:
    pickle.dump(model, f)
with open(os.path.join(model_path, 'scaler.pkl'), 'wb') as f:
    pickle.dump(std_scaler, f)