<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Adjusting-threshold" data-toc-modified-id="Adjusting-threshold-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Adjusting threshold</a></span><ul class="toc-item"><li><span><a href="#Random-Forests" data-toc-modified-id="Random-Forests-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Random Forests</a></span></li><li><span><a href="#Load-other-models-and-make-predictions" data-toc-modified-id="Load-other-models-and-make-predictions-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Load other models and make predictions</a></span></li><li><span><a href="#Adjust-the-decision-the-threshold" data-toc-modified-id="Adjust-the-decision-the-threshold-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Adjust the decision the threshold</a></span></li></ul></li><li><span><a href="#XGBoost" data-toc-modified-id="XGBoost-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>XGBoost</a></span></li></ul></div>

In [3]:
import pdb 
import glob
import copy
import math
import pickle

import numpy as np
import pandas as pd
import scipy as sp

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
 
import missingno  # for visualizing missing data

import xgboost as xgb

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, \
    GridSearchCV, ShuffleSplit

from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression, \
    LogisticRegressionCV, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix, classification_report, \
    precision_recall_curve, average_precision_score, f1_score, \
    roc_curve, auc, roc_auc_score, make_scorer,\
    accuracy_score, balanced_accuracy_score

from sklearn.externals import joblib
from sklearn.utils import resample
from sklearn.utils.fixes import signature


# Configure how pandas renders tables: total output width, number of
# columns shown before truncation, and HTML rendering in the notebook.
pd.options.display.width = 120
pd.options.display.max_columns = 100
pd.options.display.notebook_repr_html = True

# Set plotting options
sns.set() # Use seaborn defaults for plotting
# IPython line magic that renders matplotlib figures inline in the notebook.
# Keep comments on their own line: text after a line magic is passed to it.
%matplotlib inline 


# Adjust number of CPU cores to use
# (passed as `n_jobs` to the RandomForestClassifier further down).
n_jobs=1

In [5]:
# Load the preprocessed training and test splits from disk.
# NOTE(review): no feature names are loaded in this cell, although a later
# cell references `feature_names` -- confirm where that is meant to come from.
X_train_small, X_test_small, y_train_small, y_test_small = map(
    joblib.load,
    [
        'data_processed/X_train_small.joblib',
        'data_processed/X_test_small.joblib',
        'data_processed/y_train_small.joblib',
        'data_processed/y_test_small.joblib',
    ],
)

### Adjusting threshold

#### Random Forests

In [9]:
# Random forest settings, collected in one place so they are easy to tweak.
rf_settings = dict(
    n_estimators=100,
    oob_score=False,
    n_jobs=n_jobs,
    random_state=1,
    class_weight='balanced_subsample',
)
rf = RandomForestClassifier(**rf_settings)

# Fit on the training split; the fitted estimator is the cell's output.
rf.fit(X_train_small, y_train_small)

In [8]:
# Save results
# NOTE(review): `average_precision`, `classification_reports`,
# `most_important_features`, `feature_names` and `y_pred_rf` are not assigned
# anywhere in the visible notebook, and `y_pred_proba_rf` is only assigned in
# a later cell. Unless they are defined elsewhere, this cell raises NameError
# on a fresh "Restart & Run All" -- TODO confirm and define them before use.

# Average precision of the random-forest scores against the test labels.
average_precision['random forests'] = \
    average_precision_score(y_test_small, y_pred_proba_rf)
# Classification report for the random-forest predictions in `y_pred_rf`.
classification_reports['random forests'] = \
    classification_report(y_test_small, y_pred_rf)

# Compute feature importance and sort
# (descending), keeping only the first 10 entries.
most_important_features['random forests'] = \
    pd.Series(rf.feature_importances_, index=feature_names) \
            .sort_values(ascending=False) \
            .iloc[: 10]

NameError: name 'y_small' is not defined

#### Load other models and make predictions

In [12]:
# Restore the saved models from disk, one file per model.
# joblib files are pickles: only load files from a trusted source.
lr_gs_1, svm_lin_gs_1, svm_rbf_gs_1, svm_poly_gs_1 = (
    joblib.load(model_path)
    for model_path in (
        'saved_models/lr_gs_1.joblib',
        'saved_models/svm_lin_gs_1.joblib',
        'saved_models/svm_rbf_gs_1.joblib',
        'saved_models/svm_poly_gs_1.joblib',
    )
)

In [20]:
# Continuous scores on the test set, later compared against a threshold.

# Models with predict_proba: keep column 1 of the returned probabilities.
y_pred_proba_rf, y_pred_proba_lr_1 = (
    proba_model.predict_proba(X_test_small)[:, 1]
    for proba_model in (rf, lr_gs_1)
)

# SVMs: use the value returned by decision_function
# (distance from the separating hyperplane).
y_pred_distance_svm_lin_1, y_pred_distance_svm_rbf_1, \
    y_pred_distance_svm_poly_1 = (
        svm_model.decision_function(X_test_small)
        for svm_model in (svm_lin_gs_1, svm_rbf_gs_1, svm_poly_gs_1)
    )

#### Adjust the decision threshold

In [31]:
# Define function to adjust threshold
def predict_class(threshold, probabilities=None, distances=None):
    """
    Predict the class from probabilities or distances to the separating
    hyperplane, given a decision threshold.

    Exactly one of `probabilities` or `distances` must be supplied.

    Parameters
    ----------
    threshold : float
        Decision threshold. A sample is assigned class 1 when its score is
        strictly greater than this value, and class 0 otherwise.
    probabilities : array-like, optional
        Predicted scores on a probability scale.
    distances : array-like, optional
        Predicted distances from the separating hyperplane.

    Returns
    -------
    classes
        Integer 0/1 labels with the same shape as the input scores. NumPy
        arrays (and other objects supporting `>` and `.astype`, such as a
        pandas Series) keep their type; lists and tuples are returned as a
        NumPy array.

    Raises
    ------
    ValueError
        If both or neither of `probabilities` and `distances` are passed.
    """
    # Validate once up front: the two inputs are mutually exclusive, and
    # passing neither would otherwise leave nothing to classify.
    if (probabilities is None) == (distances is None):
        raise ValueError('Please apply EITHER probabilities OR distances.')

    scores = probabilities if probabilities is not None else distances

    # Plain Python sequences do not support element-wise comparison.
    if isinstance(scores, (list, tuple)):
        scores = np.asarray(scores)

    # Classify as 1 if the score is greater than the threshold, 0 otherwise.
    return (scores > threshold).astype(int)

# Example: label the test samples using a cut-off of 0.4 on the linear-SVM
# decision values.
predict_class(threshold=0.4, distances=y_pred_distance_svm_lin_1)

array([0, 0, 0, ..., 1, 0, 0])

In [28]:
# Summary statistics of the linear-SVM decision values on the test set,
# useful for choosing a sensible threshold range.
linear_svm_distances = pd.Series(y_pred_distance_svm_lin_1)
linear_svm_distances.describe()

count    10000.000000
mean        -0.157505
std          0.842071
min         -3.460922
25%         -0.707586
50%         -0.151110
75%          0.409694
max          4.310334
dtype: float64

### XGBoost

In [7]:
# Wrap the training features and labels in an XGBoost DMatrix
# (the input format expected by `xgb.cv` below).
data_xgb_train = xgb.DMatrix(X_train_small, label=y_train_small)

In [None]:
# # specify parameters via map
# param = {'objective':'binary:logistic', 'eval_metric':'map',
#          'scale_pos_weight':5,  # Balance class weight
#          'seed':0}
# num_round = 2

# train(param, X_train_small, num_round)
# y_pred = xgb_.predict(X_test_small)

In [5]:
# Gradient-boosted trees via the scikit-learn style XGBoost wrapper.
xgb_ = xgb.XGBClassifier(objective='binary:logistic', eval_metric='map',
                         scale_pos_weight=5,  # Balance class weight
                         seed=0)
xgb_.fit(X_train_small, y_train_small)

# Hard 0/1 class predictions on the test set.
y_xgb = xgb_.predict(X_test_small)

# Average precision is a ranking metric: score it with the predicted
# probability of the positive class rather than the hard 0/1 labels, the
# same way the random-forest scores are evaluated elsewhere in the notebook.
y_pred_proba_xgb = xgb_.predict_proba(X_test_small)[:, 1]
average_precision_score(y_test_small, y_pred_proba_xgb)

TypeError: fit() missing 1 required positional argument: 'y'

In [None]:
# Booster settings shared by every cross-validation fold.
params = dict(
    objective='binary:logistic',
    eval_metric='map',
    scale_pos_weight=5,  # Balance class weight
    seed=0,
)

# 3-fold cross-validation over 50 boosting rounds; `as_pandas=True` requests
# the evaluation history as a pandas object.
xgb_cv = xgb.cv(params, data_xgb_train, num_boost_round=50, nfold=3,
                as_pandas=True)

In [1]:
from fastai import train_cats

ModuleNotFoundError: No module named 'fastai'