# Using scikit-learn calibration

In [ ]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
try:
    # train_test_split lives in model_selection since scikit-learn 0.18.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only ship the legacy module.
    from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.metrics import log_loss
from sklearn.calibration import CalibratedClassifierCV

# Sanity check that the cell ran; the call form of print works on both
# Python 2 and Python 3.
print("hello")

# Import Data (Kaggle OTTO challenge)

In [ ]:
# Load the training data and discard the 'id' column right away.
X = pd.read_csv('../train.csv').drop('id', axis=1)

# Pull the labels out of the frame and encode them so that the
# estimators below can work with them.
y = LabelEncoder().fit_transform(X['target'].values)

# The label must not stay among the features, otherwise the task is trivial.
X = X.drop('target', axis=1)

# Preview the first rows of the feature table.
X.head(5)

# Split Train / Test

In [ ]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable between runs.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=36,
)

# Train and apply a Random Forest (without calibration)

In [ ]:
# Baseline: a random forest without probability calibration.
clf = RandomForestClassifier(n_estimators=250, n_jobs=-1)
# We use a BaggingClassifier to make 5 predictions and average them,
# because that is what CalibratedClassifierCV does behind the scenes
# and we want to compare things fairly.
clfbag = BaggingClassifier(clf, n_estimators=5)
clfbag.fit(Xtrain, ytrain)
ypreds = clfbag.predict_proba(Xtest)
# The call form of print runs on both Python 2 and Python 3.
# eps=1e-15 / normalize=True only restated log_loss's defaults, and `eps`
# was deprecated in scikit-learn 1.3 and later removed, so they are omitted.
print("%.2f" % log_loss(ytest, ypreds))

# Train and apply a Random Forest (with calibration)

In [ ]:
# Same forest as the uncalibrated run, now wrapped in a probability calibrator.
clf = RandomForestClassifier(n_estimators=250, n_jobs=-1)
# In our case, 'isotonic' works better than the default 'sigmoid'.
calibrated_clf = CalibratedClassifierCV(clf, method='isotonic', cv=5)
calibrated_clf.fit(Xtrain, ytrain)
ypreds = calibrated_clf.predict_proba(Xtest)
# The call form of print runs on both Python 2 and Python 3.
# eps=1e-15 / normalize=True only restated log_loss's defaults, and `eps`
# was deprecated in scikit-learn 1.3 and later removed, so they are omitted.
print("%.2f" % log_loss(ytest, ypreds))

# Calibration greatly improved performance!