In [1]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
%matplotlib inline

In [2]:
from rep.utils import train_test_split
from sklearn.metrics import roc_auc_score

# Signal and background samples are read from separate tab-separated files.
sig_data = pd.read_csv('toy_datasets/toyMC_sig_mass.csv', sep='\t')
bck_data = pd.read_csv('toy_datasets/toyMC_bck_mass.csv', sep='\t')

# Signal rows come first, background rows second; the label vector is built
# in the same order (1 for every signal row, 0 for every background row).
data = pd.concat([sig_data, bck_data])
labels = np.array(len(sig_data) * [1] + len(bck_data) * [0])

# Columns handed to the classifier through its `features` argument.
variables = [
    "FlightDistance",
    "FlightDistanceError",
    "IP",
    "VertexChi2",
    "pt",
    "p0_pt",
    "p1_pt",
    "p2_pt",
    "LifeTime",
    "dira",
]

In [3]:
# Split the frame and the labels 50/50 into train and test parts.
# A fixed random_state makes the split identical on every Restart & Run All;
# without it each run trains and scores on a different partition.
# NOTE(review): relies on rep.utils.train_test_split accepting random_state
# like sklearn's train_test_split does - confirm against the installed REP version.
X_train, X_test, y_train, y_test = train_test_split(data, labels, train_size=0.5, random_state=42)

In [4]:
# Sanity check: (rows, columns) of the training part produced by the split.
X_train.shape

(72449, 40)

# Neurolab

Тип сети задаётся опциональным параметром net_type. fit и predict работают в полном соответствии с sklearn'овской спецификацией, никаких предварительных преобразований входных данных производить не нужно. Также, в соответствии с идеологией REP, поддерживается аргумент features, отвечающий за то, по каким признакам идёт построение модели.

In [5]:
import neurolab as nl
from rep.estimators import NeurolabClassifier

# Transfer functions. Only `f` (LogSig) is passed to the classifier below;
# `f2` (SoftMax) is created but not used in this cell.
f = nl.trans.LogSig()
f2 = nl.trans.SoftMax()

# `features=variables` restricts training to the listed columns.
# NOTE(review): size=[300] presumably means a single hidden layer of 300 units - confirm.
clf = NeurolabClassifier(
    features=variables,
    size=[300],
    transf=[f, f],
    trainf=nl.train.train_rprop,
    epochs=3,
    show=1,
)

In [6]:
# Train on the training part and time it; assigning the return value to `_`
# keeps the estimator repr out of the cell output.
%time _ = clf.fit(X_train, y_train)

Epoch: 1; Error: 18112.25;
Epoch: 2; Error: 10254.4734708;
Epoch: 3; Error: 10076.9587201;
The maximum number of train epochs is reached
CPU times: user 29.9 s, sys: 64.8 ms, total: 29.9 s
Wall time: 29.9 s


In [7]:
# Predictions on the held-out part: hard class labels and per-class probabilities.
predict_labels = clf.predict(X_test)
predict_proba = clf.predict_proba(X_test)

In [8]:
# Show both prediction arrays (numpy abbreviates long arrays automatically).
# The parenthesized single-argument form prints the same text under Python 2.
print(predict_labels)
print(predict_proba)

[1 1 1 ..., 1 1 1]
[[ 0.00845082  0.99154918]
 [ 0.00849629  0.99150371]
 [ 0.00787892  0.99212108]
 ..., 
 [ 0.00807403  0.99192597]
 [ 0.00787546  0.99212454]
 [ 0.00791339  0.99208661]]


In [9]:
from sklearn.metrics import roc_auc_score

# ROC AUC on the test part; column 1 of predict_proba is the score for class 1.
roc_auc_score(y_true=y_test, y_score=predict_proba[:, 1])

0.66746001380620334

Проверим, что set_params работает

In [10]:
# Change hyper-parameters through set_params (more epochs, show=0) and refit,
# timing the training run. The bare fit call also echoes the estimator repr.
clf.set_params(epochs=10, show=0)
%time clf.fit(X_train, y_train)

CPU times: user 1min 40s, sys: 160 ms, total: 1min 40s
Wall time: 1min 40s


NeurolabClassifier(features=['FlightDistance', 'FlightDistanceError', 'IP', 'VertexChi2', 'pt', 'p0_pt', 'p1_pt', 'p2_pt', 'LifeTime', 'dira'],
          initf=<function init_zeros at 0x7fe6ddb33050>, net_type=None,
          trainf=<neurolab.core.Trainer object at 0x7fe6ddb36f10>)

In [11]:
# Recompute class probabilities on the test part with the refitted classifier.
predict_proba = clf.predict_proba(X_test)

In [12]:
# ROC AUC of the refitted classifier, scored on the class-1 probability column.
roc_auc_score(y_true=y_test, y_score=predict_proba[:, 1])

0.71773685427031753

In [13]:
from sklearn.metrics import classification_report
from sklearn.metrics import zero_one_loss

print "Accuracy:", zero_one_loss(y_test, predict_labels)
print "Classification report:"
print classification_report(y_test, predict_labels)

Accuracy: 0.140869565217
Classification report:
             precision    recall  f1-score   support

          0       0.00      0.00      0.00     10206
          1       0.86      1.00      0.92     62244

avg / total       0.74      0.86      0.79     72450



  'precision', 'predicted', average, warn_for)


Проверим, что сеть нормально (де-)сериализуется на диск

In [14]:
import pickle

# Serialize the fitted classifier to disk and read it back.
# Context managers close each handle deterministically, so the dump is fully
# flushed before the file is reopened for reading.
with open("dump.p", "wb") as dump_file:
    pickle.dump(clf, dump_file)

# NOTE: pickle.load can execute arbitrary code; this is safe here only because
# the file was written by the lines directly above.
with open("dump.p", "rb") as dump_file:
    clf_loaded = pickle.load(dump_file)

In [15]:
# Score the unpickled classifier on the test part; its ROC AUC can be compared
# with the value obtained before serialization.
predict_proba = clf_loaded.predict_proba(X_test)
roc_auc_score(y_true=y_test, y_score=predict_proba[:, 1])

0.71773685427031753