In [1]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
%matplotlib inline

In [2]:
from rep.utils import train_test_split
from sklearn.metrics import roc_auc_score

# Feature columns handed to the classifier through its `features` argument.
variables = [
    "FlightDistance",
    "FlightDistanceError",
    "IP",
    "VertexChi2",
    "pt",
    "p0_pt",
    "p1_pt",
    "p2_pt",
    "LifeTime",
    "dira",
]

# Read the tab-separated signal and background toy samples.
sig_data = pd.read_csv('../toy_datasets/toyMC_sig_mass.csv', sep='\t')
bck_data = pd.read_csv('../toy_datasets/toyMC_bck_mass.csv', sep='\t')

# Signal rows are stacked on top of background rows; the label vector follows
# the same order: 1 for each signal row, then 0 for each background row.
data = pd.concat([sig_data, bck_data])
labels = np.array([1] * len(sig_data) + [0] * len(bck_data))

In [3]:
# Split rows 50/50 into train and test parts. The fixed random_state makes the
# split repeatable across kernel restarts; REP's train_test_split forwards
# extra keyword arguments to scikit-learn's splitter.
X_train, X_test, y_train, y_test = train_test_split(data, labels, train_size=0.5, random_state=42)

In [4]:
# Display (rows, columns) of the training part as a quick sanity check.
X_train.shape

(72449, 40)

# Neurolab

Тип сети задаётся необязательным параметром `net_type`. Методы `fit` и `predict` работают в полном соответствии со спецификацией scikit-learn; никаких предварительных преобразований входных данных производить не нужно. Кроме того, в соответствии с идеологией REP поддерживается аргумент `features`, определяющий, по каким признакам строится модель.

In [12]:
import neurolab as nl
from rep.estimators import NeurolabClassifier

# Transfer-function instances. f2 (SoftMax) is created but not referenced by
# any cell visible in this notebook.
f2 = nl.trans.SoftMax()
f = nl.trans.LogSig()

# The same LogSig instance fills all three entries of `transf` (presumably one
# entry per layer -- confirm against NeurolabClassifier). show=1 presumably
# prints the training error every epoch, matching the log of the fit below.
clf = NeurolabClassifier(features=variables, transf=[f] * 3, show=1)

In [19]:
# Train on the training part; %time reports CPU and wall-clock time of the call.
%time clf.fit(X_train, y_train)

Epoch: 1; Error: 18112.25;
Epoch: 2; Error: 10252.4735735;
Epoch: 3; Error: 9965.43621617;
Epoch: 4; Error: 26737.2827356;
Epoch: 5; Error: 12012.6412675;
Epoch: 6; Error: 9006.56659507;
Epoch: 7; Error: 9025.80858723;
Epoch: 8; Error: 8748.69865159;
Epoch: 9; Error: 8790.338461;
Epoch: 10; Error: 8730.6051094;
The maximum number of train epochs is reached
CPU times: user 2min 21s, sys: 208 ms, total: 2min 21s
Wall time: 2min 21s


In [16]:
# Predictions of the fitted classifier on the held-out part: class labels
# and a two-column probability array (one column per class).
predict_labels = clf.predict(X_test)
predict_proba = clf.predict_proba(X_test)

In [17]:
# Eyeball the raw predictions (Python 2 print statements).
print predict_labels
print predict_proba

[1 1 1 ..., 1 1 1]
[[ 0.14969219  0.85030781]
 [ 0.16340585  0.83659415]
 [ 0.14998982  0.85001018]
 ..., 
 [ 0.14882971  0.85117029]
 [ 0.14423919  0.85576081]
 [ 0.14883047  0.85116953]]


In [18]:
# ROC AUC of the first fit, scored on the second probability column (taken as
# the score for label 1 -- assumes sklearn-style column ordering).
# roc_auc_score is already imported in the data-loading cell, so it is not
# imported again here.
roc_auc_score(y_test, predict_proba[:, 1])

0.71249081962693095

Проверим, что set_params работает

In [31]:
# Change hyperparameters on the existing estimator and train again on the same
# training part. show=0 presumably silences the per-epoch log (none is printed
# in the output of this cell).
clf.set_params(epochs=50, show=0)
%time clf.fit(X_train, y_train)

CPU times: user 12min, sys: 933 ms, total: 12min 1s
Wall time: 12min 1s


In [23]:
# Recompute test-set probabilities; this overwrites the earlier predict_proba.
predict_proba = clf.predict_proba(X_test)

In [24]:
# ROC AUC on the test part, scored on the second probability column (taken as
# the score for label 1 -- assumes sklearn-style column ordering).
roc_auc_score(y_test, predict_proba[:, 1])

0.82945868098368991

In [30]:
from sklearn.metrics import classification_report
from sklearn.metrics import zero_one_loss

print "Accuracy:", zero_one_loss(y_test, predict_labels)
print "Classification report:"
print classification_report(y_test, predict_labels)

Accuracy: 0.138881987578
Classification report:
             precision    recall  f1-score   support

          0       0.88      0.02      0.03     10208
          1       0.86      1.00      0.93     62242

avg / total       0.86      0.86      0.80     72450



Проверим, что сеть нормально (де-)сериализуется на диск

In [27]:
import pickle

# Round-trip the fitted classifier through a pickle file. The `with` blocks
# close each handle as soon as it is no longer needed, so the dump is fully
# flushed to disk before it is read back and no descriptor is leaked.
with open("dump.p", "wb") as dump_file:
    pickle.dump(clf, dump_file)

# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open("dump.p", "rb") as dump_file:
    clf_loaded = pickle.load(dump_file)

In [29]:
# Score the unpickled classifier on the same test part; a ROC AUC equal to the
# one from the in-memory model indicates the round trip preserved the network.
predict_proba = clf_loaded.predict_proba(X_test)
roc_auc_score(y_test, predict_proba[:, 1])

0.82945868098368991