In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.patches as mpatches
from sklearn import metrics

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

In [3]:
import os
from sys import argv
import pdb

In [4]:
def get_data(data_dir):
    fs = [data_dir + f for f in os.listdir(data_dir) if ('signal' in f or 'WZ' in f) and f[0] != '.']
    df = pd.DataFrame()

    for f in fs:
        print f
        new_df = pd.read_csv(f)
        df = pd.concat([df, new_df], ignore_index = True)
        df.index = range(len(df))

    return df

In [5]:
def add_cl_ix(df):
    df['is_sig'] = [1 if 'signal' in val else 0 for val in df.cl.values]
    return df

In [61]:
class WWdataset(Dataset):

    def __init__(self, pd_dataset):
        self.dataset = pd_dataset

        self.input_vars = [col for col in self.dataset.columns if not col in ['runNumber', 'lbNumber', 'eventNumber', 'SFOS', 'is_sig', 'weight', 'cl', 'preds']]

        self.target_var = ['is_sig']
        self.weight_var = ['weight']

        self.input_np = self.dataset[self.input_vars].as_matrix().astype(dtype=np.float32)
        self.target_np = self.dataset[self.target_var].as_matrix().astype(dtype=int)
        self.weight_np =self.dataset[self.weight_var].as_matrix().astype(dtype=np.float32)

        self.inputs = torch.from_numpy(self.input_np)
        self.target = torch.from_numpy(self.target_np)
        self.weight = torch.from_numpy(self.weight_np)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        inputs = self.inputs[idx]
        target = self.target[idx]
        weight = self.weight[idx]
        return inputs, target, weight

    def n_input(self):
        return len(self.input_vars)

In [51]:
def net_logistic_regression(n_input):
    model = torch.nn.Sequential(
        torch.nn.Linear(n_input, 2),
    )
    return model

In [57]:
def net_deep_logistic_regression(n_input):
    model = torch.nn.Sequential(
        torch.nn.Linear(n_input, 100),
        torch.nn.ReLU(),
        torch.nn.Linear(100, 10),
        torch.nn.ReLU(),
        torch.nn.Linear(10, 2),
    )
    return model

In [62]:
data_dir = "../data/"
pandas_dataset = add_cl_ix(get_data(data_dir))

in_dataset = WWdataset(pandas_dataset)
trainloader = torch.utils.data.DataLoader(in_dataset, batch_size=200, shuffle=True, num_workers=2)

../data/bkg_WZqqll.csv
../data/signal_WmWpWm.csv
../data/bkg_WZlvll.csv
../data/signal_WpWpWm.csv


In [58]:
# net = net_logistic_regression(in_dataset.n_input())
net = net_deep_logistic_regression(in_dataset.n_input())
criterion = nn.CrossEntropyLoss(reduce=False)
optimizer = torch.optim.Adam(net.parameters(), lr=0.001)

# Training

In [59]:
for epoch in range(3):
        print
        print "epoch: ", epoch
        running_loss = 0.
        for i, data in enumerate(trainloader):
            inputs, label, weight = data
            inputs, label, weight = Variable(inputs), Variable(label), Variable(weight)

            optimizer.zero_grad()
            output = net(inputs)
            losses = criterion(output, label.squeeze())
            loss = (losses * weight.squeeze()).sum()
            loss.backward()
            optimizer.step()

            running_loss += loss.data[0]
            if i % 200 == 199:    # print every 2000 mini-batches
                print "batch:  {}, loss: {}".format(i+1, running_loss/(i+1))


epoch:  0
batch:  200, loss: 0.125596643128
batch:  400, loss: 0.113505237438
batch:  600, loss: 0.107505051469
batch:  800, loss: 0.102837853758
batch:  1000, loss: 0.0991723927259
batch:  1200, loss: 0.0966603812389
batch:  1400, loss: 0.094770425284
batch:  1600, loss: 0.0931869727024
batch:  1800, loss: 0.0919440372268
batch:  2000, loss: 0.09094005966
batch:  2200, loss: 0.0900775078989
batch:  2400, loss: 0.0895538578555
batch:  2600, loss: 0.0889533599208
batch:  2800, loss: 0.0884405116298
batch:  3000, loss: 0.0880515721105
batch:  3200, loss: 0.0876767563447
batch:  3400, loss: 0.087328800845

epoch:  1
batch:  200, loss: 0.0816475995071
batch:  400, loss: 0.0817221077718
batch:  600, loss: 0.0814491656981
batch:  800, loss: 0.081532221497
batch:  1000, loss: 0.0815525682978
batch:  1200, loss: 0.0816104552553
batch:  1400, loss: 0.0816084949992
batch:  1600, loss: 0.0814760368527
batch:  1800, loss: 0.0814452114598
batch:  2000, loss: 0.0814621690642
batch:  2200, loss: 0.0

In [67]:
pandas_dataset

0         10.827804
1          8.728285
2         10.333354
3          6.235691
4          7.109641
5          0.000000
6         39.645454
7          3.069092
8          3.836800
9          1.443163
10         8.151371
11         0.000000
12         7.227347
13         7.291297
14         7.812573
15         3.978964
16        16.059092
17        11.954015
18        10.873179
19        11.764853
20         5.794008
21         1.782322
22         6.860672
23        20.722750
24         2.241212
25         8.853338
26         8.894777
27        11.313497
28         0.000000
29         9.279571
            ...    
693981    15.664246
693982     0.000000
693983     3.116730
693984    14.984694
693985    11.677755
693986     4.970895
693987     0.000000
693988     5.095598
693989     0.000000
693990     0.000000
693991     8.907316
693992     4.212761
693993     9.801770
693994     0.000000
693995     6.341951
693996     0.000000
693997     0.000000
693998    13.683038
693999     0.000000


# Prediction

In [75]:
nn.functional.softmax(net(Variable(in_dataset.inputs)))

  if __name__ == '__main__':


Variable containing:
 9.9836e-01  1.6404e-03
 9.9847e-01  1.5276e-03
 9.9871e-01  1.2862e-03
           ⋮            
 9.9165e-01  8.3478e-03
 9.9066e-01  9.3382e-03
 9.8159e-01  1.8407e-02
[torch.FloatTensor of size 694011x2]

In [None]:
input_for_pred = Variable(in_dataset.inputs)
predicted_scores = net(input_for_pred)
predicted_prob = nn.functional.softmax(predicted_scores, dim=2)