In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import glob

import torch
import torch.nn as nn
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from torch.utils.data import Dataset, DataLoader

from mpl_toolkits.mplot3d import Axes3D
%matplotlib notebook

In [2]:
from scripts.AutoEncoder import AutoEncoder, AutoEncoderDataset, Encoder, Decoder
from scripts.utils import train_keys, target_keys, ScaleData

In [3]:
train = "/share/rcifdata/jbarr/UKAEAGroupProject/data/train_data_clipped.pkl"
df_train = pd.read_pickle(train)
df_train = df_train[train_keys]
df_train, scaler = ScaleData(df_train)

df_train.describe()

Unnamed: 0,ane,ate,autor,machtor,x,zeff,gammae,q,smag,alpha,ani1,ati0,normni1,ti_te0,lognustar
count,26715960.0,26715960.0,26715960.0,26715960.0,26715960.0,26715960.0,26715960.0,26715960.0,26715960.0,26715960.0,26715960.0,26715960.0,26715960.0,26715960.0,26715960.0
mean,-7.554751000000001e-17,2.325436e-16,-1.969976e-14,-2.478637e-14,-1.175991e-15,-7.503728e-15,4.034983e-14,-3.315205e-15,-1.910605e-15,4.421118e-16,-5.968948e-16,2.790048e-16,2.490396e-15,1.438155e-14,1.09086e-16
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-224.9306,-381.4446,-74.574,-2.143341,-1.921246,-1.297033,-171.0608,-1.27779,-84.74415,-26.7842,-55.09811,-210.9354,-1.799047,-3.488317,-3.060594
25%,-0.03258431,-0.2497779,-0.2199693,-0.5673091,-0.7880002,-0.7826355,0.0001095864,-0.7187233,-0.5207354,-0.314272,-0.1510736,-0.1056034,-0.2791258,-0.1170805,-0.7257788
50%,-0.01714946,-0.1156905,-0.2199693,-0.5673091,0.04251098,-0.1509808,0.0001095864,-0.2630268,-0.276017,-0.2158643,-0.09455876,-0.04192553,-0.2264989,-0.08693051,-0.1606022
75%,0.005464286,0.06980049,0.101768,0.3190794,0.8328257,0.5075416,0.0001095864,0.4315107,0.224457,0.06491163,-0.01128275,0.03915007,-0.06719322,-0.08693051,0.5922162
max,224.9695,358.5519,52.36406,5.67917,1.572247,15.05446,171.0611,24.17821,285.1771,144.0235,61.17973,210.6058,20.36316,17.87184,8.10916


In [4]:
test = "/share/rcifdata/jbarr/UKAEAGroupProject/data/test_data_clipped.pkl"

n = 10_000
df_test = pd.read_pickle(test)
target = df_test['target']

df_test_good = df_test[df_test.target == 1]
df_test_good = df_test_good[train_keys]
df_test_good,_ = ScaleData(df_test_good)

df_test_bad = df_test[df_test.target == 0]
df_test_bad = df_test_bad[train_keys]
df_test_bad,_ = ScaleData(df_test_bad)

df_test_good.describe()

Unnamed: 0,ane,ate,autor,machtor,x,zeff,gammae,q,smag,alpha,ani1,ati0,normni1,ti_te0,lognustar
count,2209628.0,2209628.0,2209628.0,2209628.0,2209628.0,2209628.0,2209628.0,2209628.0,2209628.0,2209628.0,2209628.0,2209628.0,2209628.0,2209628.0,2209628.0
mean,-5.956583000000001e-17,6.122434000000001e-17,-3.13511e-15,-8.355476e-15,2.383218e-16,1.525072e-15,2.131544e-14,-1.902655e-15,-4.907369e-16,-1.569095e-18,8.635701000000001e-17,3.133119e-16,-1.612169e-16,-8.538924e-15,-2.840101e-16
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-13.86475,-6.598887,-18.9441,-2.064727,-1.685797,-1.271646,-26.09861,-1.227295,-2.20655,-4.28326,-15.40094,-7.623715,-1.819762,-3.979516,-3.129206
25%,-0.4041698,-0.5671836,-0.2931734,-0.531599,-0.9629006,-0.7970153,0.04211448,-0.7368617,-0.6529241,-0.4752486,-0.3493425,-0.5474067,-0.269191,-0.1083496,-0.7258671
50%,-0.2000343,-0.2617571,-0.2931734,-0.531599,-0.1392759,-0.154358,0.04211448,-0.2880362,-0.3912937,-0.3346264,-0.1949291,-0.2497517,-0.2121132,-0.1083496,-0.1347984
75%,0.09636549,0.2113943,-0.02571284,0.1527294,0.9358571,0.5055925,0.04211448,0.4403853,0.2807673,0.08103228,0.04739077,0.263712,-0.0961189,-0.1083496,0.5852202
max,13.74234,10.91405,29.23029,5.544833,1.680935,15.30832,13.12432,25.27071,6.818796,18.65422,14.45261,12.8323,20.85188,20.33078,6.972556


## Model 1 - AE trained on inputs that give outputs

In [5]:
path = glob.glob("/share/rcifdata/jbarr/UKAEAGroupProject/logs/AutoEncoder/Run-13/*")[0]

model = AutoEncoder.load_from_checkpoint(path, n_input = 15, batch_size = 2048, epochs = 150, learning_rate = 0.0025)
encoder = model.encoder

### Evaluate on inputs that give outputs

In [6]:
data_good = torch.from_numpy(df_test_good.values).float()
outputs_good = encoder.forward(data_good).detach().numpy()

In [7]:
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(outputs_good[:n,0], outputs_good[:n,1],outputs_good[:n,2])
fig.show()

<IPython.core.display.Javascript object>

In [8]:
plt.figure()

sc = plt.scatter(outputs_good[:n,0], outputs_good[:n,1], c = outputs_good[:n,2])
plt.xlim(-20,20)
plt.ylim(-50,0)
plt.colorbar(sc)
plt.show()

<IPython.core.display.Javascript object>

### Plot  input and output distributions

In [9]:
AE_output = model.forward(data_good).detach().numpy()
df_ae_output = pd.DataFrame(AE_output, columns = train_keys)
df_ae_output['AE'] = 'Outputs'

df_test_tmp = df_test_good
df_test_tmp['AE'] = 'Inputs'

In [10]:
df_compare = pd.concat([df_ae_output, df_test_tmp], ignore_index=True)
df_compare_sample = df_compare.sample(n)

In [11]:
for i in train_keys:
    plt.figure()
    x_min = df_compare_sample[i].quantile(0.1)
    x_max = df_compare_sample[i].quantile(0.9)
    sns.histplot(data = df_compare_sample, x = i, hue = "AE", binrange = (x_min, x_max), bins = 100);
    plt.xlabel(i)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Evaluate on all inputs

In [12]:
data_bad = torch.from_numpy(df_test_bad.values).float()
outputs_bad = encoder.forward(data_bad).detach().numpy()

In [13]:
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(outputs_bad[:n,0], outputs_bad[:n,1],outputs_bad[:n,2])
fig.show()

<IPython.core.display.Javascript object>

In [14]:
plt.figure()
#sc = plt.scatter(outputs_test_full[:n,0], outputs_test_full[:n,1], c = outputs_test_full[:n,2])
#plt.colorbar(sc)
plt.scatter(outputs_good[:n,0], outputs_good[:n,1], c = outputs_good[:n,2])
plt.scatter(outputs_bad[:n,0], outputs_bad[:n,1], c = outputs_bad[:n,2])
plt.colorbar()

plt.show()

<IPython.core.display.Javascript object>

In [15]:
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(outputs_bad[:n,0], outputs_bad[:n,1],outputs_bad[:n,2])
ax.scatter(outputs_good[:n,0], outputs_good[:n,1],outputs_good[:n,2])
fig.show()



<IPython.core.display.Javascript object>

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier

In [17]:
df_1 = pd.DataFrame(data = outputs_good)
df_1['target'] = 1

df_2 = pd.DataFrame(data = outputs_bad)
df_2['target'] = 0

df = pd.concat([df_1, df_2])

class_train, class_test = train_test_split(df)
class_train

Unnamed: 0,0,1,2,target
1655088,-1.268947,-2.252769,1.043522,1
335185,-1.246097,-4.832341,3.766870,1
1819481,1.861107,-3.799129,2.423420,1
1019055,1.730511,-3.416982,2.190469,0
888010,-0.530970,-4.884676,2.361863,1
...,...,...,...,...
1244780,-0.500446,-3.146528,2.154030,1
607971,-0.298745,-5.009323,2.425823,0
1787833,0.665468,-4.276926,2.387239,1
324796,0.623002,-5.381156,2.700333,0


In [18]:
#clf = LogisticRegression(C = 10, class_weight = 'balanced')
clf = RandomForestClassifier(n_estimators = 50, class_weight = 'balanced')
clf.fit(class_train[[0, 1, 2]], class_train['target'])

RandomForestClassifier(class_weight='balanced', n_estimators=50)

In [19]:
preds = clf.predict(class_test[[0,1,2]])
tn, fp, fn, tp = confusion_matrix(class_test['target'], preds).ravel()

tpr = tp / (tp + fn)
tnr = tn / (tn + fp)

In [20]:
print(tpr, tnr)

0.8878666847801476 0.7416999352897288


In [21]:
confusion_matrix(class_test['target'], preds, normalize = 'true')

array([[0.74169994, 0.25830006],
       [0.11213332, 0.88786668]])

In [22]:
print((tp + tn)/ (tp + tn + fp + fn))

0.838355248815989
