Now go on to classify the real majorana data using neural network!

In [1]:
import numpy as np
from matplotlib import pyplot as plt
from scipy.ndimage import gaussian_filter

good_data_small = np.load('datasets/True_majsts_data_small.npy')
good_data_large = np.load('datasets/True_majsts_data_large.npy')
bad_data_small = np.load('datasets/Andreev_majsts_data_small.npy')
bad_data_large = np.load('datasets/Andreev_majsts_data_large.npy')
ugly_data_small = np.load('datasets/Ugly_majsts_data_small.npy')
ugly_data_large = np.load('datasets/Ugly_majsts_data_large.npy')

good_data = np.concatenate([good_data_small, good_data_large], axis=0)
bad_data = np.concatenate([bad_data_small, bad_data_large], axis=0)
ugly_data = np.concatenate([ugly_data_small, ugly_data_large], axis=0)

N1, N2, N3 = len(good_data), len(bad_data), len(ugly_data)
data = np.concatenate([good_data, bad_data, ugly_data], axis=0)

N = len(data)
# we can keep the order, because scikit can shuffle them later
labels = np.concatenate([np.ones(N1), np.zeros(N2), np.zeros(N3)])
np.shape(data)

(12000, 28, 28)

To make the machine learning more confusing (Adversarial attack), we can add Gaussain noise smearing.

In [2]:
# sigma=3, ampl=0.1
for i in range(N):
    data[i] += 0.2 * np.random.normal(size=(28, 28))
    data[i] = gaussian_filter(data[i], sigma=2)
data = data.reshape((N, -1))

We can use **Anomaly Detection** method to eliminate some unrealistic data out of the training dataset.

For example, we use Isolation Forest to post process.

In [3]:
from sklearn.ensemble import IsolationForest
clf = IsolationForest(n_estimators=100, max_samples='auto', contamination=0.1, random_state=42)
clf.fit(data)

# filter
predictions = clf.predict(data)
data = data[predictions == 1]
labels = labels[predictions == 1]
data = data.reshape((-1, 28, 28))

Now generate test & training set, and feed this into the Convolutional Neural Network.

In [4]:
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.nn.functional as F
import torch
from torch.utils.data import Dataset, DataLoader

# Splitting data into training and testing sets
train_data, test_data, train_labels, test_labels = train_test_split(data, labels, test_size=0.25, random_state=42)

class CustomDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample_data = torch.tensor(self.data[idx], dtype=torch.float32).unsqueeze(0)  # Adding channel dimension
        sample_label = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample_data, sample_label

class SimpleCNN(nn.Module):
    def __init__(self):
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)  # 1 input channel, 32 output channels, 3x3 kernel
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.fc1 = nn.Linear(64*5*5, 128)   # The size is reduced to 5x5 after pooling
        self.fc2 = nn.Linear(128, 2)        # Binary classification

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(x, 2)
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2)
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)


Start training.

In [5]:
# Hyperparameters
learning_rate = 0.001
num_epochs = 100
batch_size = 64

# Dataloaders
train_dataset = CustomDataset(train_data, train_labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = CustomDataset(test_data, test_labels)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Model, loss and optimizer
model = SimpleCNN()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    for data, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

  sample_label = torch.tensor(self.labels[idx], dtype=torch.long)


Epoch [1/100], Loss: 0.4152
Epoch [2/100], Loss: 0.3235
Epoch [3/100], Loss: 0.2392
Epoch [4/100], Loss: 0.2042
Epoch [5/100], Loss: 0.2280
Epoch [6/100], Loss: 0.3708
Epoch [7/100], Loss: 0.1028
Epoch [8/100], Loss: 0.2131
Epoch [9/100], Loss: 0.1448
Epoch [10/100], Loss: 0.1192
Epoch [11/100], Loss: 0.2478
Epoch [12/100], Loss: 0.1527
Epoch [13/100], Loss: 0.2199
Epoch [14/100], Loss: 0.0782
Epoch [15/100], Loss: 0.2554
Epoch [16/100], Loss: 0.1824
Epoch [17/100], Loss: 0.3021
Epoch [18/100], Loss: 0.1076
Epoch [19/100], Loss: 0.1164
Epoch [20/100], Loss: 0.0518
Epoch [21/100], Loss: 0.1523
Epoch [22/100], Loss: 0.0959
Epoch [23/100], Loss: 0.1272
Epoch [24/100], Loss: 0.0870
Epoch [25/100], Loss: 0.1961
Epoch [26/100], Loss: 0.1237
Epoch [27/100], Loss: 0.1667
Epoch [28/100], Loss: 0.0739
Epoch [29/100], Loss: 0.1969
Epoch [30/100], Loss: 0.0530
Epoch [31/100], Loss: 0.0802
Epoch [32/100], Loss: 0.1964
Epoch [33/100], Loss: 0.0171
Epoch [34/100], Loss: 0.0884
Epoch [35/100], Loss: 0

Test the accuracy.

In [6]:
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for data, labels in test_loader:
        outputs = model(data)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f"Accuracy: {100 * correct / total}%")

  sample_label = torch.tensor(self.labels[idx], dtype=torch.long)


Accuracy: 95.14814814814815%


Deep learning wins!