# Data Loading

### Loading the Exoplanet training data

The exoplanet data has already been split into training and test sets, stored as `X_train`, `X_test`, `Y_train`, and `Y_test`.

We load the four arrays below and use them to train and evaluate the model.


In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [5]:
# Read the pre-split feature matrices and label vectors from space-delimited text files.
feature_paths = ('middle_data/X_train.txt', 'middle_data/X_test.txt')
label_paths = ('middle_data/Y_train.txt', 'middle_data/Y_test.txt')

X_train, X_test = (np.loadtxt(path, delimiter=' ') for path in feature_paths)
Y_train, Y_test = (np.loadtxt(path, delimiter=' ') for path in label_paths)

# Report every array's shape as a quick sanity check on the load.
for split_name, split_array in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("Y_train", Y_train),
    ("Y_test", Y_test),
):
    print(f"{split_name} shape: {split_array.shape}")

X_train shape: (2000, 37)
X_test shape: (2000, 37)
Y_train shape: (2000,)
Y_test shape: (2000,)


In [6]:
# Sanity check: show the Python type of the loaded feature matrix.
print(type(X_train))

<class 'numpy.ndarray'>


### 1. Data pre-processing 

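In this step, we reduce `X` to two principal components with PCA, scale each component to the range [-1, 1], and convert the labels `Y` to binary (0/1) classes. The PCA and the scaler are fitted on the training set only and then applied to the test set.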

In [8]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

# Dimensionality reduction: fit PCA on the training set only, then project the
# test set with the same components so no test information leaks into the fit.
pca = PCA(n_components=2)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

# Normalize data to the range [-1, 1] (use the training set to fit the scaler)
scaler = MinMaxScaler(feature_range=(-1, 1))
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Ensure labels are binary (if not already binary).
# Labels that are already 0/1 are kept as they are: thresholding them at their
# own median would collapse every label to 0 whenever the median is 1.
labels_already_binary = all(
    bool(((labels == 0) | (labels == 1)).all()) for labels in (Y_train, Y_test)
)
if labels_already_binary:
    Y_train = Y_train.astype(int)
    Y_test = Y_test.astype(int)
else:
    # Use ONE threshold, derived from the training labels, for both splits.
    # Thresholding the test labels at their own median would give the two
    # splits different class definitions and use test statistics to label
    # the test set.
    label_threshold = np.median(Y_train)
    Y_train = (Y_train > label_threshold).astype(int)
    Y_test = (Y_test > label_threshold).astype(int)

print(f"X_train shape after selection: {X_train.shape}")
print(f"X_test shape after selection: {X_test.shape}")

X_train shape after selection: (2000, 2)
X_test shape after selection: (2000, 2)


In [16]:
# Inspect the shape of the training label array.
Y_train.shape

(2000,)

# Building MLP model

In [10]:
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torchvision
from torchvision import transforms
from torch.utils.data import DataLoader
from tqdm import tqdm

In [None]:
# Reproducibility: seed PyTorch's global RNG so that weight initialization and
# DataLoader shuffling repeat from one run of the notebook to the next.
seed = 42
torch.manual_seed(seed)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Hyperparameters
batch_size = 32        # samples per mini-batch
learning_rate = 0.001  # step size handed to the optimizer
epochs = 10            # number of passes over the training set
num_classes = 2        # size of the model's output layer

In [21]:
# Building dataset
# Each dataset pairs the features with their labels so that iterating a loader
# yields (inputs, labels) batches, which is what the training and evaluation
# loops unpack. Labels are stored as int64 class indices, the target format
# torch.nn.CrossEntropyLoss expects.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
Y_train_tensor = torch.tensor(Y_train, dtype=torch.long)
train_dataset = torch.utils.data.TensorDataset(X_train_tensor, Y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
Y_test_tensor = torch.tensor(Y_test, dtype=torch.long)
test_dataset = torch.utils.data.TensorDataset(X_test_tensor, Y_test_tensor)
# Evaluation does not need shuffling; a fixed order keeps the batch composition
# (and therefore the batch-averaged test loss) identical between runs.
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)



In [None]:
# Creating MLP Model
class MLPModel(nn.Module):
    """Fully connected classifier with two 512-unit hidden layers and ReLU activations.

    Args:
        input_size: Number of features in each input sample.
        num_classes: Number of scores produced per sample.
    """

    def __init__(self, input_size, num_classes):
        super().__init__()
        # Layer widths: input_size -> 512 -> 512 -> num_classes.
        self.linear1 = nn.Linear(input_size, 512)
        self.linear2 = nn.Linear(512, 512)
        self.linear3 = nn.Linear(512, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the raw class scores for ``x`` (no softmax is applied)."""
        hidden = self.relu(self.linear1(x))
        hidden = self.relu(self.linear2(hidden))
        return self.linear3(hidden)


In [None]:
# Creating Training Loop
# Referenced from: https://pytorch.org/tutorials/beginner/introyt/trainingyt.html
def train_one_epoch(model, criterion, optimizer, trainloader, device=None):
    """Run one optimization pass over ``trainloader`` and return the mean batch loss.

    This function does not change the model's train/eval mode.

    Args:
        model: Network to optimize.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        trainloader: Sized iterable yielding ``(inputs, labels)`` batches.
        device: Device each batch is moved to. When ``None`` (the default) the
            device of the model's first parameter is used, so batches always
            land where the model lives rather than depending on a global
            variable. A model without parameters falls back to the CPU.

    Returns:
        float: Sum of the per-batch losses divided by ``len(trainloader)``.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    running_loss = 0.0

    for inputs, labels in tqdm(trainloader):
        optimizer.zero_grad()
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    train_loss = running_loss / len(trainloader)
    return train_loss

def evaluate_loss(model, criterion, testloader, device):
    """Return the mean of the per-batch losses of ``model`` over ``testloader``.

    Gradients are disabled for the whole pass. The model's train/eval mode is
    left as the caller set it.

    Args:
        model: Network to evaluate.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        testloader: Sized iterable yielding ``(inputs, labels)`` batches.
        device: Device each batch is moved to before the forward pass.

    Returns:
        float: Sum of the per-batch losses divided by ``len(testloader)``.
    """
    batch_losses = []
    with torch.no_grad():
        for features, targets in testloader:
            predictions = model(features.to(device))
            batch_losses.append(criterion(predictions, targets.to(device)).item())
    return sum(batch_losses) / len(testloader)

def evaluate_accuracy(model, testloader, device):
    """Return the percentage of samples in ``testloader`` that ``model`` classifies correctly.

    The predicted class of a sample is the index of its largest output score.
    Gradients are disabled for the whole pass.

    Args:
        model: Network to evaluate.
        testloader: Iterable yielding ``(inputs, labels)`` batches.
        device: Device each batch is moved to before the forward pass.

    Returns:
        float: Accuracy as a percentage between 0 and 100.
    """
    num_correct = 0
    num_seen = 0

    with torch.no_grad():
        for features, targets in testloader:
            targets = targets.to(device)
            scores = model(features.to(device))
            predicted = scores.max(dim=1).indices
            num_seen += targets.size(0)
            num_correct += (predicted == targets).sum().item()

    return (num_correct / num_seen) * 100

def train(model, optimizer, criterion, trainloader, testloader, epochs, device):
    """Train ``model`` for ``epochs`` epochs and evaluate it after every epoch.

    After each epoch the function prints the training loss, training accuracy,
    test loss and test accuracy. Whenever the test loss improves on the best
    value seen so far, the model's ``state_dict`` is saved to
    ``mlp_model_<epoch>`` (zero-based epoch index) in the working directory.

    Args:
        model: Network to train; it is moved to ``device`` before training.
        optimizer: Optimizer that updates ``model``'s parameters.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        trainloader: Iterable of ``(inputs, labels)`` training batches.
        testloader: Iterable of ``(inputs, labels)`` evaluation batches.
        epochs: Number of passes over ``trainloader``.
        device: Device to train and evaluate on (e.g. ``'cpu'`` or ``'cuda'``).

    Returns:
        tuple: ``(train_losses, test_losses, train_accuracies, test_accuracies)``,
        four lists with one entry per epoch.
    """
    train_losses = []
    test_losses = []
    train_accuracies = []
    test_accuracies = []
    best_loss = float('inf')

    # Honor the device the caller asked for instead of silently re-deriving it.
    model.to(torch.device(device))

    for epoch in range(epochs):
        print(f'EPOCH {epoch+1}/{epochs}:', end=" ")

        model.train()

        train_loss = train_one_epoch(model, criterion, optimizer, trainloader)
        train_losses.append(train_loss)
        print(f"Train Loss: {train_loss}")

        # Switch to eval mode for every measurement below.
        model.eval()

        train_accuracy = evaluate_accuracy(model, trainloader, device)
        train_accuracies.append(train_accuracy)
        print(f"Train Accuracy: {train_accuracy:.2f}%")

        test_loss = evaluate_loss(model, criterion, testloader, device)
        test_losses.append(test_loss)
        print(f"Test Loss: {test_loss}")

        test_accuracy = evaluate_accuracy(model, testloader, device)
        test_accuracies.append(test_accuracy)
        print(f"Test Accuracy: {test_accuracy:.2f}%")

        # Save best model (lowest test loss seen so far)
        if test_loss < best_loss:
            best_loss = test_loss
            model_path = 'mlp_model_{}'.format(epoch)
            torch.save(model.state_dict(), model_path)

    return train_losses, test_losses, train_accuracies, test_accuracies


In [None]:
# Training MLP Model
# The network needs one input unit per feature column of the preprocessed data.
input_size = X_train.shape[1]
mlp_model = MLPModel(input_size, num_classes)

# Print the model architecture
print(mlp_model)

# Define the optimizer for the model
optimizer = torch.optim.SGD(mlp_model.parameters(), lr=learning_rate)

criterion = torch.nn.CrossEntropyLoss()

# Train the MLP model on the loaders built in the dataset cell
train_losses, test_losses, train_accuracies, test_accuracies = train(
    mlp_model, optimizer, criterion, train_loader, test_loader, epochs, device
)

# Persist the final weights and the per-epoch loss curves
torch.save(mlp_model.state_dict(), 'mlp_model_final_weights.pth')
torch.save((train_losses, test_losses), 'mlp_model_final_losses.pt')

In [None]:
# Evaluating MLP Model
## Plot loss graph: per-epoch training and test loss on one set of axes
loss_fig, loss_ax = plt.subplots(figsize=(6, 4))
loss_ax.plot(train_losses, label="Train")
loss_ax.plot(test_losses, label="Test")
loss_ax.set_xlabel("Epoch")
loss_ax.set_ylabel("Loss")
loss_ax.set_title("MLP Model - Train and Test Losses")
loss_ax.legend()
loss_fig.tight_layout()
loss_ax.grid()
plt.show()



In [None]:
## Plot accuracy graph: per-epoch training and test accuracy on one set of axes
acc_fig, acc_ax = plt.subplots(figsize=(6, 4))
acc_ax.plot(train_accuracies, label="Train")
acc_ax.plot(test_accuracies, label="Test")
acc_ax.set_xlabel("Epoch")
acc_ax.set_ylabel("Accuracy")
acc_ax.set_title("MLP Model - Train and Test Accuracies")
acc_ax.legend()
acc_fig.tight_layout()
acc_ax.grid()
plt.show()