Converting CSV to PyTorch tensor: https://www.codegenes.net/blog/pytorch-dataset-from-csv/

In [154]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [155]:
path = r'/workspaces/ACT-coursework-2/star_classification.csv'
df_sdss = pd.read_csv(path)
df_sdss['class'] = LabelEncoder().fit_transform(df_sdss['class'])  #changes data from string to integer variables
df_sdss.sample(10)

Unnamed: 0,obj_ID,alpha,delta,u,g,r,i,z,run_ID,rerun_ID,cam_col,field_ID,spec_obj_ID,class,redshift,plate,MJD,fiber_ID
27815,1.237679e+18,30.918809,4.13628,21.99281,21.40838,21.15975,20.93182,20.92116,7727,301,3,272,9.836065e+18,1,2.098777,8736,57400,739
75509,1.237654e+18,186.387998,64.216099,20.4981,19.45201,19.03974,18.92427,18.50794,2078,301,2,107,6.756128e+17,1,0.800508,600,52317,265
22940,1.237662e+18,239.253537,6.287921,18.05269,17.10405,16.80945,16.69879,16.67672,3894,301,1,238,2.051524e+18,2,0.000213,1822,53172,488
59108,1.237662e+18,227.023436,32.527047,22.1235,19.75591,18.28602,16.7713,15.99146,3900,301,5,604,3.30467e+18,2,-0.000335,2935,54652,559
30418,1.237679e+18,19.966213,4.396557,21.79158,22.52559,21.68066,21.5827,20.54644,7718,301,2,122,1.062195e+19,0,0.940256,9434,57712,750
43404,1.237665e+18,195.245179,35.437614,20.87053,19.05368,17.67823,17.16317,16.83086,4576,301,6,504,2.28672e+18,0,0.220472,2031,53848,62
11091,1.237668e+18,218.280662,14.227425,19.72429,17.77691,16.81737,16.34801,15.96386,5322,301,2,59,3.09406e+18,0,0.078652,2748,54234,316
40941,1.237679e+18,26.402946,2.34269,22.02573,21.71187,21.45149,21.38046,21.43656,7717,301,4,506,8.826076e+18,1,2.199759,7839,56900,532
35065,1.23767e+18,46.45069,37.019152,19.40372,18.22283,17.86272,17.73182,17.6923,5817,301,2,213,2.748348e+18,2,-0.00028,2441,54065,97
17194,1.237665e+18,159.229079,30.989822,23.65771,21.92722,20.00496,19.40892,18.98498,4576,301,1,302,1.281741e+19,0,0.358106,11384,58522,610


In [156]:
features_used = ['u', 'g', 'r', 'i', 'z']

In [157]:
features = df_sdss.drop('class', axis=1).values
target = df_sdss['class'].values

scaler = StandardScaler()

features_scaled = scaler.fit_transform(features)

In [158]:
batch_size = 164
num_epochs = 25
learning_rate = 0.001

In [159]:
features_train, features_test, target_train, target_test = train_test_split(features_scaled, target, test_size = 0.3, random_state = 42)

features_train_tensor = torch.tensor(features_train, dtype = torch.float32)
target_train_tensor = torch.tensor(target_train, dtype = torch.long)
features_test_tensor = torch.tensor(features_test, dtype = torch.float32)
target_test_tensor = torch.tensor(target_test, dtype = torch.long)

train_set = TensorDataset(features_train_tensor, target_train_tensor)
test_set = TensorDataset(features_test_tensor, target_test_tensor)


In [160]:
train_dataloader = DataLoader(
    train_set, batch_size=batch_size, shuffle=True)

test_dataloader = DataLoader(
    test_set, batch_size=batch_size, shuffle=False)

In [161]:
class SDSSNN(nn.Module):
    def __init__(self,):
        super(SDSSNN, self).__init__()
        self.fc1 = nn.Linear(17, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 3)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        x = self.relu(x)
        x = self.fc4(x)
        return x

model = SDSSNN()
print(model)

SDSSNN(
  (fc1): Linear(in_features=17, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=32, bias=True)
  (fc4): Linear(in_features=32, out_features=3, bias=True)
  (relu): ReLU()
)


In [162]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [163]:
def accuracy(outputs, labels):
    _, preds = torch.max(outputs, 1)
    return torch.sum(preds == labels).item() / len(labels)

def train(model, train_dataloader, criterion, optimizer, epoch):
    model.train()
    running_acc = 0.0
    running_loss = 0.0
    for n,(inputs, labels) in enumerate(train_dataloader):
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        running_acc += accuracy(outputs,labels)

        if (n+1) % 100 ==0:
            print(f'Epoch {epoch}, Batch{n+1}, Loss: {running_loss / 100:.4f}, Accuracy: {running_acc / 100:.4f}')
            running_loss = 0.0
            running_acc = 0.0


def test(model, test_dataloader, criterion):
    model.eval()
    test_loss = 0.0
    test_acc = 0.0
    with torch.no_grad():
        for inputs, labels in test_dataloader:
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            test_loss += loss.item()
            test_acc += accuracy(outputs, labels)
    print(f'test loss: {test_loss/len(test_dataloader):.4f}, Test accuracy: {test_acc / len(test_dataloader):.4f}')

In [164]:
for epoch in range(1, num_epochs + 1):
    train(model, train_dataloader, criterion, optimizer, epoch)
    test(model, test_dataloader, criterion)

Epoch 1, Batch100, Loss: 0.6330, Accuracy: 0.7296
Epoch 1, Batch200, Loss: 0.2517, Accuracy: 0.9201
Epoch 1, Batch300, Loss: 0.2011, Accuracy: 0.9382
Epoch 1, Batch400, Loss: 0.1811, Accuracy: 0.9431
test loss: 0.2529, Test accuracy: 0.9479
Epoch 2, Batch100, Loss: 0.1716, Accuracy: 0.9456
Epoch 2, Batch200, Loss: 0.1472, Accuracy: 0.9538
Epoch 2, Batch300, Loss: 0.1412, Accuracy: 0.9563
Epoch 2, Batch400, Loss: 0.1456, Accuracy: 0.9552
test loss: 0.2881, Test accuracy: 0.9484
Epoch 3, Batch100, Loss: 0.1442, Accuracy: 0.9550
Epoch 3, Batch200, Loss: 0.1262, Accuracy: 0.9607
Epoch 3, Batch300, Loss: 0.1304, Accuracy: 0.9595
Epoch 3, Batch400, Loss: 0.1290, Accuracy: 0.9597
test loss: 0.2802, Test accuracy: 0.9626
Epoch 4, Batch100, Loss: 0.1263, Accuracy: 0.9620
Epoch 4, Batch200, Loss: 0.1276, Accuracy: 0.9600
Epoch 4, Batch300, Loss: 0.1224, Accuracy: 0.9612
Epoch 4, Batch400, Loss: 0.1277, Accuracy: 0.9610
test loss: 0.2906, Test accuracy: 0.9649
Epoch 5, Batch100, Loss: 0.1201, Acc