In [43]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Load Data

In [44]:
# apple_quality.csv is read from the notebook's working directory
raw_data = (
    pd.read_csv("apple_quality.csv")
    # the final row of the file is a signature line, not a sample
    .pipe(lambda frame: frame.drop(index=frame.index[-1]))
    # the A_id column is not kept
    .drop(columns=['A_id'])
)

# Preview the first rows
raw_data.head()

Unnamed: 0,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity,Quality
0,-3.970049,-2.512336,5.34633,-1.012009,1.8449,0.32984,-0.491590483,good
1,-1.195217,-2.839257,3.664059,1.588232,0.853286,0.86753,-0.722809367,good
2,-0.292024,-1.351282,-1.738429,-0.342616,2.838636,-0.038033,2.621636473,bad
3,-0.657196,-2.271627,1.324874,-0.097875,3.63797,-3.413761,0.790723217,good
4,1.364217,-1.296612,-0.384658,-0.553006,3.030874,-1.303849,0.501984036,good


Show data info

In [45]:
# Print a per-column summary of the frame (non-null counts, dtypes, memory usage)
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Size         4000 non-null   float64
 1   Weight       4000 non-null   float64
 2   Sweetness    4000 non-null   float64
 3   Crunchiness  4000 non-null   float64
 4   Juiciness    4000 non-null   float64
 5   Ripeness     4000 non-null   float64
 6   Acidity      4000 non-null   object 
 7   Quality      4000 non-null   object 
dtypes: float64(6), object(2)
memory usage: 250.1+ KB


Display datatypes

In [46]:
# Show the dtype pandas assigned to each column
raw_data.dtypes

Size           float64
Weight         float64
Sweetness      float64
Crunchiness    float64
Juiciness      float64
Ripeness       float64
Acidity         object
Quality         object
dtype: object

Display number of unique values

In [47]:
# Count the distinct values in each column
raw_data.nunique()

Size           4000
Weight         4000
Sweetness      4000
Crunchiness    4000
Juiciness      4000
Ripeness       4000
Acidity        4000
Quality           2
dtype: int64

Use encoder to convert data into numeric values

In [48]:
from sklearn.preprocessing import LabelEncoder

# Replace the textual Quality labels with integer class ids
# (in the recorded output 'bad' becomes 0 and 'good' becomes 1)
label_encoder = LabelEncoder()
label_encoder.fit(raw_data['Quality'])
raw_data['Quality'] = label_encoder.transform(raw_data['Quality'])

# Preview the frame after encoding
raw_data.head()

Unnamed: 0,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity,Quality
0,-3.970049,-2.512336,5.34633,-1.012009,1.8449,0.32984,-0.491590483,1
1,-1.195217,-2.839257,3.664059,1.588232,0.853286,0.86753,-0.722809367,1
2,-0.292024,-1.351282,-1.738429,-0.342616,2.838636,-0.038033,2.621636473,0
3,-0.657196,-2.271627,1.324874,-0.097875,3.63797,-3.413761,0.790723217,1
4,1.364217,-1.296612,-0.384658,-0.553006,3.030874,-1.303849,0.501984036,1


Find number of columns and rows

In [49]:
# (number of rows, number of columns)
raw_data.shape

(4000, 8)

# Create PyTorch neural network

In [50]:
# Create network
class NeuralNetwork(nn.Module):
    """Fully connected binary classifier with two ReLU hidden layers.

    The final layer has a single unit followed by a sigmoid, so the forward
    pass returns one value in (0, 1) per sample.

    Args:
        in_features: Number of input features per sample. Defaults to 7,
            the size this notebook was originally hard-coded for.
        hidden_size: Width of both hidden layers. Defaults to 128.
    """

    def __init__(self, in_features: int = 7, hidden_size: int = 128):
        super().__init__()
        # Attribute names are kept so existing state_dict keys stay valid
        self.layer1 = nn.Linear(in_features, hidden_size)
        self.layer2 = nn.Linear(hidden_size, hidden_size)
        self.layer3 = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Map a ``(..., in_features)`` tensor to ``(..., 1)`` sigmoid outputs."""
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        # torch.sigmoid is the documented spelling; F.sigmoid emits a
        # deprecation warning on some PyTorch releases
        return torch.sigmoid(self.layer3(x))
    
# Seed PyTorch's RNG before the model is built so the random weight
# initialisation is repeatable across runs (same value as the split's random_state)
torch.manual_seed(42)

# Select correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create an instance of the model
model = NeuralNetwork().to(device)

# Define loss function and optimizer
criterion = nn.BCELoss()    # expects probabilities in [0, 1], which the model's sigmoid output provides
optimizer = optim.AdamW(model.parameters(), lr=1e-4)


# Prepare Data for learning

Select X and Y (data and labels)

In [51]:
# Feature matrix: columns 0-6, cast to float32
feature_columns = raw_data.iloc[:, 0:7]
X = feature_columns.to_numpy().astype('float32')

# Label vector: column 7 (Quality), cast to float32
label_column = raw_data.iloc[:, 7]
Y = label_column.to_numpy().astype('float32')



Data preprocessing

In [52]:
from sklearn.preprocessing import StandardScaler

# Learn each feature's mean and standard deviation, then standardise X with them
# NOTE(review): the scaler is fitted on all rows before the train/test split
# further down, so test-set statistics influence the scaling — consider
# fitting on the training rows only.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

Convert to tensor

In [53]:
# Turn features and labels into float32 tensors living on the selected device
X, Y = (
    torch.tensor(array, dtype=torch.float32, device=device)
    for array in (X, Y)
)

# Split data into training and testing sets

In [54]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed random_state keeps the split repeatable
(X_training, X_testing,
 y_training, y_testing) = train_test_split(X, Y, random_state=42, test_size=0.2)

# Train the model

In [55]:
# Full-batch training: each epoch is a single optimisation step on the whole training split
num_epochs = 10000

# Labels reshaped once into a column so they line up with the model's (N, 1) output
training_targets = y_training.view(-1, 1)

for epoch in range(num_epochs):
    # Drop gradients left over from the previous step
    optimizer.zero_grad()

    # Forward pass and loss
    outputs = model(X_training)
    loss = criterion(outputs, training_targets)

    # Backward pass and parameter update
    loss.backward()
    optimizer.step()

    # Only report every 1000th epoch
    if (epoch + 1) % 1000 != 0:
        continue
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


Epoch [1000/10000], Loss: 0.2376
Epoch [2000/10000], Loss: 0.1559
Epoch [3000/10000], Loss: 0.0901
Epoch [4000/10000], Loss: 0.0446
Epoch [5000/10000], Loss: 0.0178
Epoch [6000/10000], Loss: 0.0070
Epoch [7000/10000], Loss: 0.0030
Epoch [8000/10000], Loss: 0.0014
Epoch [9000/10000], Loss: 0.0007
Epoch [10000/10000], Loss: 0.0003


# Evaluate the model

In [56]:
# Score the held-out split; gradient tracking is disabled because nothing is trained here
with torch.no_grad():
    outputs = model(X_testing)
    # Outputs of 0.5 or more count as class 1, everything else as class 0
    predicted = outputs.ge(0.5).float()
    # Fraction of predictions that equal the true label
    accuracy = predicted.eq(y_testing.view(-1, 1)).float().mean()
    print(f'Accuracy on test data: {accuracy.item():.4f}')


Accuracy on test data: 0.9337
