In [1]:
import os 
import torch 
import sklearn.model_selection
import pandas as pd 
import numpy as np

In [3]:
os.listdir()

['.ipynb_checkpoints',
 'data.csv',
 'dataclass.py',
 'func_utils.py',
 'main.ipynb',
 'main.py',
 'network.py',
 '__pycache__']

In [4]:
data = pd.read_csv('data.csv')
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [5]:
data.isna().sum().sum()

0

In [6]:
data.quality.value_counts()

5    681
6    638
7    199
4     53
8     18
3     10
Name: quality, dtype: int64

In [7]:
target_labels = data.quality.unique() 
target_labels = np.sort(target_labels)
target_new_labels = np.argsort(target_labels)
print(target_labels,target_new_labels)
target_map = {i:j for i,j in zip(target_labels,target_new_labels)}
print(target_map)

[3 4 5 6 7 8] [0 1 2 3 4 5]
{3: 0, 4: 1, 5: 2, 6: 3, 7: 4, 8: 5}


In [8]:
# modifying the target values
data.quality = data.quality.replace(target_map)

In [9]:
# checking the value counts again 
data.quality.value_counts()

2    681
3    638
4    199
1     53
5     18
0     10
Name: quality, dtype: int64

In [10]:
X = data.iloc[:,0:-1]
y = data.iloc[:,-1]
X.shape,y.shape

((1599, 11), (1599,))

In [11]:
X_train,X_test_,y_train,y_test_ = sklearn.model_selection.train_test_split(
    X,y,stratify = y, random_state = 1,test_size = 0.3
)
X_test,X_val,y_test,y_val = sklearn.model_selection.train_test_split(
    X_test_,y_test_,stratify = y_test_,random_state = 1, test_size = 0.1
)
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)
print(X_val.shape,y_val.shape)

(1119, 11) (1119,)
(432, 11) (432,)
(48, 11) (48,)


In [12]:
scaler = sklearn.preprocessing.MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_val = scaler.transform(X_val)

y_train, y_test, y_val =  np.array(y_train),np.array(y_test),np.array(y_val)

In [13]:
epochs = 50 
batch_size = 64
learning_rate = 0.001
class ClassifierDataset(torch.utils.data.Dataset):
    
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)


train_dataset = ClassifierDataset(torch.from_numpy(X_train).float(), torch.from_numpy(y_train).long())
val_dataset = ClassifierDataset(torch.from_numpy(X_val).float(), torch.from_numpy(y_val).long())
test_dataset = ClassifierDataset(torch.from_numpy(X_test).float(), torch.from_numpy(y_test).long())

In [14]:
target_list = []
for _, t in train_dataset:
    target_list.append(t)
    
target_list = torch.tensor(target_list)

In [15]:
def get_class_distribution(obj):
    count_dict = {
        "rating_3": 0,
        "rating_4": 0,
        "rating_5": 0,
        "rating_6": 0,
        "rating_7": 0,
        "rating_8": 0,
    }
    
    for i in obj:
        if i == 0: 
            count_dict['rating_3'] += 1
        elif i == 1: 
            count_dict['rating_4'] += 1
        elif i == 2: 
            count_dict['rating_5'] += 1
        elif i == 3: 
            count_dict['rating_6'] += 1
        elif i == 4: 
            count_dict['rating_7'] += 1  
        elif i == 5: 
            count_dict['rating_8'] += 1              
        else:
            print("Check classes.")
            
    return count_dict

In [16]:
class_count = [i for i in get_class_distribution(y_train).values()]
class_weights = 1./torch.tensor(class_count, dtype=torch.float) 
print(class_weights)

tensor([0.1429, 0.0270, 0.0021, 0.0022, 0.0072, 0.0769])


In [17]:
class_weights_all = class_weights[target_list]
class_weights_all

tensor([0.0270, 0.0072, 0.0072,  ..., 0.0021, 0.0072, 0.0021])

In [18]:
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
weighted_sampler = WeightedRandomSampler(
    weights=class_weights_all,
    num_samples=len(class_weights_all),
    replacement=True
)

In [19]:
EPOCHS = 300
BATCH_SIZE = 16
LEARNING_RATE = 0.0007
NUM_FEATURES = len(X.columns)
NUM_CLASSES = 6

In [20]:
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    sampler=weighted_sampler
)
val_loader = DataLoader(dataset=val_dataset, batch_size=1)
test_loader = DataLoader(dataset=test_dataset, batch_size=1)

In [21]:
import torch.nn as nn
class MulticlassClassification(nn.Module):
    def __init__(self, num_feature, num_class):
        super(MulticlassClassification, self).__init__()
        
        self.layer_1 = nn.Linear(num_feature, 512)
        self.layer_2 = nn.Linear(512, 128)
        self.layer_3 = nn.Linear(128, 64)
        self.layer_out = nn.Linear(64, num_class) 
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.2)
        self.batchnorm1 = nn.BatchNorm1d(512)
        self.batchnorm2 = nn.BatchNorm1d(128)
        self.batchnorm3 = nn.BatchNorm1d(64)
        
    def forward(self, x):
        x = self.layer_1(x)
        x = self.batchnorm1(x)
        x = self.relu(x)
        
        x = self.layer_2(x)
        x = self.batchnorm2(x)
        x = self.relu(x)
        x = self.dropout(x)
        
        x = self.layer_3(x)
        x = self.batchnorm3(x)
        x = self.relu(x)
        x = self.dropout(x)
        
        x = self.layer_out(x)
        
        return x

In [22]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [23]:
model = MulticlassClassification(num_feature = NUM_FEATURES, num_class=NUM_CLASSES)
model.to(device)
import torch.optim as optim

criterion = nn.CrossEntropyLoss(weight=class_weights.to(device))
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
print(model)

MulticlassClassification(
  (layer_1): Linear(in_features=11, out_features=512, bias=True)
  (layer_2): Linear(in_features=512, out_features=128, bias=True)
  (layer_3): Linear(in_features=128, out_features=64, bias=True)
  (layer_out): Linear(in_features=64, out_features=6, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.2, inplace=False)
  (batchnorm1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


In [24]:
def multi_acc(y_pred, y_test):
    y_pred_softmax = torch.log_softmax(y_pred, dim = 1)
    _, y_pred_tags = torch.max(y_pred_softmax, dim = 1)    
    
    correct_pred = (y_pred_tags == y_test).float()
    acc = correct_pred.sum() / len(correct_pred)
    
    acc = torch.round(acc * 100)
    return acc

In [25]:
# for keeping the history 
accuracy_stats = {'train': [],"val": []}
loss_stats = {'train': [],"val": []}

In [26]:
print("Begin training.")
EPOCHS = 100
for e in (range(1, EPOCHS+1)):
    
    # TRAINING
    train_epoch_loss = 0
    train_epoch_acc = 0
    model.train()
    for X_train_batch, y_train_batch in train_loader:
        X_train_batch, y_train_batch = X_train_batch.to(device), y_train_batch.to(device)
        optimizer.zero_grad()
        
        y_train_pred = model(X_train_batch)
        
        train_loss = criterion(y_train_pred, y_train_batch)
        train_acc = multi_acc(y_train_pred, y_train_batch)
        
        train_loss.backward()
        optimizer.step()
        
        train_epoch_loss += train_loss.item()
        train_epoch_acc += train_acc.item()
        
        
    # VALIDATION    
    with torch.no_grad():
        
        val_epoch_loss = 0
        val_epoch_acc = 0
        
        model.eval()
        for X_val_batch, y_val_batch in val_loader:
            X_val_batch, y_val_batch = X_val_batch.to(device), y_val_batch.to(device)
            
            y_val_pred = model(X_val_batch)
                        
            val_loss = criterion(y_val_pred, y_val_batch)
            val_acc = multi_acc(y_val_pred, y_val_batch)
            
            val_epoch_loss += val_loss.item()
            val_epoch_acc += val_acc.item()

    loss_stats['train'].append(train_epoch_loss/len(train_loader))
    loss_stats['val'].append(val_epoch_loss/len(val_loader))
    accuracy_stats['train'].append(train_epoch_acc/len(train_loader))
    accuracy_stats['val'].append(val_epoch_acc/len(val_loader))
                              
    
    print(f'Epoch {e+0:03}: | Train Loss: {train_epoch_loss/len(train_loader):.5f} | Val Loss: {val_epoch_loss/len(val_loader):.5f} | Train Acc: {train_epoch_acc/len(train_loader):.3f}| Val Acc: {val_epoch_acc/len(val_loader):.3f}')

Begin training.
Epoch 001: | Train Loss: 0.92841 | Val Loss: 1.96295 | Train Acc: 36.857| Val Acc: 6.250
Epoch 002: | Train Loss: 0.41085 | Val Loss: 2.26635 | Train Acc: 46.300| Val Acc: 8.333
Epoch 003: | Train Loss: 0.33614 | Val Loss: 2.42978 | Train Acc: 47.100| Val Acc: 6.250
Epoch 004: | Train Loss: 0.30399 | Val Loss: 2.45807 | Train Acc: 47.271| Val Acc: 6.250
Epoch 005: | Train Loss: 0.31633 | Val Loss: 2.55948 | Train Acc: 46.471| Val Acc: 6.250
Epoch 006: | Train Loss: 0.32424 | Val Loss: 2.60250 | Train Acc: 46.471| Val Acc: 8.333
Epoch 007: | Train Loss: 0.24279 | Val Loss: 2.49559 | Train Acc: 50.600| Val Acc: 6.250
Epoch 008: | Train Loss: 0.27089 | Val Loss: 2.55101 | Train Acc: 51.129| Val Acc: 8.333
Epoch 009: | Train Loss: 0.22356 | Val Loss: 2.50222 | Train Acc: 54.757| Val Acc: 8.333
Epoch 010: | Train Loss: 0.22766 | Val Loss: 2.64128 | Train Acc: 52.757| Val Acc: 8.333
Epoch 011: | Train Loss: 0.22761 | Val Loss: 2.44995 | Train Acc: 54.357| Val Acc: 8.333
Epoch

In [None]:
# Create dataframes
from matplotlib import pyplot as plt
import seaborn as sns
train_val_acc_df = pd.DataFrame.from_dict(accuracy_stats).reset_index().melt(id_vars=['index']).rename(columns={"index":"epochs"})
train_val_loss_df = pd.DataFrame.from_dict(loss_stats).reset_index().melt(id_vars=['index']).rename(columns={"index":"epochs"})
# Plot the dataframes
# fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20,7)) # ax = axes[0],axes=[1]
sns.lineplot(data=train_val_acc_df, x = "epochs", y="value", hue="variable").set_title('Train-Val Accuracy/Epoch')
plt.show()
sns.lineplot(data=train_val_loss_df, x = "epochs", y="value", hue="variable").set_title('Train-Val Loss/Epoch')
plt.show()

In [None]:
y_pred_list = []
with torch.no_grad():
    model.eval()
    for X_batch, _ in test_loader:
        X_batch = X_batch.to(device)
        y_test_pred = model(X_batch)
        
        _, y_pred_tags = torch.max(y_test_pred, dim = 1)
        y_pred_list.append(y_pred_tags.cpu().numpy())
y_pred_list = [a.squeeze().tolist() for a in y_pred_list]


In [None]:
from sklearn.metrics import confusion_matrix, classification_report
class2idx = {3:0,4:1,5:2,6:3,7:4,8:5}
idx2class = {v: k for k, v in class2idx.items()}
data['quality'].replace(class2idx, inplace=True)
confusion_matrix_df = pd.DataFrame(confusion_matrix(y_test, y_pred_list)).rename(columns=idx2class, index=idx2class)
sns.heatmap(confusion_matrix_df, annot=True,cmap = 'twilight_shifted_r')
plt.show()

In [None]:
print(classification_report(y_test, y_pred_list))