In [1]:
import timm
import torch
import torch.optim as optim
import torchvision.utils as vutils
from torchvision import models
import torchvision.transforms as T
import torch.nn as nn
import torch.optim.lr_scheduler as lr_scheduler
from torch.utils.data import DataLoader, Dataset
from torch.nn import functional as F
import numpy as np
import pandas as pd
import os
from PIL import Image

import math
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, classification_report

from catboost import CatBoostClassifier

device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [2]:
def imshow(img, mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)):
    """Imshow for Tensor."""
    img = img.numpy().transpose((1, 2, 0))
    # img = std * img + mean  # unnormalize
    # img = np.clip(img, 0, 1)  # clip any values outside the range [0, 1]
    plt.imshow(img)
    plt.show()

In [3]:
def plot_images_vs_reconstructed_images(images, reconstructed_imgs):
    # Move images back to cpu for visualization
    images = images.cpu()
    reconstructed_imgs = reconstructed_imgs.cpu()
    
    # Display original and reconstructed images
    plt.figure(figsize=(15, 5))
    plt.subplot(1, 2, 1)
    plt.title('Original Images')
    imshow(vutils.make_grid(images, padding=2, normalize=True))
    
    plt.figure(figsize=(15, 5))
    plt.subplot(1, 2, 2)
    plt.title('Reconstructed Images')
    imshow(vutils.make_grid(reconstructed_imgs, padding=2, normalize=True))

### AutoEncoder Arch

In [4]:
# TO CHANGE
class Encoder(nn.Module):
    def __init__(self, num_input_channels: int, base_channel_size: int, latent_dim: int, act_fn: object = nn.GELU):
        """Encoder.

        Args:
           num_input_channels : Number of input channels of the image. For CIFAR, this parameter is 3
           base_channel_size : Number of channels we use in the first convolutional layers. Deeper layers might use a duplicate of it.
           latent_dim : Dimensionality of latent representation z
           act_fn : Activation function used throughout the encoder network
        """
        super().__init__()
        c_hid = base_channel_size
        self.net = nn.Sequential(
            nn.Conv2d(num_input_channels, c_hid, kernel_size=3, padding=1, stride=2),  # 32x32 => 16x16
            act_fn(),
            nn.Conv2d(c_hid, c_hid, kernel_size=3, padding=1),
            act_fn(),
            nn.Conv2d(c_hid, 2 * c_hid, kernel_size=3, padding=1, stride=2),  # 16x16 => 8x8
            act_fn(),
            nn.Conv2d(2 * c_hid, 2 * c_hid, kernel_size=3, padding=1),
            act_fn(),
            nn.Conv2d(2 * c_hid, 2 * c_hid, kernel_size=3, padding=1, stride=2),  # 8x8 => 4x4
            act_fn(),
            nn.Flatten(),  # Image grid to single feature vector
            nn.Linear(8192, latent_dim),
        )

    def forward(self, x):
        return self.net(x)

encoder = Encoder(num_input_channels=3, base_channel_size = 64, latent_dim = 2048)
encoder = encoder.to(device)

In [5]:
class Decoder(nn.Module):
    def __init__(self, num_input_channels: int, base_channel_size: int, latent_dim: int, act_fn: object = nn.GELU):
        """Decoder.

        Args:
           num_input_channels : Number of channels of the image to reconstruct. For CIFAR, this parameter is 3
           base_channel_size : Number of channels we use in the last convolutional layers. Early layers might use a duplicate of it.
           latent_dim : Dimensionality of latent representation z
           act_fn : Activation function used throughout the decoder network
        """
        super().__init__()
        c_hid = base_channel_size
        self.linear = nn.Sequential(nn.Linear(latent_dim, 8192), act_fn())
        self.net = nn.Sequential(
            nn.ConvTranspose2d(
                2 * c_hid, 2 * c_hid, kernel_size=3, output_padding=1, padding=1, stride=2
            ),  # 4x4 => 8x8
            act_fn(),
            nn.Conv2d(2 * c_hid, 2 * c_hid, kernel_size=3, padding=1),
            act_fn(),
            nn.ConvTranspose2d(2 * c_hid, c_hid, kernel_size=3, output_padding=1, padding=1, stride=2),  # 8x8 => 16x16
            act_fn(),
            nn.Conv2d(c_hid, c_hid, kernel_size=3, padding=1),
            act_fn(),
            nn.ConvTranspose2d(
                c_hid, num_input_channels, kernel_size=3, output_padding=1, padding=1, stride=2
            ),  # 16x16 => 32x32
            nn.Tanh(),  # The input images is scaled between -1 and 1, hence the output has to be bounded as well
        )

    def forward(self, x):
        x = self.linear(x)
        x = x.reshape(x.shape[0], 128, 8, 8)
        x = self.net(x)
        return x

decoder = Decoder(num_input_channels=3, base_channel_size = 64, latent_dim = 2048)
decoder = decoder.to(device)

In [6]:
class Autoencoder(nn.Module):
    def __init__(self):
        super(Autoencoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

# Example usage
autoencoder = Autoencoder()
autoencoder = autoencoder.to(device)

In [7]:
# autoencoder_chckpt = torch.load('deep_autoencoder_v1_40kloss.pth')
# print(autoencoder_chckpt.keys())

deep_encoder_checkpoint_epoch_21 fot on the EncoderAndLabels nb, loss on train of 0.0087

In [63]:
encoder.load_state_dict(torch.load(f'deep_encoder_checkpoint_epoch_21.pth')['encoder_state_dict'])
encoder = encoder.to(device)

In [9]:
# encoder.load_state_dict(autoencoder_chckpt['encoder_state_dict'])
# decoder.load_state_dict(autoencoder_chckpt['decoder_state_dict'])

### Data Setup

In [31]:
train_df = pd.read_csv('dataset/train.csv')
valid_df = pd.read_csv('dataset/val.csv')
test_df = pd.read_csv('dataset/test.csv')

print(f"Train: {train_df.shape}; Valid: {valid_df.shape}; Test: {test_df.shape}")

Train: (13000, 2); Valid: (2000, 2); Test: (5000, 1)


In [32]:
train_df.columns, train_df.head()

(Index(['Image', 'Class'], dtype='object'),
                                       Image  Class
 0  0be195e0-eb16-4f29-ac7c-196dec9da47d.png     79
 1  28045419-b3b2-415b-9085-b4d241944235.png     94
 2  b7078f35-d239-4dd6-babb-1af7be1b9364.png     79
 3  0f54f663-2953-432b-bdd4-9b9f7a78bfb9.png     23
 4  ba11dda2-37d7-4d28-8bbb-128d452a171c.png     88)

In [33]:
train_path = 'dataset/train_images/'
val_path = 'dataset/val_images/'
test_path = 'dataset/test_images/'

In [34]:
train_image_path = os.path.join(train_path,train_df['Image'][0])
print(train_image_path)

dataset/train_images/0be195e0-eb16-4f29-ac7c-196dec9da47d.png


In [35]:
image = Image.open(train_image_path)
np.array(image).shape

(64, 64, 3)

In [36]:
train_image_names = train_df['Image'].tolist()
train_image_labels = train_df['Class'].tolist()
# print(train_image_names[:5], train_image_labels[:5])

train_image_paths = [os.path.join(train_path, image_name)for image_name in train_image_names]
print(train_image_paths[:5], train_image_labels[:5])

['dataset/train_images/0be195e0-eb16-4f29-ac7c-196dec9da47d.png', 'dataset/train_images/28045419-b3b2-415b-9085-b4d241944235.png', 'dataset/train_images/b7078f35-d239-4dd6-babb-1af7be1b9364.png', 'dataset/train_images/0f54f663-2953-432b-bdd4-9b9f7a78bfb9.png', 'dataset/train_images/ba11dda2-37d7-4d28-8bbb-128d452a171c.png'] [79, 94, 79, 23, 88]


In [37]:
val_image_names = valid_df['Image'].tolist()
val_image_labels = valid_df['Class'].tolist()

val_image_paths = [os.path.join(val_path, image_name)for image_name in val_image_names]
print(val_image_paths[:5], val_image_labels[:5])

['dataset/val_images/e91a8fbc-d3ba-4b39-8c2f-04c14de78e5e.png', 'dataset/val_images/7c40819b-c3ce-4a91-9e98-c3df11b63623.png', 'dataset/val_images/d54269d7-fe86-4112-9c0f-99cc6ab8d9c0.png', 'dataset/val_images/cbf9ac9e-0859-4b54-ae65-347587b45deb.png', 'dataset/val_images/6aafce3f-9002-44e0-9a99-ffe9b49c9bac.png'] [32, 85, 41, 97, 62]


In [38]:
test_image_names = test_df['Image'].tolist()
test_image_paths = [os.path.join(test_path, image_name)for image_name in test_image_names]
print(test_image_paths[:5])

['dataset/test_images/046f61c4-b825-459a-8b2d-07503f5b94a5.png', 'dataset/test_images/67db001f-e287-4950-ac49-6683b493d1a4.png', 'dataset/test_images/9f1d36a1-f046-4c5d-9e8a-0a3758ff605c.png', 'dataset/test_images/5ffef91a-aaf9-4d0d-a219-83a9f5282361.png', 'dataset/test_images/c00af570-0000-4f8f-a3f2-c37a981bfdb1.png']


In [39]:
basic_transform = T.Compose([
    # T.Resize((64, 64)), # -> all are already 64 * 64
    T.ToTensor(),
    # T.Normalize([0.42835271, 0.40658227, 0.34071648], [0.2144312,  0.21884131, 0.20464434])
    T.Normalize((0.5,), (0.5,)),
])

In [40]:
# Function to process an image and get embedding
def get_embedding(image_path, encoder, flatten=True):
    image = Image.open(image_path)
    image = basic_transform(image).unsqueeze(0).to(device)  # Add batch dimension

    with torch.no_grad():
        embedding = encoder(image)
    
    if flatten:
        # Flatten the embedding
        embedding = torch.flatten(embedding, start_dim=0)
    
    return embedding

In [41]:
embedding = get_embedding(os.path.join(train_path, train_image_names[0]), encoder, flatten=False)
embedding.shape

torch.Size([1, 2048])

### Train, Valid, Test Data Loader

In [42]:
# Example Dataset class
class CustomDataset(Dataset):
    def __init__(self, image_paths, labels, transform):
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image = Image.open(self.image_paths[idx])
        image = self.transform(image)
        label = self.labels[idx]
        return image, label

In [43]:
# Example Dataset class
class TestDataset(Dataset):
    def __init__(self, image_paths, transform):
        self.image_paths = image_paths
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image = Image.open(self.image_paths[idx])
        image = self.transform(image)
        return image

In [44]:
train_dataset = CustomDataset(train_image_paths, train_image_labels, transform = basic_transform)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, pin_memory=True)

In [45]:
val_dataset = CustomDataset(val_image_paths, val_image_labels, transform = basic_transform)
val_data_loader = DataLoader(val_dataset, batch_size=128, shuffle=False)

In [46]:
test_dataset = TestDataset(test_image_paths, transform = basic_transform)
test_data_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

### Encoder with Classifier

In [94]:
# class EncoderWithClassifier(nn.Module):
#     def __init__(self, pretrained_encoder, num_classes, act_fn=nn.GELU):
#         super().__init__()
#         self.encoder = pretrained_encoder
#         # Freeze the encoder
#         for param in self.encoder.parameters():
#             param.requires_grad = False
#         latent_dim = self.encoder.net[-1].out_features
#         self.classifier = nn.Linear(latent_dim, num_classes)

#     def forward(self, x):
#         x = self.encoder.net(x)  # Use the pre-trained encoder
#         x = self.classifier(x)  # Classifier layer
#         return x

class ResNetBlock(nn.Module):
    def __init__(self, in_features, out_features):
        super(ResNetBlock, self).__init__()
        self.fc = nn.Linear(in_features, out_features)
        self.relu = nn.ReLU()  # Non-in-place version
        self.shortcut = nn.Sequential()
        if in_features != out_features:
            self.shortcut = nn.Sequential(
                nn.Linear(in_features, out_features),
                nn.BatchNorm1d(out_features)
            )

    def forward(self, x):
        identity = self.shortcut(x)
        out = self.relu(self.fc(x))  # Apply ReLU without in-place operation
        out += identity
        return out


class EncoderWithClassifier(nn.Module):
    def __init__(self, pretrained_encoder, num_classes, act_fn=nn.GELU):
        super().__init__()
        self.encoder = pretrained_encoder
        # Freeze the encoder
        # for param in self.encoder.parameters():
        #     param.requires_grad = False

        latent_dim = self.encoder.net[-1].out_features
        self.act_fn = act_fn()  # Instantiate the activation function

        # Intermediate layers
        self.fc1 = nn.Linear(latent_dim, 1024)
        self.fc2 = ResNetBlock(1024, 512)
        self.fc3 = ResNetBlock(512, 256)
        # Final classifier layer
        self.classifier = nn.Linear(256, num_classes)

    def forward(self, x):
        x = self.encoder.net(x)  # Use the pre-trained encoder
        x = self.act_fn(self.fc1(x))  # First linear layer with activation
        x = self.fc2(x)  # ResNet block to 512
        x = self.fc3(x)  # ResNet block to 256
        x = self.classifier(x)  # Final classifier layer
        return x



In [95]:
# Initialize the model
encoder_classifier = EncoderWithClassifier(pretrained_encoder=encoder, num_classes=100)
encoder_classifier = encoder_classifier.to(device)

### Train Encoder with Classifier

In [96]:
# Loss function and optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(encoder_classifier.classifier.parameters(), lr=0.001)

In [97]:
%%time
# Training loop
num_epochs = 50
for epoch in range(num_epochs):
    encoder_classifier.train()
    running_loss = 0.0

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        # Forward pass
        outputs = encoder_classifier(inputs)

        # Calculate loss
        loss = criterion(outputs, labels)

        # Backward pass and optimize
        
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    avg_loss = running_loss / len(train_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')

    if epoch % 4 == 0 and epoch != 0:
        # Save checkpoint
        checkpoint = {
            'epoch': epoch + 1,
            'model_state_dict': encoder_classifier.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': avg_loss,
        }
        torch.save(checkpoint, f'encoder_clf_checkpoint_epoch_{epoch+1}.pth')

print('Training complete.')


RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [128, 256]], which is output 0 of ReluBackward0, is at version 1; expected version 0 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).

In [46]:
encoder_classifier.load_state_dict(torch.load('encoder_clf_checkpoint_epoch_6.pth')['model_state_dict']) # This also ruins it

<All keys matched successfully>

In [51]:
torch.save(encoder.state_dict(), 'encoder_clf_20e_emb_2048_train.pth')

### Validation of Encoder preds

In [57]:
# Set the model to evaluation mode
encoder_classifier.eval()

# Store predictions and true labels
all_predictions = []
all_true_labels = []

# Disable gradient calculations
with torch.no_grad():
    for inputs, labels in val_data_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = encoder_classifier(inputs)

        # Get the predicted classes (logits to predicted class index)
        _, predicted_classes = torch.max(outputs, 1)

        # Store predictions and true labels for later analysis
        all_predictions.extend(predicted_classes.cpu().numpy())
        all_true_labels.extend(labels.cpu().numpy())

# Convert lists to numpy arrays for further analysis if needed
all_predictions = np.array(all_predictions)
all_true_labels = np.array(all_true_labels)

In [58]:
# Calculate evaluation metrics, e.g., accuracy
accuracy = np.mean(all_predictions == all_true_labels)
print(f'Validation Accuracy: {accuracy:.4f}')

Validation Accuracy: 0.3510


In [59]:
# Generate the confusion matrix
cm = confusion_matrix(all_true_labels, all_predictions)

# Summing the diagonal elements gives the total number of correct predictions
correct_predictions = np.trace(cm)
total_predictions = cm.sum()

print(f"Number of Correct Predictions: {correct_predictions}")
print(f"Number of Incorrect Predictions: {total_predictions - correct_predictions}")

Number of Correct Predictions: 702
Number of Incorrect Predictions: 1298


### Train on Validation

### Predict on Test

### Create submission