In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.decomposition import PCA

# pytorch
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader, Dataset
import copy
from tqdm import tqdm
import os
import pickle

# Select the compute device: CUDA when a GPU is visible, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


## Extract and save features

### D1 to D10

In [2]:
class FeatureExtractor(nn.Module):
    """ImageNet-pretrained ResNet-152 with its classification head removed.

    The module keeps every child of the torchvision ResNet-152 except the
    last one (the fully connected classifier) and flattens what remains into
    one feature vector per input image.

    Args:
        num_components: Accepted for backward compatibility only; it is not
            used anywhere in this class.
    """

    def __init__(self, num_components=128):
        super().__init__()
        # Use ResNet-152 as the backbone.  Newer torchvision releases replace
        # the deprecated ``pretrained=True`` flag with an explicit weights
        # enum; IMAGENET1K_V1 is the checkpoint ``pretrained=True`` maps to.
        weights_enum = getattr(models, "ResNet152_Weights", None)
        if weights_enum is not None:
            resnet = models.resnet152(weights=weights_enum.IMAGENET1K_V1)
        else:
            # Older torchvision without the multi-weight API.
            resnet = models.resnet152(pretrained=True)
        # Drop the final fully connected layer; keep everything before it.
        self.features = nn.Sequential(*list(resnet.children())[:-1])

    def forward(self, x):
        """Return one flattened feature row per image in the batch ``x``."""
        x = self.features(x)
        # Collapse all non-batch dimensions (2048 values per image for ResNet-152).
        return torch.flatten(x, 1)

def _extract_features(images, feature_extractor, transform, device, batch_size=64):
    """Run ``feature_extractor`` over ``images`` in mini-batches.

    Each image is passed through ``transform``, the results are stacked into
    a batch tensor on ``device``, and the extractor output is moved back to
    the CPU as a NumPy array.

    Returns:
        A single NumPy array with the per-batch outputs concatenated along axis 0.

    Raises:
        ValueError: If ``images`` is empty (there is nothing to concatenate).
    """
    batches = []
    with torch.no_grad():
        for start in tqdm(range(0, len(images), batch_size)):
            batch = torch.stack(
                [transform(img) for img in images[start:start + batch_size]]
            ).to(device)
            batches.append(feature_extractor(batch).cpu().numpy())
    if not batches:
        raise ValueError("cannot extract features from an empty image collection")
    return np.concatenate(batches, axis=0)


def extract_and_save_features(base_path, save_dir, num_datasets=10,
                              index_offset=0, batch_size=64):
    """
    Extract features from the train/eval datasets under ``base_path`` and save them.

    For every ``i`` in ``1..num_datasets`` this reads
    ``{base_path}/train_data/{i}_train_data.tar.pth`` and
    ``{base_path}/eval_data/{i}_eval_data.tar.pth``, runs the images through
    ``FeatureExtractor`` and pickles ``{'features': ..., 'labels': ...}`` to
    ``{save_dir}/{i + index_offset}_train_features.pkl`` and
    ``{save_dir}/{i + index_offset}_test_features.pkl``.

    Args:
        base_path: Directory containing the ``train_data`` and ``eval_data`` folders.
        save_dir: Output directory; created if it does not exist.
        num_datasets: Number of consecutively numbered datasets to process.
        index_offset: Value added to the on-disk dataset number to form the
            ``D<k>`` id used in log messages and output file names.
        batch_size: Number of images sent through the network at once.

    Notes:
        Train labels are optional (``None`` is stored when the file has no
        ``'targets'`` key); eval files must contain ``'targets'``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Initialize feature extractor in inference mode
    feature_extractor = FeatureExtractor().to(device)
    feature_extractor.eval()

    # ImageNet-style preprocessing.  Per the original notes the images are
    # expected as [N, 32, 32, 3] arrays; this is not validated here.
    transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                           std=[0.229, 0.224, 0.225])
    ])

    # Create save directory if it doesn't exist
    os.makedirs(save_dir, exist_ok=True)

    for i in range(1, num_datasets + 1):
        dataset_id = i + index_offset
        print(f"Processing dataset D{dataset_id}...")

        # The dataset files are pickled dicts rather than plain tensor state
        # dicts, so full unpickling is requested explicitly (the torch.load
        # default became weights_only=True in PyTorch 2.6).
        # SECURITY: this can execute arbitrary code; only load trusted files.
        data = torch.load(f"{base_path}/train_data/{i}_train_data.tar.pth",
                          weights_only=False)
        # Labels are optional for training splits.
        labels = data.get('targets', None)
        features = _extract_features(data['data'], feature_extractor,
                                     transform, device, batch_size)

        with open(f"{save_dir}/{dataset_id}_train_features.pkl", 'wb') as f:
            pickle.dump({'features': features, 'labels': labels}, f)

        # Process corresponding test dataset
        print(f"Processing test dataset D{dataset_id}_test...")
        test_data = torch.load(f"{base_path}/eval_data/{i}_eval_data.tar.pth",
                               weights_only=False)
        test_labels = test_data['targets']
        test_features = _extract_features(test_data['data'], feature_extractor,
                                          transform, device, batch_size)

        with open(f"{save_dir}/{dataset_id}_test_features.pkl", 'wb') as f:
            pickle.dump({'features': test_features, 'labels': test_labels}, f)

In [3]:
if __name__ == "__main__":
    # Phase 1: run the backbone over the part-one datasets and write the
    # resulting feature pickles into ``features_dir``.
    features_dir = "features"
    base_path = "../dataset/part_one_dataset"

    print("Phase 1: Extracting features...")
    extract_and_save_features(base_path=base_path, save_dir=features_dir)

Phase 1: Extracting features...


  data = torch.load(f"{base_path}/train_data/{i}_train_data.tar.pth")


Processing dataset D1...


100%|██████████| 40/40 [00:22<00:00,  1.77it/s]
  test_data = torch.load(f"{base_path}/eval_data/{i}_eval_data.tar.pth")


Processing test dataset D1_test...


100%|██████████| 40/40 [00:22<00:00,  1.75it/s]


Processing dataset D2...


100%|██████████| 40/40 [00:22<00:00,  1.75it/s]


Processing test dataset D2_test...


100%|██████████| 40/40 [00:22<00:00,  1.74it/s]


Processing dataset D3...


100%|██████████| 40/40 [00:23<00:00,  1.74it/s]


Processing test dataset D3_test...


100%|██████████| 40/40 [00:23<00:00,  1.73it/s]


Processing dataset D4...


100%|██████████| 40/40 [00:23<00:00,  1.73it/s]


Processing test dataset D4_test...


100%|██████████| 40/40 [00:23<00:00,  1.73it/s]


Processing dataset D5...


100%|██████████| 40/40 [00:23<00:00,  1.73it/s]


Processing test dataset D5_test...


100%|██████████| 40/40 [00:23<00:00,  1.73it/s]


Processing dataset D6...


100%|██████████| 40/40 [00:23<00:00,  1.73it/s]


Processing test dataset D6_test...


100%|██████████| 40/40 [00:23<00:00,  1.73it/s]


Processing dataset D7...


100%|██████████| 40/40 [00:23<00:00,  1.73it/s]


Processing test dataset D7_test...


100%|██████████| 40/40 [00:23<00:00,  1.73it/s]


Processing dataset D8...


100%|██████████| 40/40 [00:23<00:00,  1.73it/s]


Processing test dataset D8_test...


100%|██████████| 40/40 [00:23<00:00,  1.73it/s]


Processing dataset D9...


100%|██████████| 40/40 [00:23<00:00,  1.72it/s]


Processing test dataset D9_test...


100%|██████████| 40/40 [00:23<00:00,  1.73it/s]


Processing dataset D10...


100%|██████████| 40/40 [00:23<00:00,  1.73it/s]


Processing test dataset D10_test...


100%|██████████| 40/40 [00:23<00:00,  1.73it/s]


### D11 to D20

In [4]:
class FeatureExtractor(nn.Module):
    """ImageNet-pretrained ResNet-152 with its classification head removed.

    The module keeps every child of the torchvision ResNet-152 except the
    last one (the fully connected classifier) and flattens what remains into
    one feature vector per input image.

    Args:
        num_components: Accepted for backward compatibility only; it is not
            used anywhere in this class.
    """

    def __init__(self, num_components=128):
        super().__init__()
        # Use ResNet-152 as the backbone.  Newer torchvision releases replace
        # the deprecated ``pretrained=True`` flag with an explicit weights
        # enum; IMAGENET1K_V1 is the checkpoint ``pretrained=True`` maps to.
        weights_enum = getattr(models, "ResNet152_Weights", None)
        if weights_enum is not None:
            resnet = models.resnet152(weights=weights_enum.IMAGENET1K_V1)
        else:
            # Older torchvision without the multi-weight API.
            resnet = models.resnet152(pretrained=True)
        # Drop the final fully connected layer; keep everything before it.
        self.features = nn.Sequential(*list(resnet.children())[:-1])

    def forward(self, x):
        """Return one flattened feature row per image in the batch ``x``."""
        x = self.features(x)
        # Collapse all non-batch dimensions (2048 values per image for ResNet-152).
        return torch.flatten(x, 1)

def _extract_features(images, feature_extractor, transform, device, batch_size=64):
    """Run ``feature_extractor`` over ``images`` in mini-batches.

    Each image is passed through ``transform``, the results are stacked into
    a batch tensor on ``device``, and the extractor output is moved back to
    the CPU as a NumPy array.

    Returns:
        A single NumPy array with the per-batch outputs concatenated along axis 0.

    Raises:
        ValueError: If ``images`` is empty (there is nothing to concatenate).
    """
    batches = []
    with torch.no_grad():
        for start in tqdm(range(0, len(images), batch_size)):
            batch = torch.stack(
                [transform(img) for img in images[start:start + batch_size]]
            ).to(device)
            batches.append(feature_extractor(batch).cpu().numpy())
    if not batches:
        raise ValueError("cannot extract features from an empty image collection")
    return np.concatenate(batches, axis=0)


def extract_and_save_features(base_path, save_dir, num_datasets=10,
                              index_offset=10, batch_size=64):
    """
    Extract features from the train/eval datasets under ``base_path`` and save them.

    For every ``i`` in ``1..num_datasets`` this reads
    ``{base_path}/train_data/{i}_train_data.tar.pth`` and
    ``{base_path}/eval_data/{i}_eval_data.tar.pth``, runs the images through
    ``FeatureExtractor`` and pickles ``{'features': ..., 'labels': ...}`` to
    ``{save_dir}/{i + index_offset}_train_features.pkl`` and
    ``{save_dir}/{i + index_offset}_test_features.pkl``.

    Args:
        base_path: Directory containing the ``train_data`` and ``eval_data`` folders.
        save_dir: Output directory; created if it does not exist.
        num_datasets: Number of consecutively numbered datasets to process.
        index_offset: Value added to the on-disk dataset number to form the
            ``D<k>`` id used in log messages and output file names.  The
            default of 10 maps on-disk files 1..10 to D11..D20.
        batch_size: Number of images sent through the network at once.

    Notes:
        Train labels are optional (``None`` is stored when the file has no
        ``'targets'`` key); eval files must contain ``'targets'``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Initialize feature extractor in inference mode
    feature_extractor = FeatureExtractor().to(device)
    feature_extractor.eval()

    # ImageNet-style preprocessing.  Per the original notes the images are
    # expected as [N, 32, 32, 3] arrays; this is not validated here.
    transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                           std=[0.229, 0.224, 0.225])
    ])

    # Create save directory if it doesn't exist
    os.makedirs(save_dir, exist_ok=True)

    for i in range(1, num_datasets + 1):
        dataset_id = i + index_offset
        print(f"Processing dataset D{dataset_id}...")

        # The dataset files are pickled dicts rather than plain tensor state
        # dicts, so full unpickling is requested explicitly (the torch.load
        # default became weights_only=True in PyTorch 2.6).
        # SECURITY: this can execute arbitrary code; only load trusted files.
        data = torch.load(f"{base_path}/train_data/{i}_train_data.tar.pth",
                          weights_only=False)
        # Labels are optional for training splits.
        labels = data.get('targets', None)
        features = _extract_features(data['data'], feature_extractor,
                                     transform, device, batch_size)

        with open(f"{save_dir}/{dataset_id}_train_features.pkl", 'wb') as f:
            pickle.dump({'features': features, 'labels': labels}, f)

        # Process corresponding test dataset
        print(f"Processing test dataset D{dataset_id}_test...")
        test_data = torch.load(f"{base_path}/eval_data/{i}_eval_data.tar.pth",
                               weights_only=False)
        test_labels = test_data['targets']
        test_features = _extract_features(test_data['data'], feature_extractor,
                                          transform, device, batch_size)

        with open(f"{save_dir}/{dataset_id}_test_features.pkl", 'wb') as f:
            pickle.dump({'features': test_features, 'labels': test_labels}, f)

In [5]:
if __name__ == "__main__":
    # Phase 1 again, this time over the part-two datasets; the feature
    # pickles go into the same ``features_dir`` folder.
    features_dir = "features"
    base_path = "../dataset/part_two_dataset"

    print("Phase 1: Extracting features...")
    extract_and_save_features(base_path=base_path, save_dir=features_dir)

Phase 1: Extracting features...


  data = torch.load(f"{base_path}/train_data/{i}_train_data.tar.pth")


Processing dataset D11...


100%|██████████| 40/40 [00:23<00:00,  1.73it/s]
  test_data = torch.load(f"{base_path}/eval_data/{i}_eval_data.tar.pth")


Processing test dataset D11_test...


100%|██████████| 40/40 [00:23<00:00,  1.72it/s]


Processing dataset D12...


100%|██████████| 40/40 [00:23<00:00,  1.72it/s]


Processing test dataset D12_test...


100%|██████████| 40/40 [00:23<00:00,  1.72it/s]


Processing dataset D13...


100%|██████████| 40/40 [00:23<00:00,  1.73it/s]


Processing test dataset D13_test...


100%|██████████| 40/40 [00:23<00:00,  1.72it/s]


Processing dataset D14...


100%|██████████| 40/40 [00:23<00:00,  1.73it/s]


Processing test dataset D14_test...


100%|██████████| 40/40 [00:23<00:00,  1.72it/s]


Processing dataset D15...


100%|██████████| 40/40 [00:23<00:00,  1.72it/s]


Processing test dataset D15_test...


100%|██████████| 40/40 [00:23<00:00,  1.72it/s]


Processing dataset D16...


100%|██████████| 40/40 [00:23<00:00,  1.72it/s]


Processing test dataset D16_test...


100%|██████████| 40/40 [00:23<00:00,  1.72it/s]


Processing dataset D17...


100%|██████████| 40/40 [00:23<00:00,  1.72it/s]


Processing test dataset D17_test...


100%|██████████| 40/40 [00:23<00:00,  1.72it/s]


Processing dataset D18...


100%|██████████| 40/40 [00:23<00:00,  1.72it/s]


Processing test dataset D18_test...


100%|██████████| 40/40 [00:23<00:00,  1.73it/s]


Processing dataset D19...


100%|██████████| 40/40 [00:23<00:00,  1.72it/s]


Processing test dataset D19_test...


100%|██████████| 40/40 [00:23<00:00,  1.73it/s]


Processing dataset D20...


100%|██████████| 40/40 [00:23<00:00,  1.73it/s]


Processing test dataset D20_test...


100%|██████████| 40/40 [00:23<00:00,  1.73it/s]
