In [1]:
!pip install datasets


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [6]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import numpy as np
import io
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

class ParquetDataset(Dataset):
    def __init__(self, file_path):
        self.data = pd.read_parquet(file_path)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        image_data = self.data.iloc[idx]['image']
        label = self.data.iloc[idx]['label']

        if isinstance(image_data, dict):
            if "bytes" in image_data:
                image = Image.open(io.BytesIO(image_data["bytes"]))
            else:
                raise ValueError("NO 'bytes'")
        else:
            raise ValueError("Format Error")

        image_tensor = transform(image)

        return image_tensor, label

file_paths = [
    'CRC_VAL_HE_7K-00000-of-00003-cce1097526b69125.parquet'
]

datasets = [ParquetDataset(file_path) for file_path in file_paths]

full_dataset = torch.utils.data.ConcatDataset(datasets)

batch_size = 32
data_loader = DataLoader(full_dataset, batch_size=batch_size, shuffle=True)

for images, labels in data_loader:
    print(f'Image batch shape: {images.shape}')
    print(f'Label batch shape: {labels.shape}')
    break

Image batch shape: torch.Size([32, 3, 224, 224])
Label batch shape: torch.Size([32])


In [10]:
from torch.utils.data import DataLoader
import torch
from timm import create_model
def load_convnext_backbone():
    model = create_model("convnext_base", pretrained=True)
    model.reset_classifier(0)
    return model

device = "cuda" if torch.cuda.is_available() else "cpu"
backbone = load_convnext_backbone().to(device)
backbone.eval()

model.safetensors:   0%|          | 0.00/354M [00:00<?, ?B/s]

ConvNeXt(
  (stem): Sequential(
    (0): Conv2d(3, 128, kernel_size=(4, 4), stride=(4, 4))
    (1): LayerNorm2d((128,), eps=1e-06, elementwise_affine=True)
  )
  (stages): Sequential(
    (0): ConvNeXtStage(
      (downsample): Identity()
      (blocks): Sequential(
        (0): ConvNeXtBlock(
          (conv_dw): Conv2d(128, 128, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=128)
          (norm): LayerNorm((128,), eps=1e-06, elementwise_affine=True)
          (mlp): Mlp(
            (fc1): Linear(in_features=128, out_features=512, bias=True)
            (act): GELU()
            (drop1): Dropout(p=0.0, inplace=False)
            (norm): Identity()
            (fc2): Linear(in_features=512, out_features=128, bias=True)
            (drop2): Dropout(p=0.0, inplace=False)
          )
          (shortcut): Identity()
          (drop_path): Identity()
        )
        (1): ConvNeXtBlock(
          (conv_dw): Conv2d(128, 128, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), g

In [12]:

def extract_features(backbone, dataloader, device):
    features, labels = [], []
    with torch.no_grad():
        for batch in dataloader:
            inputs, batch_labels = batch

            inputs = inputs.to(device)
            batch_labels = batch_labels.to(device)

            outputs = backbone(inputs)
            features.extend(outputs.cpu().numpy())
            labels.extend(batch_labels.cpu().numpy())
    return features, labels

train_features, train_labels = extract_features(backbone, data_loader, device)

print(f"Extracted feature shape: {np.array(train_features).shape}")
print(f"Extracted labels shape: {np.array(train_labels).shape}")


Extracted feature shape: (2394, 1024)
Extracted labels shape: (2394,)


In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

In [43]:

train_features_np = np.array(train_features)
train_labels_np = np.array(train_labels)

input_dim = train_features_np.shape[1]
num_classes = np.max(train_labels_np) + 1
print(f"Number of classes (num_classes): {num_classes}")



Number of classes (num_classes): 8


In [40]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset


In [44]:

class FullyConnectedModel(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(FullyConnectedModel, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, num_classes),
            nn.Softmax(dim=1)
        )

    def forward(self, x):
        return self.fc(x)

train_features_np, val_features_np, train_labels_np, val_labels_np = train_test_split(train_features_np, train_labels_np, test_size=0.1, random_state=42)

train_features_tensor = torch.tensor(train_features_np, dtype=torch.float32)
train_labels_tensor = torch.tensor(train_labels_np, dtype=torch.long)
val_features_tensor = torch.tensor(val_features_np, dtype=torch.float32)
val_labels_tensor = torch.tensor(val_labels_np, dtype=torch.long)

train_dataset = TensorDataset(train_features_tensor, train_labels_tensor)
val_dataset = TensorDataset(val_features_tensor, val_labels_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

input_dim = train_features_np.shape[1]
# num_classes = len(np.unique(train_labels_np))+1
model = FullyConnectedModel(input_dim=input_dim, num_classes=num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 10
for epoch in range(epochs):
    model.train()
    running_loss = 0.0

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        outputs = model(inputs)

        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f"Epoch [{epoch+1}/{epochs}], Loss: {running_loss / len(train_loader):.4f}")

print("训练完成！")

model.eval()

all_preds = []
all_labels = []

with torch.no_grad():
    for inputs, labels in val_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        outputs = model(inputs)

        _, preds = torch.max(outputs, 1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

y_pred = np.array(all_preds)
y_true = np.array(all_labels)

accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, average='weighted')
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')
conf_matrix = confusion_matrix(y_true, y_pred)
class_report = classification_report(y_true, y_pred)

print(f'Accuracy: {accuracy:.4f}')
print(f'F1 Score: {f1:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)


Epoch [1/10], Loss: 1.3392
Epoch [2/10], Loss: 1.2762
Epoch [3/10], Loss: 1.2750
Epoch [4/10], Loss: 1.2747
Epoch [5/10], Loss: 1.2743
Epoch [6/10], Loss: 1.2742
Epoch [7/10], Loss: 1.2741
Epoch [8/10], Loss: 1.2741
Epoch [9/10], Loss: 1.2741
Epoch [10/10], Loss: 1.2741
训练完成！
Accuracy: 1.0000
F1 Score: 1.0000
Precision: 1.0000
Recall: 1.0000
Confusion Matrix:
[[133   0   0   0]
 [  0  32   0   0]
 [  0   0  31   0]
 [  0   0   0  44]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       133
           2       1.00      1.00      1.00        32
           3       1.00      1.00      1.00        31
           7       1.00      1.00      1.00        44

    accuracy                           1.00       240
   macro avg       1.00      1.00      1.00       240
weighted avg       1.00      1.00      1.00       240



In [51]:

class FullyConnectedModel(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(FullyConnectedModel, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, num_classes),
            nn.Softmax(dim=1)
        )

    def forward(self, x):
        return self.fc(x)

train_features_np, val_features_np, train_labels_np, val_labels_np = train_test_split(train_features_np, train_labels_np, test_size=0.1, random_state=42)

train_features_tensor = torch.tensor(train_features_np, dtype=torch.float32)
train_labels_tensor = torch.tensor(train_labels_np, dtype=torch.long)
val_features_tensor = torch.tensor(val_features_np, dtype=torch.float32)
val_labels_tensor = torch.tensor(val_labels_np, dtype=torch.long)

train_dataset = TensorDataset(train_features_tensor, train_labels_tensor)
val_dataset = TensorDataset(val_features_tensor, val_labels_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

input_dim = train_features_np.shape[1]
# num_classes = len(np.unique(train_labels_np))+1
model = FullyConnectedModel(input_dim=input_dim, num_classes=num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

model.eval()

all_preds = []
all_labels = []

with torch.no_grad():
    for inputs, labels in val_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        outputs = model(inputs)

        _, preds = torch.max(outputs, 1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

y_pred = np.array(all_preds)
y_true = np.array(all_labels)

accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, average='weighted')
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')
conf_matrix = confusion_matrix(y_true, y_pred)
class_report = classification_report(y_true, y_pred)

print(f'Accuracy: {accuracy:.4f}')
print(f'F1 Score: {f1:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)


Accuracy: 0.2420
F1 Score: 0.1091
Precision: 0.0704
Recall: 0.2420
Confusion Matrix:
[[ 0  0  1  2 17  2 69]
 [ 0  0  0  0  0  0 19]
 [ 0  0  0  0  0  0  8]
 [ 0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0]
 [ 0  0  0  1  0  0 38]]
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        91
           2       0.00      0.00      0.00        19
           3       0.00      0.00      0.00         8
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.28      0.97      0.44        39

    accuracy                           0.24       157
   macro avg       0.04      0.14      0.06       157
weighted avg       0.07      0.24      0.11       157



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
