In [None]:
import torch

In [None]:
import pandas

In [None]:
# pip install timm

In [None]:
import torch
import torchvision
from PIL import Image
from torchvision import transforms as T
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
import pydicom
import matplotlib.pyplot as plt
import os
import numpy as np
import torch.nn as nn
import timm
import zipfile
import pandas as pd 
import torch.optim as optim
import io

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(device)

## Creating the Model

In [None]:
# print(timm.list_models("swin*", pretrained = True))

# Build a pretrained Swin-B backbone with a freshly initialised 2-class classifier.
# Passing num_classes lets timm construct the replacement classifier itself, so the
# model's own pooling/head logic stays intact regardless of how the installed timm
# version structures `model.head` (overwriting it with a bare nn.Linear only works
# when the head is a plain linear layer).
model = timm.create_model('swin_base_patch4_window7_224', pretrained=True, num_classes=2)

# Linear probing: freeze every weight, then re-enable gradients for the classifier only.
for param in model.parameters():
    param.requires_grad = False
for param in model.get_classifier().parameters():
    param.requires_grad = True

print(model.head)

# Assigning the result keeps the cell from dumping the full module repr as its output.
model = model.to(device)

In [None]:
# Show the requires_grad flag of the classifier weight and bias.
for param_name, tensor in model.named_parameters():
    if param_name in ('head.weight', 'head.bias'):
        print(param_name, tensor.requires_grad)

## Prepare the dataset

In [None]:

# Read train.csv straight out of the archive. The context managers close both
# the archive and the member handle even when parsing raises.
with zipfile.ZipFile('/scratch/mmpate15/pe_classification/data/train/train.zip') as zip_file:
    with zip_file.open('train.csv') as csv_file:
        df = pd.read_csv(csv_file, index_col=False)

In [None]:
# List the entries of the training directory in a stable (sorted) order;
# they are matched against StudyInstanceUID further down.
all_folder_names = os.listdir('/scratch/mmpate15/pe_classification/data/train/train')
all_folder_names.sort()

print(len(all_folder_names))

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Draw 100 folders for training and 100 for testing; the fixed seed makes the split repeatable.
train_study_ids, test_study_ids = train_test_split(
    all_folder_names,
    train_size=100,
    test_size=100,
    shuffle=True,
    random_state=42,
)

# Print the number of folders in each set
for split_name, study_ids in (('training', train_study_ids), ('testing', test_study_ids)):
    print(f'Number of folders in {split_name} set: {len(study_ids)}')

## Training DataFrame

In [None]:
# Keep the label rows whose StudyInstanceUID belongs to the training split,
# renumbering the index from zero.
mask = df['StudyInstanceUID'].isin(train_study_ids)
train_filtered_df = df.loc[mask].reset_index(drop=True)

In [None]:
# Keep only the first four columns of the training rows (selected by position).
# NOTE(review): presumably the three UID columns plus pe_present_on_image, which
# are the ones used later — confirm against the CSV header.
X_df = train_filtered_df.iloc[:, :4]
X_df

In [None]:
# Number of training rows per pe_present_on_image value, before any balancing.
class_counts = X_df["pe_present_on_image"].value_counts()
print('inital class counts: \n', class_counts)

In [None]:
# Undersample the negative class: keep every positive row and at most 1.2x as
# many negative rows (fewer when the data does not contain that many).
numb = int(class_counts[1] + (class_counts[1]*0.2))
max_count_0 = min(numb, class_counts[0])  # cap on the number of class-0 rows
max_count_1 = class_counts[1]

# NOTE(review): the negatives are taken in row order, so they come from the
# earliest rows only — consider drawing them with .sample(...) instead.
train_df_filtered = pd.concat([X_df[X_df["pe_present_on_image"]==0][:max_count_0], X_df[X_df["pe_present_on_image"]==1][:max_count_1]])
train_df_filtered = train_df_filtered.reset_index(drop=True)
# Seed the shuffle (same seed as the study split) so the row order is
# reproducible after a kernel restart.
train_df_filtered = train_df_filtered.sample(frac=1, random_state=42).reset_index(drop=True)

(train_df_filtered)


In [None]:
# Class counts of the balanced training frame (overwrites the earlier class_counts).

class_counts = train_df_filtered["pe_present_on_image"].value_counts()
print(class_counts)

## Testing DataFrame

In [None]:
# Keep the label rows whose StudyInstanceUID belongs to the testing split,
# renumbering the index from zero.
test_mask = df['StudyInstanceUID'].isin(test_study_ids)
test_filtered_df = df.loc[test_mask].reset_index(drop=True)

In [None]:
# Keep only the first four columns of the testing rows (selected by position).
# Despite the name, y_df holds the test-split rows, not a label vector.
y_df = test_filtered_df.iloc[:, :4]
y_df

In [None]:
# Number of testing rows per pe_present_on_image value, before any balancing
# (overwrites the class_counts computed for the training rows).
class_counts = y_df["pe_present_on_image"].value_counts()
print('inital class counts: \n', class_counts)

In [None]:
# Zero the helper variables before they are recomputed for the test split.
numb = class_count = max_count_0 = max_count_1 = 0

In [None]:
# Undersample the negative class of the test rows: keep every positive row and
# at most 1.2x as many negative rows (fewer when there are not that many).
numb = int(class_counts[1] + (class_counts[1]*0.2))
max_count_0 = min(numb, class_counts[0])  # cap on the number of class-0 rows
max_count_1 = class_counts[1]

test_df_filtered = pd.concat([y_df[y_df["pe_present_on_image"]==0][:max_count_0], y_df[y_df["pe_present_on_image"]==1][:max_count_1]])
# Seed the shuffle (same seed as the study split) so the row order is
# reproducible after a kernel restart; reset_index renumbers the rows from zero.
test_df_filtered = test_df_filtered.sample(frac=1, random_state=42).reset_index(drop=True)

(test_df_filtered)

In [None]:
# Class counts of the balanced testing frame.
class_counts = test_df_filtered["pe_present_on_image"].value_counts()
print(class_counts)

## Creating the custom Dataset class and the DataLoaders

In [None]:
# Base directory for image lookup; MyDataset appends
# <StudyInstanceUID>/<SeriesInstanceUID>/<SOPInstanceUID>.dcm to it.
root_dir = '/scratch/mmpate15/pe_classification/data/train/train'

In [None]:
# data = X_df

# img_path = os.path.join(root_dir, data.iloc[1]['StudyInstanceUID'], data.iloc[1]['SeriesInstanceUID'],
#                                 data.iloc[1]['SOPInstanceUID'] + '.dcm').replace("\\", "/")

In [None]:
class MyDataset(Dataset):
    """Dataset that yields one DICOM slice per row of a label table.

    Every row of ``df`` must provide ``StudyInstanceUID``, ``SeriesInstanceUID``,
    ``SOPInstanceUID`` and ``pe_present_on_image``. The image for a row is read
    from ``<root_dir>/<StudyInstanceUID>/<SeriesInstanceUID>/<SOPInstanceUID>.dcm``.
    """

    classes = [0, 1]

    def __init__(self, root_dir, df, transform):
        """Store the lookup table and image settings.

        Args:
            root_dir: Directory that contains the per-study folders.
            df: DataFrame with the columns listed in the class docstring.
            transform: Callable applied to the RGB PIL image, or ``None``.
        """
        self.data = df
        self.transform = transform
        self.root_dir = root_dir

    def __len__(self):
        """Return the number of rows (images) in the table."""
        return len(self.data)

    @staticmethod
    def _to_uint8(pixels):
        """Min-max scale a pixel array into the 0-255 range and return it as uint8.

        Raw DICOM pixel data is usually a signed/unsigned integer array with a
        range far wider than one byte; multiplying it by 255 and casting to
        uint8 wraps around instead of scaling. A constant image maps to zeros.
        """
        pixels = np.asarray(pixels, dtype=np.float32)
        lowest, highest = pixels.min(), pixels.max()
        if highest > lowest:
            scaled = (pixels - lowest) / (highest - lowest)
        else:
            scaled = np.zeros_like(pixels)
        return (scaled * 255).astype(np.uint8)

    def __getitem__(self, idx):
        """Load the image at position ``idx``.

        Returns:
            Tuple ``(img, label, img_path)``: the (transformed) RGB image, the
            ``pe_present_on_image`` value as a tensor, and the file path read.
        """
        # Read the row once, by position, so that the path and the label always
        # come from the same row even when the frame's index is not 0..n-1.
        row = self.data.iloc[idx]
        img_path = os.path.join(self.root_dir, row['StudyInstanceUID'], row['SeriesInstanceUID'],
                                row['SOPInstanceUID'] + '.dcm').replace("\\", "/")

        # dcmread is the current name of the reader; read_file is its deprecated alias.
        dcm = pydicom.dcmread(img_path).pixel_array
        # A 2-D uint8 array becomes an 8-bit greyscale image, replicated to 3 channels.
        img = Image.fromarray(self._to_uint8(dcm)).convert('RGB')

        if self.transform is not None:
            img = self.transform(img)

        label = torch.tensor(int(row['pe_present_on_image']))

        return img, label, img_path


In [None]:
# Resize to exactly 224x224: an integer size only fixes the shorter edge, which
# would leave non-square images at the wrong shape for a 224-input model.
# Then convert to a tensor and normalise with the ImageNet statistics from timm.
transform = T.Compose([T.Resize((224, 224)),
                   T.ToTensor(),
                   T.Normalize(timm.data.IMAGENET_DEFAULT_MEAN, timm.data.IMAGENET_DEFAULT_STD )])

In [None]:
#### Loading the DataSet

train_dataset = MyDataset(root_dir, train_df_filtered, transform)
test_dataset = MyDataset(root_dir, test_df_filtered, transform)


# Shuffle only the training batches; keeping the evaluation order fixed makes
# the per-image predictions reproducible without affecting the metrics.
train_loader = DataLoader(train_dataset, batch_size=12, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=12, shuffle=False)

## Visualizing the Data Set and class distribution

In [None]:
count = [0] * len(MyDataset.classes)  # one running total per class

for images, labels, _ in train_loader:
    # The tally runs in NumPy on the CPU labels, so the image batch is not
    # copied to the GPU here — that transfer would be wasted work.
    unique_labels, counts = np.unique(labels, return_counts=True)
    for label, occurrences in zip(unique_labels, counts):
        count[int(label)] += int(occurrences)

# Shape and labels of the last batch, as a sanity check of the loader output.
print(images.size())
print('original labels:', labels)

# Visualize the class distribution using a bar plot
fig, ax = plt.subplots()
ax.bar(MyDataset.classes, count)
ax.set_xlabel('Class label')
ax.set_ylabel('Number of instances')
ax.set_title('Class Distribution')
# Annotate each bar with its total.
for class_idx, total in enumerate(count):
    ax.text(class_idx, total, str(total), color='blue', ha='center')
plt.xticks([0,1])
plt.show()


In [None]:
### Test DataSet

In [None]:
count = [0] * len(MyDataset.classes)  # one running total per class

for images, labels, _ in test_loader:
    # The tally runs in NumPy on the CPU labels, so the image batch is not
    # copied to the GPU here — that transfer would be wasted work.
    unique_labels, counts = np.unique(labels, return_counts=True)
    for label, occurrences in zip(unique_labels, counts):
        count[int(label)] += int(occurrences)

# Shape and labels of the last batch, as a sanity check of the loader output.
print(images.size())
print('original labels:', labels)

# Visualize the class distribution using a bar plot
fig, ax = plt.subplots()
ax.bar(MyDataset.classes, count)
ax.set_xlabel('Class label')
ax.set_ylabel('Number of instances')
ax.set_title('Class Distribution')
# Annotate each bar with its total.
for class_idx, total in enumerate(count):
    ax.text(class_idx, total, str(total), color='blue', ha='center')
plt.xticks([0,1])
plt.show()

## Creating the training loop

In [None]:
learning_rate = 0.001
num_epochs = 30

# Optional class weighting (currently disabled):
# class_weights = torch.tensor([1.0, 30.0])
# class_weights = class_weights.to(device)
# criterion = nn.CrossEntropyLoss(weight=class_weights)

criterion = nn.CrossEntropyLoss()

# Hand the optimizer only the parameters that are actually trainable; frozen
# weights never receive gradients, so tracking them in Adam is pure overhead.
trainable_params = [param for param in model.parameters() if param.requires_grad]
optimizer = optim.Adam(trainable_params, lr=learning_rate)

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

In [None]:
### Creating the training loop


# Per-epoch history of the training metrics (plain Python floats).
f1_list = []
train_loss_list = []
train_acc_list = []

for epoch in range(num_epochs):

    running_loss = 0.0
    running_corrects = 0
    y_true = []
    y_pred = []
    num_samples = 0
    # Must be an ndarray: `list += ndarray` would extend the list with the
    # rows of every batch matrix instead of adding them element-wise.
    conf_matrix = np.zeros((2, 2), dtype=np.int64)

    model.train()

    for i, data in enumerate(train_loader):

        inputs, labels, _ = data
        inputs, labels = inputs.to(device), labels.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Update statistics; .item() keeps the running totals as Python
        # numbers so the history lists do not hold device tensors.
        _, preds = torch.max(outputs, 1)
        batch_size = inputs.size(0)
        running_loss += loss.item() * batch_size
        running_corrects += torch.sum(preds == labels).item()
        num_samples += batch_size

        # Move the batch results to the CPU once and reuse them below.
        labels_np = labels.cpu().numpy()
        preds_np = preds.cpu().numpy()

        # Update the confusion matrix (rows: true class, columns: predicted class)
        conf_matrix += confusion_matrix(labels_np, preds_np, labels=[0, 1])

        # Collect predictions and true labels for f1 score calculation
        y_true.extend(labels_np)
        y_pred.extend(preds_np)

    # Calculate F1 score
    f1 = f1_score(y_true, y_pred, average='weighted')

    # Average over the samples actually seen in this epoch.
    epoch_loss = running_loss / num_samples
    epoch_acc = running_corrects / num_samples
    train_loss_list.append(epoch_loss)
    train_acc_list.append(epoch_acc)
    f1_list.append(f1)

    print('\nTrain Set: Epoch [%d/%d], Loss: %.4f, Accuracy: %.4f, F1: %.4f' % (epoch+1, num_epochs, epoch_loss, epoch_acc, f1))

print('Finished Training & saved the model')

# The matrix is reset every epoch, so this is the training matrix of the last epoch.
print("\n Here is the training confusion matrix: \n", conf_matrix)


In [None]:
# Save the model weights (state_dict only) after training.
# NOTE(review): this path is relative to the working directory, while the final
# cell loads from an absolute /scratch path — confirm both refer to the same file.
torch.save(model.state_dict(), 'swin_transformer_classification.pth')

In [None]:
import matplotlib.pyplot as plt

# Example data
# The series use plural names on purpose: a list called `f1_score` would shadow
# the sklearn function of the same name, and re-running the training cell
# afterwards would fail with "'list' object is not callable".
epochs = list(range(1, 11))
losses = [0.5457, 0.3868, 0.3157, 0.2729, 0.2442, 0.2178, 0.1992, 0.1808, 0.1732, 0.1651]
accuracies = [0.7500, 0.8801, 0.9107, 0.9318, 0.9291, 0.9463, 0.9487, 0.9553, 0.9522, 0.9604]
f1_scores = [0.7489, 0.8802, 0.9107, 0.9319, 0.9291, 0.9463, 0.9487, 0.9553, 0.9522, 0.9604]

# (title, y-axis label, values, y-limits or None for automatic limits)
curves = [
    ('Training Loss', 'Loss', losses, None),
    ('Training Accuracy', 'Accuracy', accuracies, [0, 1.0]),
    ('Training F1 Score', 'F1 Score', f1_scores, [0, 1.0]),
]

# One figure per metric, all drawn with the same styling.
for title, ylabel, values, ylim in curves:
    plt.figure(figsize=(8, 6))
    plt.plot(epochs, values, '-o', linewidth=2)
    plt.title(title, fontsize=14)
    plt.xlabel('Epoch', fontsize=12)
    plt.ylabel(ylabel, fontsize=12)
    if ylim is not None:
        plt.ylim(ylim)
    plt.grid(False)
    plt.show()

In [None]:
### Testing the model

test_loss_list = []
test_acc_list = []
results = []  # (image path, predicted class) for every test image

test_loss = 0.0
test_correct = 0
total = 0
# Must be an ndarray: `list += ndarray` would extend the list with the rows of
# every batch matrix instead of adding them element-wise.
conf_matrix = np.zeros((2, 2), dtype=np.int64)

model.eval()

with torch.no_grad():
    for batch_idx, (inputs, targets, img_names) in enumerate(test_loader):
        # Forward pass
        inputs, targets = inputs.to(device), targets.to(device)

        outputs = model(inputs)
        loss = criterion(outputs, targets)

        # Update loss (the criterion returns a batch mean, so weight it by batch size)
        test_loss += loss.item() * inputs.size(0)

        # Predicted class = index of the largest logit
        predicted = outputs.argmax(dim=1)

        for img_name, prediction in zip(img_names, predicted.tolist()):
            print("Image: {}, Prediction: {},".format(img_name, prediction))

            results.append((img_name, prediction))

        # Update the confusion matrix (rows: true class, columns: predicted class)
        conf_matrix += confusion_matrix(targets.cpu().numpy(), predicted.cpu().numpy(), labels=[0, 1])

        total += targets.size(0)
        test_correct += (predicted == targets).sum().item()

# Average over the samples actually evaluated.
test_loss /= total
test_acc = test_correct / total
test_loss_list.append(test_loss)
test_acc_list.append(test_acc)

print(f'Test set: Average loss: {test_loss:.4f}, Accuracy: {test_acc:.4f}')

print("\nFinished Testing the model")

print("\n Here is the testing confusion matrix: \n", conf_matrix)


In [None]:
# Restore the saved weights. map_location remaps the stored tensors onto the
# active device, so a checkpoint written on a GPU also loads on a CPU-only machine.
model.load_state_dict(
    torch.load('/scratch/mmpate15/pe_classification/swin_transformer_classification.pth',
               map_location=device)
)