In [None]:
import torch
import torchvision
from PIL import Image
from torchvision import transforms as T
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
import pydicom
import matplotlib.pyplot as plt
import os
import numpy as np
import torch.nn as nn
import timm
import zipfile
import pandas as pd 
import torch.optim as optim
import io
import time

In [None]:
# Number of CUDA devices visible to this process; shown as the cell output.
num_gpus = torch.cuda.device_count() 
num_gpus

In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(device)

In [None]:
from torchvision.models import resnet50, ResNet50_Weights
from torchvision import models

# Load ResNet-50 initialised with the ImageNet-1k (V1) pretrained weights.
# The `weights=` argument used here is the current torchvision API; it is the
# older `pretrained=True` flag that is deprecated.
model = models.resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)


In [None]:
import torch.nn as nn

# Binary task: the label column used later is `pe_present_on_image` (0 / 1).
num_classes = 2

# Width of the feature vector that feeds the pretrained classification head.
num_ftrs = model.fc.in_features

# Replace the last layer with a new fully connected layer with the required number of output classes
model.fc = nn.Linear(num_ftrs, num_classes)

In [None]:
# Wrap the network in DataParallel and move it to the selected device.
# NOTE(review): this cell is not idempotent — running it a second time wraps
# the already-wrapped model again. Re-create `model` before re-executing.
model= nn.DataParallel(model)
model.to(device)

In [None]:
# Read train.csv straight out of the zipped training archive.
# Context managers guarantee that both the archive and the member stream are
# closed even when parsing raises; the previous manual close() was skipped on
# error and never closed the member stream at all.
with zipfile.ZipFile('/scratch/mmpate15/pe_classification/data/train/train.zip') as zip_file:
    with zip_file.open('train.csv') as csv_file:
        df = pd.read_csv(csv_file, index_col=False)

In [None]:
# Every entry of the extracted training directory, sorted so the list (and the
# seeded split derived from it) is stable between runs.
# NOTE(review): entries are later matched against `StudyInstanceUID`, so each
# one is presumably a study folder — confirm no stray files live here.
all_folder_names = sorted( os.listdir('/scratch/mmpate15/pe_classification/data/train/train'))

print(len(all_folder_names))

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split at the study (folder) level, so all slices from one study land on the
# same side of the split. Seeded for reproducibility.
# NOTE(review): 350 + 100 are absolute counts, so this needs at least 450
# folders; any remaining folders are left unused.
train_study_ids, test_study_ids = train_test_split(all_folder_names, train_size=350, test_size=100, shuffle=True, random_state=42)

# Print the number of folders in each set
print(f'Number of folders in training set: {len(train_study_ids)}')
print(f'Number of folders in testing set: {len(test_study_ids)}')

In [None]:
# Keep only the rows whose study belongs to the training split, renumbered
# from zero.
mask = df['StudyInstanceUID'].isin(train_study_ids)
train_filtered_df = df.loc[mask].reset_index(drop=True)

In [None]:
# Keep only the first four columns of the training rows.
# NOTE(review): later cells read StudyInstanceUID, SeriesInstanceUID,
# SOPInstanceUID and pe_present_on_image from this frame, so those are
# presumably the four leading CSV columns — confirm against train.csv.
X_df = train_filtered_df.iloc[:, :4]
X_df

In [None]:
# Per-class slice counts of the training rows before any balancing.
class_counts = X_df["pe_present_on_image"].value_counts()
print('inital class counts: \n', class_counts)

In [None]:
# Balance the training frame: keep every positive slice and cap the negatives
# at 1.65x the number of positives (or at however many negatives exist).
numb = int(class_counts[1] + (class_counts[1] * 0.65))
max_count_0 = min(numb, class_counts[0])  # cap for class 0
max_count_1 = class_counts[1]             # keep all of class 1

# NOTE(review): the negatives kept are the *first* `max_count_0` rows in CSV
# order, not a random sample.
train_df_filtered = pd.concat([
    X_df[X_df["pe_present_on_image"] == 0][:max_count_0],
    X_df[X_df["pe_present_on_image"] == 1][:max_count_1],
])

# Shuffle with a fixed seed so the resulting row order is identical after a
# kernel restart (the previous unseeded shuffle was not reproducible).
train_df_filtered = train_df_filtered.sample(frac=1, random_state=42).reset_index(drop=True)

train_df_filtered


In [None]:
## Class counts of the balanced training frame

class_counts = train_df_filtered["pe_present_on_image"].value_counts()
print(class_counts)

In [None]:
# Re-binds `class_counts` to the unbalanced training counts and prints them
# again, for comparison with the balanced counts.
class_counts = X_df["pe_present_on_image"].value_counts()
print('inital class counts: \n', class_counts)

In [None]:
# Keep only the rows whose study belongs to the testing split, renumbered
# from zero.
test_mask = df['StudyInstanceUID'].isin(test_study_ids)
test_filtered_df = df.loc[test_mask].reset_index(drop=True)

In [None]:
# First four columns of the test rows. Despite the `y_` prefix this is the
# test-split counterpart of `X_df` (identifiers plus label), not a label vector.
y_df = test_filtered_df.iloc[:, :4]
y_df

In [None]:
# Per-class slice counts of the test rows before any balancing; the next
# balancing cell reads `class_counts` from here.
class_counts = y_df["pe_present_on_image"].value_counts()
print('inital class counts: \n', class_counts)

In [None]:
# Reset the scratch variables used by the balancing step.
# NOTE(review): `class_count` (singular) is a new name — the variable used
# elsewhere is `class_counts`, which is left untouched here.
numb =0
class_count = 0
max_count_0 = 0
max_count_1 = 0

In [None]:
# Balance the test frame the same way as the training frame: keep every
# positive slice and cap the negatives at 1.65x the number of positives (or at
# however many negatives exist).
numb = int(class_counts[1] + (class_counts[1] * 0.65))
max_count_0 = min(numb, class_counts[0])  # cap for class 0
max_count_1 = class_counts[1]             # keep all of class 1

# NOTE(review): the negatives kept are the *first* `max_count_0` rows in CSV
# order, not a random sample.
test_df_filtered = pd.concat([
    y_df[y_df["pe_present_on_image"] == 0][:max_count_0],
    y_df[y_df["pe_present_on_image"] == 1][:max_count_1],
])

# Shuffle with a fixed seed so the resulting row order is identical after a
# kernel restart (the previous unseeded shuffle was not reproducible).
test_df_filtered = test_df_filtered.sample(frac=1, random_state=42).reset_index(drop=True)

test_df_filtered

In [None]:
# Class counts of the balanced test frame.
class_counts = test_df_filtered["pe_present_on_image"].value_counts()
print(class_counts)


In [None]:
# Directory holding the extracted DICOM tree; MyDataset joins
# <StudyInstanceUID>/<SeriesInstanceUID>/<SOPInstanceUID>.dcm onto it.
root_dir = '/scratch/mmpate15/pe_classification/data/train/train'

In [None]:
class MyDataset(Dataset):
    """Slice-level DICOM dataset driven by a DataFrame.

    Each row of ``df`` must provide ``StudyInstanceUID``, ``SeriesInstanceUID``,
    ``SOPInstanceUID`` and ``pe_present_on_image``. The image for a row is read
    from ``<root_dir>/<study>/<series>/<sop>.dcm``.

    Args:
        root_dir: Directory containing one sub-folder per study.
        df: DataFrame with the columns listed above.
        transform: Callable applied to the RGB ``PIL.Image``, or ``None``.
    """
    classes = [0, 1]

    def __init__(self, root_dir, df, transform):
        self.data = df
        self.transform = transform
        self.root_dir = root_dir

    def __len__(self):
        """Return the number of slices (rows) in the dataset."""
        return len(self.data)

    @staticmethod
    def _to_uint8(pixels):
        """Min-max scale an array of arbitrary range to ``uint8`` in 0..255.

        DICOM pixel data is integer-valued with a range far wider than 8 bits;
        the previous ``np.uint8(pixels * 255)`` wrapped those values modulo 256
        instead of scaling them. A constant image maps to all zeros.
        """
        pixels = np.asarray(pixels, dtype=np.float32)
        lo, hi = pixels.min(), pixels.max()
        if hi > lo:
            scaled = (pixels - lo) / (hi - lo) * 255.0
        else:
            scaled = np.zeros_like(pixels)
        return scaled.astype(np.uint8)

    def __getitem__(self, idx):
        """Return ``(image, label, path)`` for the slice at position ``idx``.

        ``image`` is the transformed RGB image, ``label`` a 0-dim integer
        tensor, and ``path`` the DICOM file path that was read.
        """
        # Look the row up by position once, so the path and the label always
        # come from the same row even when the frame's index is not 0..n-1.
        row = self.data.iloc[idx]
        img_path = os.path.join(self.root_dir, row['StudyInstanceUID'], row['SeriesInstanceUID'],
                                row['SOPInstanceUID'] + '.dcm').replace("\\", "/")

        # `dcmread` is the supported name; `read_file` is a deprecated alias.
        dcm = pydicom.dcmread(img_path).pixel_array
        # A 2-D uint8 array is interpreted as an 8-bit greyscale ('L') image.
        img = Image.fromarray(self._to_uint8(dcm))
        img = img.convert('RGB')

        if self.transform is not None:
            img = self.transform(img)

        label = torch.tensor(int(row['pe_present_on_image']))

        return img, label, img_path


In [None]:
# Preprocessing applied to every slice: resize, convert to a float tensor, and
# normalise with the ImageNet mean/std the pretrained backbone expects.
# NOTE(review): Resize(224) with a single int scales the *shorter* edge to 224
# and keeps the aspect ratio, so the output is 224x224 only for square inputs —
# confirm all slices are square, otherwise batches cannot be stacked.
transform = T.Compose([T.Resize(224),
                   T.ToTensor(),
                   T.Normalize(timm.data.IMAGENET_DEFAULT_MEAN, timm.data.IMAGENET_DEFAULT_STD )])

In [None]:
#### Build the datasets and their batch loaders

train_dataset = MyDataset(root_dir=root_dir, df=train_df_filtered, transform=transform)
test_dataset = MyDataset(root_dir=root_dir, df=test_df_filtered, transform=transform)

# Both loaders yield shuffled batches of 12 (image, label, path) triples.
train_loader = DataLoader(dataset=train_dataset, batch_size=12, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=12, shuffle=True)

In [None]:
import pandas as pd
import seaborn as sns

class_counts = train_df_filtered['pe_present_on_image'].value_counts()

# Bar chart of how many training slices fall into each class after balancing.
ax = sns.countplot(data=train_df_filtered, x='pe_present_on_image')
ax.set_title('Training Set Class Distribution')
ax.set_xlabel('Class')
ax.set_ylabel('Count')

# Render the figure
plt.show()

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

In [None]:
# Training hyper-parameters.
learning_rate = 0.001
num_epochs = 20

# Unweighted cross-entropy over the two classes. A class-weighted variant
# (weight=torch.tensor([1.0, 30.0]).to(device)) was tried earlier and is
# currently disabled.
criterion = nn.CrossEntropyLoss()

# Adam over all model parameters (backbone and the new head).
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
### Training loop
#
# Trains `model` for `num_epochs` epochs over `train_loader` and records, per
# epoch, the mean loss, accuracy and weighted F1 in `train_loss_list`,
# `train_acc_list` and `f1_list`. After the loop, `y_true` / `y_pred` and
# `conf_matrix` describe the LAST epoch only, and the weights are written to
# resnet50_model_large.pth.

f1_list = []
train_loss_list = []
train_acc_list = []
y_true = []
y_pred = []
START = time.time()
print(START)


for epoch in range(num_epochs):

    running_loss = 0.0
    running_corrects = 0.0
    num_samples = 0.0

    # Must be an ndarray: with a plain list, `+=` *extends* the list with the
    # rows of each batch matrix instead of adding element-wise.
    conf_matrix = np.zeros((2, 2), dtype=np.int64)

    # Start every epoch with empty buffers so the F1 below is a per-epoch
    # score rather than a running score over all epochs so far.
    y_true = []
    y_pred = []

    model.train()

    for i, data in enumerate(train_loader):

        inputs, labels, _ = data
        inputs, labels = inputs.to(device), labels.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Update statistics. `loss` is a batch mean, so weight it by the batch
        # size to make the epoch average exact for a smaller final batch.
        _, preds = torch.max(outputs, 1)
        running_loss += loss.item() * inputs.size(0)
        running_corrects += torch.sum(preds == labels)
        num_samples += inputs.size(0)

        batch_true = labels.cpu().numpy()
        batch_pred = preds.cpu().numpy()

        # Update the confusion matrix (rows: true class, columns: predicted)
        conf_matrix += confusion_matrix(batch_true, batch_pred, labels=[0, 1])

        # Collect predictions and true labels for the F1 score
        y_true.extend(batch_true)
        y_pred.extend(batch_pred)

    # Weighted F1 over this epoch's predictions
    f1 = f1_score(y_true, y_pred, average='weighted')

    epoch_loss = running_loss / len(train_dataset)
    # Stays a 0-dim tensor because `running_corrects` accumulates tensors.
    epoch_acc = running_corrects / len(train_dataset)
    train_loss_list.append(epoch_loss)
    train_acc_list.append(epoch_acc)
    f1_list.append(f1)

    print('\nTrain Set: Epoch [%d/%d], Loss: %.4f, Accuracy: %.4f, F1: %.4f' % (epoch+1, num_epochs, epoch_loss, epoch_acc, f1))

# NOTE(review): if `model` is wrapped in nn.DataParallel, the saved keys carry
# a "module." prefix; load into a wrapped model or strip the prefix.
torch.save(model.state_dict(), "resnet50_model_large.pth")
print('Finished Training & saved the model')
print("\nModel saved to resnet50_model_large.pth")

End = time.time()
print(End)
print("Elapsed seconds: %.1f" % (End - START))

print("\n Here is the training confusion matrix (last epoch): \n", conf_matrix)


In [None]:
from sklearn.metrics import roc_curve, auc


In [None]:
# Convert the labels and predictions collected by the training cell to arrays.
y_pred = np.array(y_pred)
y_true = np.array(y_true)

# Calculate the ROC curve and AUC score.
# NOTE(review): `y_pred` holds hard class predictions (argmax), not
# probabilities, so the curve has a single operating point; pass the class-1
# softmax score to roc_curve for a full curve.
fpr, tpr, threshold = roc_curve(y_true, y_pred[:])
roc_auc = auc(fpr, tpr)
print("ROC AUC score: ", roc_auc)

# Plot the ROC curve with the chance diagonal for reference
plt.plot(fpr, tpr, lw=1, alpha=1, label='ROC (AUC = %0.2f)' % (roc_auc))
plt.plot([0, 1], [0, 1], linestyle='--', lw=1, color='gray', alpha=.8)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

In [None]:
# `train_acc_list` holds one torch tensor per epoch (possibly on the GPU).
# Copy each to host memory and stack them into a single NumPy array.
train_acc_array = np.stack([acc.cpu().numpy() for acc in train_acc_list])

print(train_acc_array)

In [None]:
# Derive the x-axis from the recorded losses. The previous hard-coded
# range(1, 16) assumed 15 epochs and made plt.plot raise a length-mismatch
# error for any other epoch count.
epochs = range(1, len(train_loss_list) + 1)
plt.plot(epochs, train_loss_list, label='Train Loss')
plt.title('Training Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()


In [None]:
# Plot the accuracies recorded by the training loop in this session, rather
# than numbers pasted in from an earlier 15-epoch run. float() also accepts
# 0-dim tensors, including ones that live on the GPU.
accuracy_list = [float(acc) for acc in train_acc_list]
epochs = range(1, len(accuracy_list) + 1)

plt.plot(epochs, accuracy_list, label='Training Accuracy')
plt.title('Training Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# Plot the F1 scores recorded by the training loop in this session, rather
# than numbers pasted in from an earlier 15-epoch run. The x-axis follows the
# number of recorded epochs.
f1_scores = [float(score) for score in f1_list]
epochs = range(1, len(f1_scores) + 1)

plt.plot(epochs, f1_scores)
plt.xlabel('Epochs')
plt.ylabel('F1 Score')
plt.title('F1 Score vs Epochs')
plt.show()

In [None]:
# Evaluate the model once over the test loader.
#
# Produces: `test_loss_list` / `test_acc_list` (one entry each), `results`
# (one (image path, predicted class) pair per slice), `y_true_test` /
# `y_pred_test`, and the 2x2 `conf_matrix`.
test_loss_list = []
test_acc_list = []
results = []
y_true_test = []
y_pred_test = []

test_loss = 0.0
test_correct = 0.0
total = 0.0

# Must be an ndarray: with a plain list, `+=` *extends* the list with the rows
# of each batch matrix instead of adding element-wise.
conf_matrix = np.zeros((2, 2), dtype=np.int64)

model.eval()

with torch.no_grad():
    for batch_idx, (inputs, targets, img_names) in enumerate(test_loader):
        # Forward pass
        inputs, targets = inputs.to(device), targets.to(device)

        outputs = model(inputs)
        loss = criterion(outputs, targets)

        # `loss` is a batch mean; weight it by the batch size so the final
        # average is exact for a smaller last batch.
        test_loss += loss.item() * inputs.size(0)

        _, predicted = torch.max(outputs, 1)

        batch_true = targets.cpu().numpy()
        batch_pred = predicted.cpu().numpy()

        # Collect predictions and true labels for later metrics
        y_true_test.extend(batch_true)
        y_pred_test.extend(batch_pred)

        # Remember which class was predicted for which file
        results.extend(zip(img_names, batch_pred.tolist()))

        # Update the confusion matrix (rows: true class, columns: predicted)
        conf_matrix += confusion_matrix(batch_true, batch_pred, labels=[0, 1])

        total += targets.size(0)
        test_correct += (predicted == targets).sum().item()

test_loss /= len(test_loader.dataset)
test_acc = test_correct / len(test_dataset)
test_loss_list.append(test_loss)
test_acc_list.append(test_acc)

print(f'Test set: Average loss: {test_loss:.4f}, Accuracy: {test_acc:.4f}')

print("\nFinished Testing the model")

print("\n Here is the testing confusion matrix: \n", conf_matrix)

In [None]:
# Display the recorded test loss.
test_loss_list

In [None]:
y_pred_test = np.array(y_pred_test)
y_true_test = np.array(y_true_test)

# Calculate the ROC curve and AUC score for the TEST set. The previous version
# passed the training arrays (y_true / y_pred) here, so it re-plotted the
# training curve.
# NOTE(review): `y_pred_test` holds hard class predictions (argmax), not
# probabilities, so the curve has a single operating point; pass the class-1
# softmax score to roc_curve for a full curve.
fpr, tpr, threshold = roc_curve(y_true_test, y_pred_test)
roc_auc = auc(fpr, tpr)
print("ROC AUC score: ", roc_auc)

# Plot the ROC curve with the chance diagonal for reference
plt.plot(fpr, tpr, lw=1, alpha=1, label='ROC (AUC = %0.2f)' % (roc_auc))
plt.plot([0, 1], [0, 1], linestyle='--', lw=1, color='gray', alpha=.8)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()