In [119]:
import os
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from torchvision import models
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as transforms
import torch.nn.functional as F

# Define the dataset class
class MRNetDataset(Dataset):
    def __init__(self, data_dir, csv_files):
        self.data_dir = data_dir
        # Load the CSV files and combine them
        self.labels = []
        for file in csv_files:
            temp = pd.read_csv(file, sep=",", header=None, names=["file_id", "label"])
            self.labels.append(temp)
        self.labels = pd.concat(self.labels, axis=0)
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        # Extract file_id and label
        file_id = self.labels.iloc[idx]["file_id"]
        label = self.labels.iloc[idx]["label"]
        
        # Clean and parse file_id
        try:
            # Remove unexpected characters (e.g., commas) and convert to integer
            file_id = int(str(file_id).split(',')[0].strip())
        except ValueError:
            raise ValueError(f"File ID '{file_id}' cannot be converted to an integer. Check your labels CSV file.")
        
        # Build file path and load .npy file
        file_path = os.path.join(self.data_dir, f"{file_id:04d}.npy")
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")
        
        data = np.load(file_path)
        
        # Normalize data and add channel dimension
        data = (data - np.min(data)) / (np.max(data) - np.min(data))  # Normalize
        data = np.expand_dims(data, axis=0)  # Add channel dimension (C, H, W)
        
        return torch.tensor(data, dtype=torch.float32), torch.tensor(label, dtype=torch.float32)


# Feature extraction function
def extract_features(data_loader, model):
    features, labels = [], []
    model.eval()
    with torch.no_grad():
        for images, targets in data_loader:
            images = images.to(device)
            outputs = model(images)
            features.append(outputs.cpu().numpy())
            labels.append(targets.numpy())
    return np.vstack(features), np.hstack(labels)

# Paths
train_axial_dir = r"C:\Users\ajinf\Documents\DS 5220\Projects\SML-Project\MRNet-v1.0\train\axial"
valid_axial_dir = r"C:\Users\ajinf\Documents\DS 5220\Projects\SML-Project\MRNet-v1.0\valid\axial"
train_labels_csv_files = [
    r"C:\Users\ajinf\Documents\DS 5220\Projects\SML-Project\MRNet-v1.0\train-abnormal.csv",
    r"C:\Users\ajinf\Documents\DS 5220\Projects\SML-Project\MRNet-v1.0\train-acl.csv",
    r"C:\Users\ajinf\Documents\DS 5220\Projects\SML-Project\MRNet-v1.0\train-meniscus.csv"
]
valid_labels_csv_files = [
    r"C:\Users\ajinf\Documents\DS 5220\Projects\SML-Project\MRNet-v1.0\valid-abnormal.csv",
    r"C:\Users\ajinf\Documents\DS 5220\Projects\SML-Project\MRNet-v1.0\valid-acl.csv",
    r"C:\Users\ajinf\Documents\DS 5220\Projects\SML-Project\MRNet-v1.0\valid-meniscus.csv"
]

# Dataset and DataLoader
train_dataset = MRNetDataset(train_axial_dir, train_labels_csv_files)
valid_dataset = MRNetDataset(valid_axial_dir, valid_labels_csv_files)

def resize_images(images, target_height):
    batch_size, channels, height, width = images.size()
    
    # Permute the dimensions to have the height and channel dimensions in the second and third place
    images = images.permute(0, 2, 1, 3)  # (batch_size, height, channels, width)
    
    # Resize the second dimension (height) to the target value
    resized_images = F.interpolate(images, size=(target_height, width), mode='bilinear', align_corners=False)
    
    # Permute back to the original order (batch_size, channels, target_height, width)
    resized_images = resized_images.permute(0, 2, 1, 3)
    return resized_images
    
def custom_collate_fn(batch):
    # Assuming batch is a list of (image, label) tuples
    images, labels = zip(*batch)
    images_all = []
    for img in images:
        resized_images = resize_images(img, 20)
        images_all.append(resized_images)
        
    images = [transforms.Resize((256, 256))(img) for img in images_all]  # Resize all images
    images = torch.stack(images, 0)
    images = images[:, :, :, :, 0]  # Keep the first element along the last dimension
    labels = torch.tensor(labels)  # Adjust as needed for your labels
    return images, labels

train_loader = DataLoader(train_dataset, batch_size=32, collate_fn=custom_collate_fn, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=8, collate_fn=custom_collate_fn, shuffle=False)

# Load pre-trained CNN (ResNet18)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
cnn_model = models.resnet18(pretrained=True)
cnn_model.conv1 = torch.nn.Conv2d(1, cnn_model.conv1.out_channels, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
cnn_model.fc = nn.Identity()  # Remove the final classification layer
cnn_model = cnn_model.to(device)

# Extract features
print("Extracting features for training data...")
train_features, train_labels = extract_features(train_loader, cnn_model)

print("Extracting features for validation data...")
valid_features, valid_labels = extract_features(valid_loader, cnn_model)

# Train Logistic Regression
print("Training Logistic Regression model...")
log_reg = LogisticRegression()
log_reg.fit(train_features, train_labels)

# Evaluate
valid_predictions = log_reg.predict(valid_features)

print(valid_labels, valid_predictions)
print("Validation Accuracy:", accuracy_score(valid_labels, valid_predictions))
print(classification_report(valid_labels, valid_predictions))




Extracting features for training data...
Extracting features for validation data...
Training Logistic Regression model...
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1.
 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 0. 0. 0. 1. 1.
 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 1. 0.
 0

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
