In [9]:
import os
import zipfile

# Correct directory paths
base_dir = "C:/Users/yeshwanth/Downloads/project yy/dataverse_files"
images_dir = os.path.join(base_dir, "images")
segmentations_dir = os.path.join(base_dir, "segmentations")

# Make directories
os.makedirs(images_dir, exist_ok=True)
os.makedirs(segmentations_dir, exist_ok=True)

# Function to extract zip files
def extract_zip(zip_path, extract_to):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

# Extract image zip files
extract_zip("C:/Users/yeshwanth/Downloads/project yy/dataverse_files/HAM10000_images_part_1.zip", images_dir)
extract_zip("C:/Users/yeshwanth/Downloads/project yy/dataverse_files/HAM10000_images_part_2.zip", images_dir)

# Extract segmentation zip file
extract_zip("C:/Users/yeshwanth/Downloads/project yy/dataverse_files/HAM10000_segmentations_lesion_tschandl.zip", segmentations_dir)

# Verify extraction by listing sample files
print("Sample extracted images:", os.listdir(images_dir)[:5])
print("Sample extracted segmentations:", os.listdir(segmentations_dir)[:5])

Sample extracted images: ['ISIC_0024306.jpg', 'ISIC_0024307.jpg', 'ISIC_0024308.jpg', 'ISIC_0024309.jpg', 'ISIC_0024310.jpg']
Sample extracted segmentations: ['HAM10000_segmentations_lesion_tschandl', '__MACOSX']


In [19]:
# Clean Up __MACOSX Files if Present
import shutil

# Remove unnecessary __MACOSX directory if it exists
macosx_dir = 'C:/Users/yeshwanth/Downloads/project yy/dataverse_files/data/segmentations/__MACOSX'
if os.path.exists(macosx_dir):
    shutil.rmtree(macosx_dir)
    print("Removed __MACOSX directory from segmentations folder.")

# Confirm cleaned directories
print("Final list of files in images folder:", os.listdir(images_dir)[:5])
print("Final list of files in segmentations folder:", os.listdir(images_dir)[:5])

Final list of files in images folder: ['ISIC_0024306.jpg', 'ISIC_0024307.jpg', 'ISIC_0024308.jpg', 'ISIC_0024309.jpg', 'ISIC_0024310.jpg']
Final list of files in segmentations folder: ['ISIC_0024306.jpg', 'ISIC_0024307.jpg', 'ISIC_0024308.jpg', 'ISIC_0024309.jpg', 'ISIC_0024310.jpg']


In [20]:
# Import necessary libraries for image and mask visualization
import cv2
import matplotlib.pyplot as plt
import os
from glob import glob

# Define directories for images and segmentation masks
image_dir = os.path.join(base_dir, "images")
segmentation_dir = 'D:/Dataset/yashwant/data/segmentations/HAM10000_segmentations_lesion_tschandl'

# Select a few sample images and corresponding masks
sample_image_files = os.listdir(image_dir)[:5]
sample_images = [os.path.join(image_dir, img) for img in sample_image_files]

# For each image, find the corresponding segmentation mask (assuming they have the same file name)
plt.figure(figsize=(15, 10))
for i, image_path in enumerate(sample_images):
    # Load the image
    image = cv2.imread(image_path)
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert to RGB for display

    # Load the corresponding mask (assumes masks are named identically to images)
    mask_name = os.path.basename(image_path).replace('.jpg', '_segmentation.png')
    mask_path = os.path.join(segmentation_dir, mask_name)

    # Check if the mask file exists
    if os.path.exists(mask_path):
        mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)

        # Plot image and mask side by side
        plt.subplot(5, 2, i * 2 + 1)
        plt.imshow(image_rgb)
        plt.title("Image")
        plt.axis('off')

        plt.subplot(5, 2, i * 2 + 2)
        plt.imshow(mask, cmap='gray')
        plt.title("Segmentation Mask")
        plt.axis('off')
    else:
        print(f"Mask for {image_path} not found.")

plt.suptitle("Sample Images with Corresponding Segmentation Masks")
plt.show()

ModuleNotFoundError: No module named 'cv2'

In [15]:
import pandas as pd

# Load the metadata file
metadata = pd.read_csv('C:/Users/yeshwanth/Downloads/project yy/dataverse_files/HAM10000_metadata')

# Display the first few rows of the metadata
metadata_head = metadata.head()
metadata_head

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,vidir_modern
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,vidir_modern
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,vidir_modern
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,vidir_modern
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,vidir_modern


In [None]:
# Check the overall structure of the DataFrame
metadata.info()

In [None]:
# Display missing values
missing_values = metadata.isnull().sum()
missing_values

In [None]:
# Fill missing age values with the median
metadata['age'] = metadata['age'].fillna(metadata['age'].median())

# Confirm that missing values are handled
print("\nAfter Imputation:")
print(metadata.isnull().sum())

In [None]:
# Summary for numeric columns
print("\nSummary of Numeric Columns:")
print(metadata.describe(include=[float]))

In [None]:
# Summary for non-numeric columns
print("\nSummary of Non-Numeric Columns:")
print(metadata.describe(include=[object]))

In [None]:
# Check unique values in categorical columns
print("\nUnique Values in Categorical Columns:")
for col in metadata.select_dtypes(include=[object]).columns:
    print(f"{col}: {metadata[col].nunique()} unique values")

In [None]:
import seaborn as sns

# Lesion Type Distribution as a Pie Chart
plt.figure(figsize=(8, 8))
metadata['dx'].value_counts().plot(kind='pie', autopct='%1.1f%%', startangle=140)
plt.title("Proportion of Lesion Types")
plt.ylabel("")  # Remove the y-axis label for a cleaner look
plt.show()

In [None]:
# Gender Distribution as a Donut Chart
gender_counts = metadata['sex'].value_counts()
plt.figure(figsize=(6, 6))
plt.pie(gender_counts, labels=gender_counts.index, autopct='%1.1f%%', startangle=90, wedgeprops=dict(width=0.3))
plt.title("Gender Distribution of Patients")
plt.show()

In [None]:
# Localization Distribution of Lesions as a Horizontal Bar Chart with Percentages
localization_counts = metadata['localization'].value_counts(normalize=True) * 100
plt.figure(figsize=(10, 8))
localization_counts.sort_values().plot(kind='barh')
plt.title("Localization of Lesions by Percentage")
plt.xlabel("Percentage")
plt.ylabel("Body Part")
plt.show()

In [None]:
# Age Distribution across Lesion Types as a Violin Plot
plt.figure(figsize=(12, 6))
sns.violinplot(data=metadata, x='dx', y='age')
plt.title("Age Distribution by Lesion Type")
plt.xlabel("Lesion Type")
plt.ylabel("Age")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Diagnosis Type Proportion (Pie chart to reflect diagnostic methods used)
plt.figure(figsize=(8, 8))
metadata['dx_type'].value_counts().plot(kind='pie', autopct='%1.1f%%', startangle=90, colors=['#99ff99', '#ffcc99', '#ff6666'])
plt.title("Proportion of Diagnostic Methods")
plt.ylabel("")
plt.show()

In [None]:
# Replace age = 0 with the median age
median_age = metadata['age'].median()
metadata['age'] = metadata['age'].replace(0, median_age)

# The distribution of age
plt.figure(figsize=(8, 6))
sns.histplot(metadata['age'], bins=20, kde=True)
plt.title("Age Distribution After Handling Missing Values")
plt.xlabel("Age")
plt.ylabel("Frequency")
plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode categorical columns
categorical_cols = ['dx', 'dx_type', 'sex', 'localization']
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    metadata[col] = le.fit_transform(metadata[col])
    label_encoders[col] = le

# Check the first few rows to verify encoding
print(metadata.head())

In [None]:
from sklearn.model_selection import train_test_split

# Define features and labels
X = metadata[['dx', 'dx_type', 'age', 'sex', 'localization']]  # Metadata features
y = metadata['dx']  # Labels (encoded diagnosis)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Verify the split
print(f"Training set size: {X_train.shape[0]}, Testing set size: {X_test.shape[0]}")

In [None]:
# Check class distribution in training and testing sets
print("Training Set Class Distribution:")
print(y_train.value_counts(normalize=True))

print("\nTesting Set Class Distribution:")
print(y_test.value_counts(normalize=True))

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import os

# Ensure image paths are correctly formatted
metadata['image_id'] = metadata['image_id'].apply(lambda x: f"{x}.jpg" if not x.endswith('.jpg') else x)

# Define the image directory
image_dir = os.path.join(base_dir, "images")

# Ensure the dx column contains strings
metadata['dx'] = metadata['dx'].astype(str)

# Define the image generator
datagen = ImageDataGenerator(
    rescale=1./255,           # Normalize pixel values to [0, 1]
    rotation_range=20,        # Randomly rotate images
    width_shift_range=0.2,    # Randomly shift images horizontally
    height_shift_range=0.2,   # Randomly shift images vertically
    zoom_range=0.2,           # Randomly zoom images
    horizontal_flip=True,     # Randomly flip images horizontally
    validation_split=0.2      # Reserve 20% of data for validation
)

# Training generator
train_generator = datagen.flow_from_dataframe(
    dataframe=metadata,
    directory=image_dir,
    x_col='image_id',
    y_col='dx',
    target_size=(128, 128),    # Resize images to 128x128 pixels
    batch_size=32,             # Batch size
    class_mode='categorical',  # Multi-class classification
    subset='training',         # Use the training subset
    shuffle=True               # Shuffle data
)

# Validation generator
val_generator = datagen.flow_from_dataframe(
    dataframe=metadata,
    directory=image_dir,
    x_col='image_id',
    y_col='dx',
    target_size=(128, 128),
    batch_size=32,
    class_mode='categorical',
    subset='validation'
)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Dropout

# Define CNN model
model = Sequential([
    # First convolutional block
    Input(shape=(128, 128, 3)),
    Conv2D(32, (3, 3), activation='relu', input_shape=(128, 128, 3)),
    MaxPooling2D((2, 2)),

    # Second convolutional block
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),

    # Third convolutional block
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),

    # Flatten and fully connected layers
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),  # Dropout for regularization
    Dense(7, activation='softmax')  # 7 classes for skin lesion types
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Print model summary
model.summary()

In [None]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Compute class weights for imbalanced data
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(metadata['dx']),
    y=metadata['dx']
)
class_weights_dict = dict(enumerate(class_weights))

# Train the model
history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=10,                    # Number of epochs
    steps_per_epoch=len(train_generator),
    validation_steps=len(val_generator),
    class_weight=class_weights_dict  # Add class weights to balance training
)

In [12]:
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Evaluate model performance
test_loss, test_acc = model.evaluate(val_generator)
print(f"Test Accuracy: {test_acc:.2f}")

# Predict on the validation data
y_pred = model.predict(val_generator)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true = val_generator.classes

# Generate classification report
print("\nClassification Report:")
print(classification_report(y_true, y_pred_classes))

# Confusion matrix
conf_matrix = confusion_matrix(y_true, y_pred_classes)
plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=val_generator.class_indices.keys(),
            yticklabels=val_generator.class_indices.keys())
plt.title("Confusion Matrix")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.show()

NameError: name 'model' is not defined

In [None]:
# Plot training and validation accuracy
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# Plot training and validation loss
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Flatten, Dropout

# Load the pre-trained VGG16 model without the top layer
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(128, 128, 3))

# Freeze the base model layers
for layer in base_model.layers:
    layer.trainable = False

# Add custom layers on top
x = Flatten()(base_model.output)
x = Dense(128, activation='relu')(x)
x = Dropout(0.5)(x)
output = Dense(7, activation='softmax')(x)  # 7 classes

# Create the model
model_vgg16 = Model(inputs=base_model.input, outputs=output)

# Compile the model
model_vgg16.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Summary
model_vgg16.summary()

In [None]:
# Train the model
history_vgg16 = model_vgg16.fit(
    train_generator,
    validation_data=val_generator,
    epochs=10,
    steps_per_epoch=len(train_generator),
    validation_steps=len(val_generator),
    class_weight=class_weights_dict
)

In [None]:
from tensorflow.keras.applications import ResNet50

# Load the pre-trained ResNet50 model without the top layer
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(128, 128, 3))

# Freeze the base model layers
for layer in base_model.layers:
    layer.trainable = False

# Add custom layers on top
x = Flatten()(base_model.output)
x = Dense(128, activation='relu')(x)
x = Dropout(0.5)(x)
output = Dense(7, activation='softmax')(x)  # 7 classes

# Create the model
model_resnet50 = Model(inputs=base_model.input, outputs=output)

# Compile the model
model_resnet50.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Summary
model_resnet50.summary()

In [13]:
# Train the model
history_resnet50 = model_resnet50.fit(
    train_generator,
    validation_data=val_generator,
    epochs=10,
    steps_per_epoch=len(train_generator),
    validation_steps=len(val_generator),
    class_weight=class_weights_dict
)

NameError: name 'model_resnet50' is not defined