In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/brain-tumor-csv-file/brain_tumor_metadata.csv


In [2]:
import pandas as pd
import os
import matplotlib.pyplot as plt

# Load the CSV file
csv_path = "/kaggle/input/brain-tumor-csv-file/brain_tumor_metadata.csv"  # Replace with your actual CSV file path
df = pd.read_csv(csv_path)

# 1. Display the first few rows of the dataset
print("🔹 First 5 rows of the dataset:")
print(df.head())

# 2. Check dataset structure and column data types
print("\n🔹 Dataset Info:")
print(df.info())

# 3. Check the number of rows and columns
print("\n🔹 Dataset Shape (rows, columns):", df.shape)

# 4. Check for missing values
print("\n🔹 Missing Values in Each Column:")
print(df.isnull().sum())

# 5. Check for duplicate records
duplicates = df.duplicated().sum()
print("\n🔹 Number of duplicate records:", duplicates)

# 6. Check class distribution (balance check)
if 'label' in df.columns:
    print("\n🔹 Class Distribution:")
    print(df['label'].value_counts())

    # Plot class distribution
    plt.figure(figsize=(8,5))
    df['label'].value_counts().plot(kind='bar', color=['blue', 'green', 'red', 'purple'])
    plt.title("Class Distribution")
    plt.xlabel("Tumor Type")
    plt.ylabel("Number of Images")
    plt.xticks(rotation=45)
    plt.show()
else:
    print("\n⚠️ 'label' column not found in dataset!")

# 7. Verify if image paths exist (Assuming 'image_path' column contains file paths)
if 'image_path' in df.columns:
    missing_images = df[~df['image_path'].apply(os.path.exists)]
    print("\n🔹 Missing Image Files:", len(missing_images))
else:
    print("\n⚠️ 'image_path' column not found in dataset!")


🔹 First 5 rows of the dataset:
  Image_ID                                          File_Path Class_Label  \
0   I00001  /kaggle/input/okendo-xai/BRAIN_TUMOR/training/...   pituitary   
1   I00002  /kaggle/input/okendo-xai/BRAIN_TUMOR/training/...   pituitary   
2   I00003  /kaggle/input/okendo-xai/BRAIN_TUMOR/training/...   pituitary   
3   I00004  /kaggle/input/okendo-xai/BRAIN_TUMOR/training/...   pituitary   
4   I00005  /kaggle/input/okendo-xai/BRAIN_TUMOR/training/...   pituitary   

   Intensity    Texture   Size  Volume Resolution Dataset_Split  
0  50.163914  44.132675  28175  262144    512x512         Train  
1  47.278263  39.585857  34065  262144    512x512         Train  
2  61.069168  49.685964  25068   65536    256x256         Train  
3  45.979027  47.338543  32607  262144    512x512         Train  
4  61.401184  42.360812  11505   65536    256x256         Train  

🔹 Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16375 entries, 0 to 16374
Data columns (tot

In [5]:
from scipy.stats import zscore
### 1. Detect Outliers in Numeric Columns using IQR
def detect_outliers_iqr(data, column):
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    outliers = data[(data[column] < lower_bound) | (data[column] > upper_bound)]
    return outliers

# Apply to numerical columns if any exist
numeric_columns = df.select_dtypes(include=np.number).columns
for col in numeric_columns:
    outliers = detect_outliers_iqr(df, col)
    if not outliers.empty:
        print(f"\n⚠️ Outliers detected in '{col}' (IQR Method): {len(outliers)} samples")

### 2. Detect Outliers using Z-score Analysis
def detect_outliers_zscore(data, column, threshold=3):
    z_scores = np.abs(zscore(data[column]))
    return data[z_scores > threshold]

for col in numeric_columns:
    outliers = detect_outliers_zscore(df, col)
    if not outliers.empty:
        print(f"\n⚠️ Outliers detected in '{col}' (Z-score Method): {len(outliers)} samples")

### 3. Check for Outliers in Image Properties
if 'image_path' in df.columns:
    image_sizes = []
    for path in df['image_path']:
        if os.path.exists(path):
            with Image.open(path) as img:
                image_sizes.append(img.size[0] * img.size[1])  # Image area (Width x Height)
        else:
            image_sizes.append(np.nan)

    df['image_size'] = image_sizes

    # Detect outliers in image sizes
    outliers = detect_outliers_iqr(df, 'image_size')
    if not outliers.empty:
        print(f"\n⚠️ Outliers detected in image sizes: {len(outliers)} images")

    # Plot image size distribution
    plt.figure(figsize=(8, 5))
    sns.boxplot(x=df['image_size'])
    plt.title("Image Size Distribution")
    plt.show()
else:
    print("\n⚠️ 'image_path' column not found in dataset!")

print("\n✅ Outlier detection complete!")



⚠️ Outliers detected in 'Intensity' (IQR Method): 854 samples

⚠️ Outliers detected in 'Texture' (IQR Method): 1536 samples

⚠️ Outliers detected in 'Size' (IQR Method): 608 samples

⚠️ Outliers detected in 'Volume' (IQR Method): 88 samples

⚠️ Outliers detected in 'Intensity' (Z-score Method): 236 samples

⚠️ Outliers detected in 'Texture' (Z-score Method): 292 samples

⚠️ Outliers detected in 'Size' (Z-score Method): 143 samples

⚠️ Outliers detected in 'Volume' (Z-score Method): 90 samples

⚠️ 'image_path' column not found in dataset!

✅ Outlier detection complete!


In [9]:
# Check the shape of the dataset
print("🔹 Shape of the Dataset:")
print(df.shape)


🔹 Shape of the Dataset:
(16375, 9)
