In [1]:
import pandas as pd 
import numpy as np 
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import os


In [None]:
# Load the Excel file
file_path = "20 obs - 13.04.25/iridology-my1.xlsx"  # Adjust the path if needed
excel_data = pd.ExcelFile(file_path)

# Show sheet names
print("Sheet Names:", excel_data.sheet_names)

# Load data from the first sheet
df = excel_data.parse('Sheet1')

# Display first few rows
df.head()


Sheet Names: ['Sheet1']


Unnamed: 0,Image ID,Side,Main Category,Sub-Category,Analysis(*),(*) Overall conclusion and recommendation
0,1,Right,Cardiovascular system\nNervous system\nEndocri...,Arteriosclerosis\nAnxiety\nPancreas weakness,Presence of a stress ring indicates anxiety. P...,Support nerves with magnesium and calming herb...
1,2,Left,Nervous system\nCirculatory system,Emotional stress Anxiety\nVein and lymph cong...,The presence of radial furrows and a stress ri...,"Reduce stress through mindfulness, exercise, a..."
2,3,Right,Endocrine system\nNervous system\nRespiratory ...,Pancreas weakness\nInsulin resistance\nNeurolo...,"The body displays hormonal imbalances, particu...","Balance blood sugar with diet, herbs (cinnamon..."
3,4,Right,Endocrine System\nCardihjovascular system\nDig...,Adrenal exhustion\nBlood Circulation problem ...,A cholesterol ring suggests long-term cardiova...,"Take omega-3s for heart health, vitamin C and ..."
4,5,Right,Circulatory System\nEndocrine System\nNervous ...,Heart\nOvaries / Hormonal Balance Vitality\n,"Signs of stress on the heart, possibly from em...",Calm the nervous system with magnesium and dee...


In [3]:
# Preview key columns
df[['Main Category', 'Sub-Category']].head()


Unnamed: 0,Main Category,Sub-Category
0,Cardiovascular system\nNervous system\nEndocri...,Arteriosclerosis\nAnxiety\nPancreas weakness
1,Nervous system\nCirculatory system,Emotional stress Anxiety\nVein and lymph cong...
2,Endocrine system\nNervous system\nRespiratory ...,Pancreas weakness\nInsulin resistance\nNeurolo...
3,Endocrine System\nCardihjovascular system\nDig...,Adrenal exhustion\nBlood Circulation problem ...
4,Circulatory System\nEndocrine System\nNervous ...,Heart\nOvaries / Hormonal Balance Vitality\n


In [4]:
import pandas as pd
import re

# ✅ Combined typo + semantic fix mapping
fix_dict = {
    # MAIN FIXES
    'cardihjovascular system': 'cardiovascular system',
    'circulatory': 'circulatory system',
    'endocrine': 'endocrine system',
    'nervous': 'nervous system',
    'system': None,
    'vitality': None,  # remove from main if misplaced

    # SUB FIXES
    'adrenal exhustion': 'adrenal exhaustion',
    'emotional stress anxiety': ['emotional stress', 'anxiety'],
    'pancreas thyroid': ['pancreas', 'thyroid'],
    'hormonal balance vitality': ['hormonal balance', 'vitality'],
    'stress response rectum': ['stress response', 'rectum'],
    'thyroid glands     lower lungs': ['thyroid', 'lower lungs'],
    'thyroid                                      liver': ['thyroid', 'liver'],
    'stomach zone   spleen': ['stomach zone', 'spleen'],
    'haemoglobin (low iron) lymphatic rosary': ['hemoglobin', 'lymphatic rosary'],
    'lymph circulation     lungs': ['lymph circulation', 'lungs'],

    # Semantic merges
    'thyroid glands': 'thyroid',
    'legs': 'leg',
    'knee (left side)': 'knee',
    'stress rings': 'stress ring',
    'blood circulation problem': 'blood circulation',
    'circulatory': 'circulatory system',
}


# ✅ Final cleaner function
def final_clean(text):
    if pd.isna(text):
        return []

    # Initial split on common delimiters
    items = re.split(r'[\n/,]+', text)
    cleaned = []

    for item in items:
        item = item.strip().lower()

        # Apply fix map
        if item in fix_dict:
            replacement = fix_dict[item]
            if isinstance(replacement, list):
                cleaned.extend(replacement)
            elif isinstance(replacement, str):
                cleaned.append(replacement)
            # if None, skip it
        else:
            # Further clean if item has '&' or large spaces
            if '&' in item:
                cleaned.extend([x.strip() for x in item.split('&')])
            elif re.search(r'\s{2,}', item):
                cleaned.extend([x.strip() for x in re.split(r'\s{2,}', item)])
            else:
                cleaned.append(item)

    # Final cleanup: remove blanks, deduplicate
    return list(set([x for x in cleaned if x]))


In [5]:
df['Cleaned_Main'] = df['Main Category'].apply(final_clean)
df['Cleaned_Sub'] = df['Sub-Category'].apply(final_clean)



In [6]:
main_tokens = sorted(set([item for sublist in df['Cleaned_Main'] for item in sublist]))
sub_tokens = sorted(set([item for sublist in df['Cleaned_Sub'] for item in sublist]))

print("MAIN CATEGORIES:")
for i, m in enumerate(main_tokens, 1):
    print(f"{i}. {m}")

print("\nSUB-CATEGORIES:")
for i, s in enumerate(sub_tokens, 1):
    print(f"{i}. {s}")



MAIN CATEGORIES:
1. cardiovascular system
2. circulatory
3. circulatory system
4. digestive system
5. elimination
6. endocrine
7. endocrine system
8. immune
9. immune system
10. lymphatic
11. lymphatic system
12. musculoskeletal system
13. nervous
14. nervous system
15. reproductive system
16. respiratory
17. respiratory system
18. system
19. urinary system
20. vitality zone

SUB-CATEGORIES:
1. adrenal
2. adrenal exhaustion
3. anxiety
4. arcus senilis
5. arteriosclerosis
6. blood circulation
7. blood circulation problem
8. breast area
9. chest
10. colon
11. emotional stress
12. eye area
13. gallbladder
14. head zone
15. heart
16. hemoglobin
17. hormonal balance
18. hormonal imbalances
19. insulin resistance
20. iris shape
21. knee
22. leg
23. legs
24. liver
25. lower lungs
26. lungs
27. lymph circulation
28. lymphatic rosary
29. medulla–cerebellum region
30. metabolism ring
31. mucous congestion problems
32. neck
33. neurological distress
34. ovaries
35. pancreas
36. pancreas weakness


In [7]:
# Second pass to clean already-cleaned category lists
post_clean_fix_map = {
    'circulatory': 'circulatory system',
    'endocrine': 'endocrine system',
    'nervous': 'nervous system',
    'immune': 'immune system',
    'lymphatic': 'lymphatic system',
    'respiratory': 'respiratory system',
    'system': None,
    'vitality zone': 'vitality',

    # sub-category fix
    'blood circulation problem': 'blood circulation',
    'thyroid glands': 'thyroid',
    'legs': 'leg',
}

def apply_post_clean_fix(category_list):
    fixed = []
    for item in category_list:
        if item in post_clean_fix_map:
            replacement = post_clean_fix_map[item]
            if replacement:
                fixed.append(replacement)
            # skip if None
        else:
            fixed.append(item)
    return list(set(fixed))
df['Cleaned_Main'] = df['Cleaned_Main'].apply(apply_post_clean_fix)
df['Cleaned_Sub'] = df['Cleaned_Sub'].apply(apply_post_clean_fix)
main_tokens = sorted(set([item for sublist in df['Cleaned_Main'] for item in sublist]))
sub_tokens = sorted(set([item for sublist in df['Cleaned_Sub'] for item in sublist]))

print("MAIN CATEGORIES:")
for i, m in enumerate(main_tokens, 1):
    print(f"{i}. {m}")

print("\nSUB-CATEGORIES:")
for i, s in enumerate(sub_tokens, 1):
    print(f"{i}. {s}")


MAIN CATEGORIES:
1. cardiovascular system
2. circulatory system
3. digestive system
4. elimination
5. endocrine system
6. immune system
7. lymphatic system
8. musculoskeletal system
9. nervous system
10. reproductive system
11. respiratory system
12. urinary system
13. vitality

SUB-CATEGORIES:
1. adrenal
2. adrenal exhaustion
3. anxiety
4. arcus senilis
5. arteriosclerosis
6. blood circulation
7. breast area
8. chest
9. colon
10. emotional stress
11. eye area
12. gallbladder
13. head zone
14. heart
15. hemoglobin
16. hormonal balance
17. hormonal imbalances
18. insulin resistance
19. iris shape
20. knee
21. leg
22. liver
23. lower lungs
24. lungs
25. lymph circulation
26. lymphatic rosary
27. medulla–cerebellum region
28. metabolism ring
29. mucous congestion problems
30. neck
31. neurological distress
32. ovaries
33. pancreas
34. pancreas weakness
35. pineal gland
36. pituitary gland
37. prostate
38. rectum
39. skin
40. sluggish liver
41. small intestine
42. spinal cord
43. spine
44.

In [8]:
df[['Image ID', 'Cleaned_Main', 'Cleaned_Sub']].head()
# Combine Cleaned_Main and Cleaned_Sub into a single list per row
df['All_Labels'] = df['Cleaned_Main'] + df['Cleaned_Sub']
df['All_Labels'] = df['All_Labels'].apply(lambda x: list(set(x)))  # remove duplicates


In [9]:
from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.model_selection import train_test_split

mlb = MultiLabelBinarizer()
label_matrix = mlb.fit_transform(df['All_Labels'])

# Split into train and test sets (70% train, 30% test)
df_train, df_test, y_train, y_test = train_test_split(
    df, label_matrix, test_size=0.3, random_state=42
)


import pickle

with open("mlb.pkl", "wb") as f:
    pickle.dump(mlb, f)

# Store the class names (useful later for prediction decoding)
all_classes = mlb.classes_

# Optional: convert to DataFrame for easier inspection
label_df = pd.DataFrame(label_matrix, columns=all_classes)


In [10]:
label_df


Unnamed: 0,adrenal,adrenal exhaustion,anxiety,arcus senilis,arteriosclerosis,blood circulation,breast area,cardiovascular system,chest,circulatory system,...,stress response,stress ring,thyroid,tonsils,toxic accumulation in lung,transverse colon,urethra,urinary system,vein and lymph congestion,vitality
0,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,1,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
5,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
class IrisImageDataset(Dataset):
    def __init__(self, df, label_matrix, image_dir, transform=None):
        self.df = df.reset_index(drop=True)  # image metadata
        self.labels = torch.tensor(label_matrix, dtype=torch.float32)  # multi-labels
        self.image_dir = image_dir  # path to image folder
        self.transform = transform  # preprocessing like resize, normalize

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # 1. Get image ID and build path
        image_id = str(self.df.loc[idx, 'Image ID'])  # e.g., '3'
        image_path = os.path.join(self.image_dir, f"{image_id}.jpg")  # e.g., images/3.jpg

        # 2. Load image
        image = Image.open(image_path).convert("RGB")  # Always 3 channels

        # 3. Apply transformations (resize, normalize, etc.)
        if self.transform:
            image = self.transform(image)

        # 4. Get label vector
        label = self.labels[idx]

        return image, label


In [12]:
from torchvision import transforms

transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize image to 224x224
    transforms.ToTensor(),          # Convert to PyTorch tensor [0, 1]
])


In [13]:
# Path to your image folder
image_folder_path = "images"  # Change if your folder is somewhere else

# Create separate datasets
train_dataset = IrisImageDataset(df_train, y_train, image_dir=image_folder_path, transform=transform)
test_dataset  = IrisImageDataset(df_test,  y_test,  image_dir=image_folder_path, transform=transform)

# Create loaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader  = DataLoader(test_dataset,  batch_size=8, shuffle=False)



In [14]:
for images, labels in train_loader:
    print("TRAIN batch shape:", images.shape)
    print("TRAIN label shape:", labels.shape)
    break


TRAIN batch shape: torch.Size([8, 3, 224, 224])
TRAIN label shape: torch.Size([8, 67])


In [15]:
for images, labels in test_loader:
    print("TEST batch shape:", images.shape)
    print("TEST label shape:", labels.shape)
    break


TEST batch shape: torch.Size([6, 3, 224, 224])
TEST label shape: torch.Size([6, 67])


### Model training 

In [16]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models


In [17]:
num_classes = 67  # total number of labels

# Load pre-trained ResNet18
model = models.resnet18(pretrained=True)

# Replace the final fully connected layer
model.fc = nn.Linear(model.fc.in_features, num_classes)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)




In [18]:
# Binary Cross-Entropy with Logits is perfect for multi-label
criterion = nn.BCEWithLogitsLoss()

# Optimizer: Adam is simple and works well
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [19]:
num_epochs = 5  # you can increase this later

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for images, labels in train_loader:  # ← changed this line
        images = images.to(device)
        labels = labels.to(device)

        outputs = model(images)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}")



Epoch [1/5], Loss: 0.6563
Epoch [2/5], Loss: 0.3323
Epoch [3/5], Loss: 0.1869
Epoch [4/5], Loss: 0.1142
Epoch [5/5], Loss: 0.0872


In [20]:
torch.save(model.state_dict(), "multi_label_resnet18.pth")


## Testing model

In [21]:
# Reload model architecture
model = models.resnet18(pretrained=False)
model.fc = nn.Linear(model.fc.in_features, 67)
model.load_state_dict(torch.load("multi_label_resnet18.pth"))
model = model.to(device)
model.eval()




ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [22]:
def predict_image(image_path, model, mlb, transform, threshold=0.5):
    image = Image.open(image_path).convert("RGB")
    image = transform(image).unsqueeze(0).to(device)  # Add batch dimension

    with torch.no_grad():
        output = model(image)
        probs = torch.sigmoid(output).squeeze().cpu().numpy()

    # Get labels where probability > threshold
    predicted_indices = [i for i, p in enumerate(probs) if p > threshold]
    predicted_labels = [mlb.classes_[i] for i in predicted_indices]

    return predicted_labels


In [23]:
sample_path = "images/3.jpg"  # or any image you want to test
predicted_labels = predict_image(sample_path, model, mlb, transform)
print("Predicted Labels:", predicted_labels)


Predicted Labels: ['endocrine system', 'insulin resistance', 'nervous system', 'neurological distress', 'pancreas weakness', 'respiratory system', 'toxic accumulation in lung']


In [24]:
from sklearn.metrics import classification_report


In [25]:
y_true = []
y_pred = []

model.eval()
with torch.no_grad():
    for images, labels in test_loader:  # ← changed this line
        images = images.to(device)
        outputs = model(images)

        probs = torch.sigmoid(outputs).cpu().numpy()
        preds = (probs > 0.5).astype(int)

        y_true.extend(labels.cpu().numpy())
        y_pred.extend(preds)



In [26]:
# Turn into arrays for sklearn
y_true = np.array(y_true)
y_pred = np.array(y_pred)

# Create the report
report = classification_report(
    y_true, y_pred,
    target_names=mlb.classes_,
    zero_division=0  # avoid warning when no positive prediction
)

print(report)


                            precision    recall  f1-score   support

                   adrenal       0.00      0.00      0.00         0
        adrenal exhaustion       0.00      0.00      0.00         0
                   anxiety       0.00      0.00      0.00         2
             arcus senilis       0.00      0.00      0.00         1
          arteriosclerosis       0.00      0.00      0.00         1
         blood circulation       0.00      0.00      0.00         0
               breast area       0.00      0.00      0.00         0
     cardiovascular system       0.00      0.00      0.00         1
                     chest       0.00      0.00      0.00         1
        circulatory system       0.00      0.00      0.00         3
                     colon       0.00      0.00      0.00         0
          digestive system       0.33      1.00      0.50         2
               elimination       0.00      0.00      0.00         0
          emotional stress       0.00      0.00

### Evaluation

In [27]:
from sklearn.metrics import classification_report

# --- Train Set Evaluation ---
y_true_train = []
y_pred_train = []

model.eval()
with torch.no_grad():
    for images, labels in train_loader:
        images = images.to(device)
        outputs = model(images)
        probs = torch.sigmoid(outputs).cpu().numpy()
        preds = (probs > 0.5).astype(int)

        y_true_train.extend(labels.cpu().numpy())
        y_pred_train.extend(preds)

# Create report
train_report = classification_report(
    np.array(y_true_train), np.array(y_pred_train),
    target_names=mlb.classes_,
    zero_division=0
)

print("\n🟢 TRAINING SET PERFORMANCE:\n")
print(train_report)



🟢 TRAINING SET PERFORMANCE:

                            precision    recall  f1-score   support

                   adrenal       1.00      1.00      1.00         1
        adrenal exhaustion       1.00      1.00      1.00         1
                   anxiety       1.00      1.00      1.00         1
             arcus senilis       0.00      0.00      0.00         0
          arteriosclerosis       0.00      0.00      0.00         0
         blood circulation       1.00      1.00      1.00         2
               breast area       1.00      1.00      1.00         1
     cardiovascular system       1.00      1.00      1.00         1
                     chest       0.00      0.00      0.00         0
        circulatory system       1.00      1.00      1.00         5
                     colon       1.00      1.00      1.00         1
          digestive system       1.00      1.00      1.00         8
               elimination       1.00      1.00      1.00         1
          emotion

In [28]:
# y_true_test, y_pred_test → from test_loader
test_report = classification_report(
    np.array(y_true), np.array(y_pred),
    target_names=mlb.classes_,
    zero_division=0
)

print("\n🔵 TEST SET PERFORMANCE:\n")
print(test_report)



🔵 TEST SET PERFORMANCE:

                            precision    recall  f1-score   support

                   adrenal       0.00      0.00      0.00         0
        adrenal exhaustion       0.00      0.00      0.00         0
                   anxiety       0.00      0.00      0.00         2
             arcus senilis       0.00      0.00      0.00         1
          arteriosclerosis       0.00      0.00      0.00         1
         blood circulation       0.00      0.00      0.00         0
               breast area       0.00      0.00      0.00         0
     cardiovascular system       0.00      0.00      0.00         1
                     chest       0.00      0.00      0.00         1
        circulatory system       0.00      0.00      0.00         3
                     colon       0.00      0.00      0.00         0
          digestive system       0.33      1.00      0.50         2
               elimination       0.00      0.00      0.00         0
          emotional s