In [None]:
from glob import glob
from sklearn.model_selection import train_test_split

In [None]:
# Scratch check: the last '/'-separated component of a dataset path is the image file name.
'dataset-resized/paper/paper502.jpg'.rsplit('/', 1)[-1]

'paper502.jpg'

In [None]:
# Rebuild the data/{train,test,valid}/<category>/ tree from scratch so the cell
# can be re-run safely. Pure Python replaces the shell magics: category names are
# never passed through a shell, and real failures raise instead of only printing.
import os
import shutil

shutil.rmtree("data", ignore_errors=True)
for split_name in ("train", "test", "valid"):
    os.makedirs(os.path.join("data", split_name), exist_ok=True)

for folder_name in glob('drive/MyDrive/dataset-resized/*'):
    # The last path component of each dataset sub-folder is the category name.
    category_name = folder_name.split('/')[-1]
    for split_name in ("train", "test", "valid"):
        os.makedirs(os.path.join("data", split_name, category_name), exist_ok=True)

In [None]:
# Intended share of each category's images per split (the three fractions sum to 1).
train_size, test_size, validation_size = 0.8, 0.1, 0.1


def move_files(full_string, set_name, category=None):
    """Symlink one source image into ``data/<set_name>/<category>/``.

    Despite the name, nothing is moved or copied: an absolute symbolic link to
    the source image is created under the current working directory.

    Args:
        full_string: Path of the source image (relative to the current working
            directory, or absolute).
        set_name: Split sub-directory under ``data/`` (e.g. ``'train'``,
            ``'valid'`` or ``'test'``).
        category: Class sub-directory to link into. Defaults to the name of the
            image's parent folder, so the function no longer depends on a
            module-level ``category_name`` being set by the caller.

    Returns:
        None.
    """
    import os

    source = os.path.abspath(full_string)
    if category is None:
        category = os.path.basename(os.path.dirname(source))

    link_dir = os.path.join(os.getcwd(), "data", set_name, category)
    os.makedirs(link_dir, exist_ok=True)
    link_path = os.path.join(link_dir, os.path.basename(source))

    # Replace a stale link so re-running the split cell does not fail.
    if os.path.lexists(link_path):
        os.remove(link_path)
    os.symlink(source, link_path)


# Split every category independently so each class keeps the same
# train / test / valid proportions, then link the files into data/<split>/<category>/.
for folder_name in glob('drive/MyDrive/dataset-resized/*'):
    # Set before any move_files() call: the helper may read this module-level name.
    category_name = folder_name.split('/')[-1]
    # Sorted input + fixed random_state makes the split reproducible across runs.
    file_names = sorted(glob(f"{folder_name}/*"))
    train, proto_test = train_test_split(
        file_names, train_size=train_size, random_state=42
    )
    # The held-out remainder is divided between test and valid in proportion
    # to their configured fractions (0.1 / (0.1 + 0.1) = 0.5 with the defaults).
    test, valid = train_test_split(
        proto_test,
        train_size=test_size / (test_size + validation_size),
        random_state=42,
    )
    print(f"{category_name} train: {len(train)}")
    print(f"{category_name} test: {len(test)}")
    print(f"{category_name} valid: {len(valid)}")
    for split_name, split_files in (("train", train), ("valid", valid), ("test", test)):
        for image_path in split_files:
            move_files(image_path, split_name)

plastic train: 385
plastic test: 48
glass train: 400
glass test: 50
metal train: 328
metal test: 41
cardboard train: 322
cardboard test: 40
trash train: 109
trash test: 14
paper train: 475
paper test: 59


In [None]:
from sentence_transformers import SentenceTransformer, util
from PIL import Image

In [None]:
# Colab: install CLIP
!pip -q install git+https://github.com/openai/CLIP.git

import os, torch, numpy as np
from torch import nn
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder
import clip

# Use the GPU when one is available; everything downstream moves tensors to `device`.
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)  # CLIP image encoder + its preprocessing transform
# Inference mode; the trailing ';' keeps the full module repr out of the cell output.
model.eval();


  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for clip (setup.py) ... [?25l[?25hdone


100%|███████████████████████████████████████| 338M/338M [00:06<00:00, 54.4MiB/s]


CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): Sequential(
        (0): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (1): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          

In [None]:
# Relative to the working directory, matching where the split cell created the
# data/ tree (the previous absolute /content/... paths only worked when the
# working directory happened to be /content).
train_root = "data/train"   # update path if needed
test_root  = "data/test"

# CLIP's own `preprocess` transform is applied to every image.
train_ds = ImageFolder(train_root, transform=preprocess)
test_ds  = ImageFolder(test_root,  transform=preprocess)

# shuffle=False on both: these loaders are only used to extract embeddings.
train_loader = DataLoader(train_ds, batch_size=64, shuffle=False, num_workers=2, pin_memory=True)
test_loader  = DataLoader(test_ds,  batch_size=64, shuffle=False, num_workers=2, pin_memory=True)

# Class names come from the training folder names; label indices follow this order.
class_names = train_ds.classes
num_classes = len(class_names)
print(class_names)


['cardboard', 'glass', 'metal', 'paper', 'plastic', 'trash']


In [None]:
@torch.no_grad()
def embed_split(data_loader):
    """Encode every batch of a loader with the CLIP image encoder.

    Args:
        data_loader: Iterable yielding ``(images, labels)`` batches.

    Returns:
        ``(X, y)``: ``X`` holds the L2-normalised image features as a float32
        CPU tensor (one row per image); ``y`` holds the labels as int64.
    """
    feat_chunks = []
    label_chunks = []
    for batch_imgs, batch_labels in data_loader:
        encoded = model.encode_image(batch_imgs.to(device, non_blocking=True))  # [B, D]
        # Scale each feature vector to unit length.
        unit_feats = encoded / encoded.norm(dim=-1, keepdim=True)
        feat_chunks.append(unit_feats.cpu())
        label_chunks.append(batch_labels)
    return torch.cat(feat_chunks).float(), torch.cat(label_chunks).long()

# Encode both splits once; the MLP head below trains on these cached features.
X_tr, y_tr = embed_split(train_loader)
X_te, y_te = embed_split(test_loader)

# Persist features, labels and the class list so the encoder pass can be skipped later.
for emb_path, emb_feats, emb_labels in (
    ("train_emb.pt", X_tr, y_tr),
    ("test_emb.pt", X_te, y_te),
):
    torch.save({"X": emb_feats, "y": emb_labels, "classes": class_names}, emb_path)

in_dim = X_tr.size(1)  # embedding dimension (e.g., 512 for ViT-B/32)




In [None]:
# Small classification head on top of the cached CLIP embeddings:
# embedding -> 256 hidden units -> one logit per class.
head_layers = [
    nn.Linear(in_dim, 256),
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Linear(256, num_classes),
]
mlp = nn.Sequential(*head_layers).to(device)

# Cross-entropy over the class logits, optimised with AdamW.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(params=mlp.parameters(), lr=2e-3, weight_decay=1e-2)

def to_loader(X, y, batch_size=256, shuffle=True):
    """Wrap feature and label tensors in a ``DataLoader``.

    Args:
        X: Feature tensor; its first dimension indexes samples.
        y: Label tensor with the same first dimension as ``X``.
        batch_size: Number of samples per batch.
        shuffle: Whether the loader reshuffles the samples on each pass.

    Returns:
        A ``DataLoader`` yielding ``(X_batch, y_batch)`` pairs.
    """
    paired = torch.utils.data.TensorDataset(X, y)
    return DataLoader(dataset=paired, batch_size=batch_size, shuffle=shuffle)

# Training batches are shuffled; evaluation batches keep their original order.
train_emb_loader = to_loader(X_tr, y_tr)
test_emb_loader  = to_loader(X_te, y_te, shuffle=False)

# NOTE(review): accuracy is measured on the *test* split after every epoch. The
# notebook also creates a data/valid split that is never loaded; using it here
# would keep the test set untouched for the final report.
best_acc = 0.0
for epoch in range(20):
    # One optimisation pass over the training embeddings (dropout active).
    mlp.train()
    for xb, yb in train_emb_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        loss = criterion(mlp(xb), yb)
        loss.backward()
        optimizer.step()

    # Eval (dropout disabled, no gradients tracked)
    mlp.eval()
    correct = total = 0
    with torch.no_grad():
        for xb, yb in test_emb_loader:
            xb, yb = xb.to(device), yb.to(device)
            preds = mlp(xb).argmax(dim=1)
            correct += (preds == yb).sum().item()
            total += yb.numel()
    acc = correct / total
    # Previously best_acc was initialised but never updated.
    best_acc = max(best_acc, acc)
    print(f"Epoch {epoch+1}: test acc={acc:.4f}")

print(f"Best test acc={best_acc:.4f}")


Epoch 1: test acc=0.6349
Epoch 2: test acc=0.7103
Epoch 3: test acc=0.7937
Epoch 4: test acc=0.8492
Epoch 5: test acc=0.8532
Epoch 6: test acc=0.8651
Epoch 7: test acc=0.8690
Epoch 8: test acc=0.8849
Epoch 9: test acc=0.9008
Epoch 10: test acc=0.9087
Epoch 11: test acc=0.9048
Epoch 12: test acc=0.9206
Epoch 13: test acc=0.9127
Epoch 14: test acc=0.9206
Epoch 15: test acc=0.9167
Epoch 16: test acc=0.9206
Epoch 17: test acc=0.9246
Epoch 18: test acc=0.9206
Epoch 19: test acc=0.9246
Epoch 20: test acc=0.9206


In [None]:
from PIL import Image

@torch.no_grad()
def predict_image(path):
    """Classify a single image file with CLIP features and the trained MLP head.

    Args:
        path: Location of the image file to classify.

    Returns:
        ``(idx, name)``: the predicted class index and the matching entry of
        ``class_names``.
    """
    # The context manager closes the file handle once the image is preprocessed.
    with Image.open(path) as pil_img:
        img = preprocess(pil_img).unsqueeze(0).to(device)
    feat = model.encode_image(img)
    feat = feat / feat.norm(dim=-1, keepdim=True)
    # Same cast embed_split applies to the training features; the encoder may
    # return half precision on GPU, which would not match the head's weights.
    feat = feat.float()

    # Make sure dropout is off for the prediction, then restore the previous mode.
    was_training = mlp.training
    mlp.eval()
    try:
        logits = mlp(feat)
    finally:
        mlp.train(was_training)

    idx = logits.argmax(dim=1).item()
    return idx, class_names[idx]

# Example:
idx, name = predict_image("/content/SCR-20251027-osvn.jpeg")
print(idx, name)


0 cardboard


In [None]:
# Bundle the trained head's weights with the metadata needed to rebuild it:
# class order, CLIP variant, embedding width and hidden-layer width.
ckpt = dict(
    mlp_state=mlp.state_dict(),
    classes=class_names,
    clip_name="ViT-B/32",
    in_dim=X_tr.shape[1],  # e.g., 512
    mlp_hidden=256,
)
torch.save(ckpt, "recycler_mlp.pth")  # conventional .pth/.pt extension


In [None]:

# Zero-shot check: compare one image against a few text prompts in CLIP space.
# `model` is the OpenAI CLIP module at this point, and it has no .encode()
# method, so load the sentence-transformers CLIP wrapper under its own name
# instead of relying on a previously-bound `model`.
st_model = SentenceTransformer('clip-ViT-B-32')

#Encode an image:
with Image.open('SCR-20251019-ixfs.jpeg') as query_img:
    img_emb = st_model.encode(query_img)

#Encode the candidate text descriptions:
text_emb = st_model.encode(['Coke trash can of red color', 'paper trash of any shape and size', 'Glass trash of any size and surface'])

#Compute cosine similarities
cos_scores = util.cos_sim(img_emb, text_emb)
print(cos_scores)

print(img_emb.shape)

tensor([[0.3216, 0.2486, 0.2348]])
(512,)


In [None]:
# Root folder of the raw dataset; earlier cells glob its sub-folders, one per category.
# NOTE(review): `path` is not referenced by any other visible cell.
path = "drive/MyDrive/dataset-resized/"

These are the classes the model must predict:
- cardboard  
- glass  
- metal  
- paper
- plastic  
- trash
1. Compute the embeddings of the images in each category, then pass them to a DNN.
2. Use a softmax over the final layer to predict which of the 6 categories the input image belongs to.

In [None]:
# PyTorch: dataset, model, training loop
import torch, torch.nn as nn, torch.optim as optim
from torchvision import datasets, transforms

# Baseline: an MLP trained directly on raw pixels.
# ImageFolder reads data/train/<class>/..., one sub-folder per class
# (cardboard, glass, metal, paper, plastic, trash).
# NOTE(review): this cell rebinds `train_ds`, `train_loader`, `in_dim`, `model`,
# `criterion` and `optimizer`, all of which earlier cells define for the CLIP pipeline.
train_tfms = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
    ]
)
train_ds = datasets.ImageFolder(root="data/train", transform=train_tfms)  # folder names define the targets
train_loader = torch.utils.data.DataLoader(dataset=train_ds, batch_size=64, shuffle=True)

# Flattened input size: channels * height * width
in_dim = 3 * 224 * 224

pixel_layers = [
    nn.Flatten(),
    nn.Linear(in_dim, 512),
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Linear(512, 6),  # one logit per class
]
model = nn.Sequential(*pixel_layers)

criterion = nn.CrossEntropyLoss()        # targets are integer class indices (0..5)
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

model.train()
for epoch in range(10):
    for x, y in train_loader:            # y holds the class indices derived from folder names
        optimizer.zero_grad()
        logits = model(x)
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()

# Inference: probs = logits.softmax(dim=1)

# Inference: probs = logits.softmax(dim=1)


In [None]:
# nn.Sequential has no Keras-style .save(); persist the learned weights with
# torch.save instead. The file is a PyTorch checkpoint, hence .pt rather than .h5.
torch.save(model.state_dict(), 'embeddings_model.pt')

AttributeError: 'Sequential' object has no attribute 'save'