In [None]:
!pip install torch torchvision easyocr opencv-python-headless

Collecting easyocr
  Downloading easyocr-1.7.2-py3-none-any.whl.metadata (10 kB)
Collecting python-bidi (from easyocr)
  Downloading python_bidi-0.6.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Collecting pyclipper (from easyocr)
  Downloading pyclipper-1.3.0.post6-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (9.0 kB)
Collecting ninja (from easyocr)
  Downloading ninja-1.11.1.2-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (5.3 kB)
Downloading easyocr-1.7.2-py3-none-any.whl (2.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m42.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ninja-1.11.1.2-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (422 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m422.9/422.9 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyclipper-1.3.0.post6-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (

In [None]:
from google.colab import drive
import os

# Mount Google Drive at /content/drive; the dataset and output paths configured
# in the following cell all live under this mount point.
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:

# Dataset and submission locations on the mounted shared drive
DRIVE_BASE_PATH = '/content/drive/Shareddrives/avidea'
TRAIN_DIR = os.path.join(DRIVE_BASE_PATH, 'Train')
TEST_DIR = os.path.join(DRIVE_BASE_PATH, 'Test_data')
OUTPUT_FILE = os.path.join(DRIVE_BASE_PATH, 'submission.csv')

# Create the base folder when it is missing (no-op if it already exists)
os.makedirs(DRIVE_BASE_PATH, exist_ok=True)

# Optional: report which accelerator, if any, this runtime exposes
import torch

_has_gpu = torch.cuda.is_available()
print("GPU Available:", _has_gpu)
if _has_gpu:
    _gpu_name = torch.cuda.get_device_name(0)
else:
    _gpu_name = "No GPU"
print("GPU Name:", _gpu_name)

GPU Available: True
GPU Name: Tesla T4


# ViT + data augmentation + OCR (multi-modal approach)

In [None]:
import os
import torch
import pandas as pd
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision.models import vit_b_16, ViT_B_16_Weights
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
from PIL import Image
from tqdm import tqdm
import easyocr
import cv2
import numpy as np

# Configuration
# TRAIN_DIR, TEST_DIR and OUTPUT_FILE are not assigned here; later statements in
# this cell read them, so they must already be defined in the kernel.
# Kaggle equivalents, kept for reference:
#   TRAIN_DIR = '/kaggle/input/avidea-student-challenge/Train'
#   TEST_DIR = '/kaggle/input/avidea-student-challenge/Test'
#   OUTPUT_FILE = 'submission.csv'
EPOCHS = 10
LEARNING_RATE = 1e-4
BATCH_SIZE = 32

# Prefer the GPU when one is visible, otherwise run on the CPU
_device_kind = 'cuda' if torch.cuda.is_available() else 'cpu'
DEVICE = torch.device(_device_kind)

class LicensePlateEnhancer:
    """OCR helper that tries to read plate-like text from an image.

    The image is converted to grayscale, binarized with an adaptive threshold
    and handed to an EasyOCR reader. The first recognized string that is
    strictly longer than ``MIN_TEXT_LENGTH`` characters is reported.
    """

    # Recognized strings must be strictly longer than this to be kept.
    MIN_TEXT_LENGTH = 5
    # Neighbourhood size and subtracted constant given to cv2.adaptiveThreshold.
    THRESHOLD_BLOCK_SIZE = 11
    THRESHOLD_C = 2

    def __init__(self, languages=('ar',)):
        """Create the EasyOCR reader.

        Args:
            languages: Language codes passed to ``easyocr.Reader``. Defaults to
                Arabic, which was previously hard-coded.
        """
        self.reader = easyocr.Reader(list(languages))

    def preprocess_for_ocr(self, image):
        """Return a binarized grayscale version of ``image``.

        Args:
            image: RGB image; it is converted with ``np.array`` and interpreted
                as RGB by ``cv2.cvtColor``.

        Returns:
            The output of ``cv2.adaptiveThreshold`` (Gaussian, binary) applied
            to the grayscale image.
        """
        gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
        return cv2.adaptiveThreshold(
            gray, 255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY,
            self.THRESHOLD_BLOCK_SIZE,
            self.THRESHOLD_C,
        )

    def extract_license_plate(self, image):
        """Run OCR on ``image`` and return the first plausible plate string.

        Args:
            image: RGB image accepted by ``preprocess_for_ocr``.

        Returns:
            The text (second element) of the first OCR result whose text is
            longer than ``MIN_TEXT_LENGTH`` characters, or ``None`` when no
            result qualifies.
        """
        preprocessed = self.preprocess_for_ocr(image)

        # The thresholded array is passed to the reader as-is; wrapping it in a
        # PIL image only to convert it straight back to an array was a no-op.
        results = self.reader.readtext(preprocessed)

        for result in results:
            text = result[1]
            if len(text) > self.MIN_TEXT_LENGTH:
                return text
        return None

# Data Transformations
# Deterministic steps shared by the training and test pipelines
_resize_224 = transforms.Resize((224, 224))
_to_tensor = transforms.ToTensor()
_normalize = transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))

# Random augmentations, applied to training images only
_train_augmentations = [
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
]

# Training: resize -> augment -> tensor -> normalize
train_transforms = transforms.Compose(
    [_resize_224, *_train_augmentations, _to_tensor, _normalize]
)

# Test: same resize / tensor / normalize steps, without augmentation
test_transforms = transforms.Compose([_resize_224, _to_tensor, _normalize])

# Load Training Data
# Labels come from the directory layout under TRAIN_DIR; batches are reshuffled every epoch.
train_data = torchvision.datasets.ImageFolder(TRAIN_DIR, transform=train_transforms)
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)

# Load Test Data
# Extensions are matched case-insensitively so files such as "IMG_1.JPG" are not
# silently left out of the submission. The listing is sorted because os.listdir
# returns entries in arbitrary order, which made the prediction order vary.
TEST_IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')
test_files = [
    os.path.join(TEST_DIR, f)
    for f in sorted(os.listdir(TEST_DIR))
    if f.lower().endswith(TEST_IMAGE_EXTENSIONS)
]

# OCR helper used at prediction time
plate_recognizer = LicensePlateEnhancer()

# Vision Transformer (ViT-B/16) initialised from torchvision's default weights
weights = ViT_B_16_Weights.DEFAULT
model = vit_b_16(weights=weights)

# Replace the classification head so it emits one logit per training class
num_classes = len(train_data.classes)
in_features = model.heads.head.in_features
model.heads.head = torch.nn.Linear(in_features=in_features, out_features=num_classes)

# Move every parameter (including the new head) to the selected device
model = model.to(DEVICE)

# Loss function and optimizer (all parameters are fine-tuned)
criterion = CrossEntropyLoss()
optimizer = Adam(params=model.parameters(), lr=LEARNING_RATE)

# Training Loop
def train_model():
    """Fine-tune the global ``model`` on ``train_loader`` for ``EPOCHS`` epochs.

    After each epoch one summary line is printed containing the sum of the
    per-batch losses and the training accuracy in percent. Nothing is
    returned; ``model`` and ``optimizer`` are updated in place.
    """
    for epoch_no in range(1, EPOCHS + 1):
        model.train()
        loss_sum, hits, seen = 0.0, 0, 0

        progress = tqdm(train_loader, desc=f"Epoch {epoch_no}/{EPOCHS}")
        for batch_images, batch_labels in progress:
            batch_images = batch_images.to(DEVICE)
            batch_labels = batch_labels.to(DEVICE)

            # Standard optimisation step
            optimizer.zero_grad()
            logits = model(batch_images)
            batch_loss = criterion(logits, batch_labels)
            batch_loss.backward()
            optimizer.step()

            # Running statistics for the epoch summary
            loss_sum += batch_loss.item()
            predicted = torch.max(logits.data, 1)[1]
            seen += batch_labels.size(0)
            hits += (predicted == batch_labels).sum().item()

        accuracy = 100 * hits / seen
        print(f"Epoch {epoch_no}/{EPOCHS}, Loss: {loss_sum:.4f}, Accuracy: {accuracy:.2f}%")

# Prediction Function
def _index_training_filenames():
    """Map every file name found in a class folder of TRAIN_DIR to its class.

    If the same file name occurs in several class folders, the class that comes
    first in ``train_data.classes`` is kept.
    """
    index = {}
    for cls in train_data.classes:
        for name in os.listdir(os.path.join(TRAIN_DIR, cls)):
            index.setdefault(name, cls)
    return index


def _classify_with_model(image):
    """Return the class name the global ``model`` scores highest for ``image``."""
    tensor_image = test_transforms(image).unsqueeze(0).to(DEVICE)
    outputs = model(tensor_image)
    _, predicted_class_idx = torch.max(outputs, 1)
    return train_data.classes[predicted_class_idx.item()]


def predict_with_ocr():
    """Predict a class for every path in ``test_files``.

    For each image the OCR helper is tried first: when it returns text that is
    exactly the name of a file inside one of the training class folders, that
    class is used. Otherwise the image is classified by the global ``model``.

    Returns:
        A list of ``(file_name, predicted_class)`` tuples, one per test file,
        in the order of ``test_files``.
    """
    model.eval()
    predictions = []

    # Built on first use and then reused. Listing every class folder again for
    # every test image was loop-invariant work.
    filename_index = None

    with torch.no_grad():
        for file_path in tqdm(test_files, desc="Predicting"):
            image = Image.open(file_path).convert("RGB")

            # Try to extract license plate first
            license_plate = plate_recognizer.extract_license_plate(image)

            predicted_class = None
            if license_plate:
                # NOTE(review): the OCR text is compared with training *file names*;
                # confirm the training files are really named after plate text,
                # otherwise this branch never produces a match.
                if filename_index is None:
                    filename_index = _index_training_filenames()
                predicted_class = filename_index.get(license_plate)

            if predicted_class is None:
                # No plate text, or no matching training file: use the model
                predicted_class = _classify_with_model(image)

            predictions.append((os.path.basename(file_path), predicted_class))

    return predictions

# Main Execution
def main():
    """Train the model, predict the test set and write the submission CSV.

    The CSV has the columns "Id" and "class", holds one row per prediction and
    is written to OUTPUT_FILE without the index column.
    """
    train_model()

    rows = predict_with_ocr()
    pd.DataFrame(rows, columns=["Id", "class"]).to_csv(OUTPUT_FILE, index=False)

    print(f"Submission saved to {OUTPUT_FILE}")

if __name__ == "__main__":
    main()



Progress: |██████████████████████████████████████████████████| 100.0% Complete



Progress: |██████████████████████████████████████████████████| 100.0% Complete

Downloading: "https://download.pytorch.org/models/vit_b_16-c867db91.pth" to /root/.cache/torch/hub/checkpoints/vit_b_16-c867db91.pth
100%|██████████| 330M/330M [00:03<00:00, 112MB/s]
Epoch 1/10: 100%|██████████| 53/53 [09:20<00:00, 10.58s/it]


Epoch 1/10, Loss: 301.3429, Accuracy: 1.25%


Epoch 2/10: 100%|██████████| 53/53 [01:40<00:00,  1.89s/it]


Epoch 2/10, Loss: 240.8746, Accuracy: 20.13%


Epoch 3/10: 100%|██████████| 53/53 [01:38<00:00,  1.85s/it]


Epoch 3/10, Loss: 146.5671, Accuracy: 71.26%


Epoch 4/10: 100%|██████████| 53/53 [01:42<00:00,  1.93s/it]


Epoch 4/10, Loss: 76.8532, Accuracy: 94.30%


Epoch 5/10: 100%|██████████| 53/53 [01:39<00:00,  1.88s/it]


Epoch 5/10, Loss: 33.9659, Accuracy: 99.05%


Epoch 6/10: 100%|██████████| 53/53 [01:41<00:00,  1.91s/it]


Epoch 6/10, Loss: 14.3978, Accuracy: 99.76%


Epoch 7/10: 100%|██████████| 53/53 [01:39<00:00,  1.88s/it]


Epoch 7/10, Loss: 6.8420, Accuracy: 99.88%


Epoch 8/10: 100%|██████████| 53/53 [01:40<00:00,  1.89s/it]


Epoch 8/10, Loss: 4.0881, Accuracy: 99.88%


Epoch 9/10: 100%|██████████| 53/53 [01:39<00:00,  1.87s/it]


Epoch 9/10, Loss: 2.9812, Accuracy: 99.88%


Epoch 10/10: 100%|██████████| 53/53 [01:40<00:00,  1.89s/it]


Epoch 10/10, Loss: 2.3642, Accuracy: 99.88%


Predicting: 100%|██████████| 511/511 [05:48<00:00,  1.47it/s]

Submission saved to /content/drive/Shareddrives/avidea/submission.csv





# ResNet50 + OCR + data augmentation

In [None]:
!pip install albumentations



In [None]:
import os
import torch
import pandas as pd
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision import models
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
from PIL import Image
from tqdm import tqdm
import easyocr
import cv2
import numpy as np

# Configuration
SEED = 42
DRIVE_BASE_PATH = '/content/drive/Shareddrives/avidea'
TRAIN_DIR = os.path.join(DRIVE_BASE_PATH, 'Train')
TEST_DIR = os.path.join(DRIVE_BASE_PATH, 'Test_data')
OUTPUT_FILE = os.path.join(DRIVE_BASE_PATH, 'submission4.csv')
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BATCH_SIZE = 16
LEARNING_RATE = 1e-4
EPOCHS = 10

# SEED used to be declared but never applied. Seed PyTorch's global generator
# and NumPy's legacy global generator so repeated runs start from the same
# random state.
# NOTE(review): this alone does not guarantee bit-identical GPU results —
# confirm whether deterministic algorithms are also required.
torch.manual_seed(SEED)
np.random.seed(SEED)

class LicensePlateEnhancer:
    """OCR helper that tries to read plate-like text from an image.

    The image is converted to grayscale, binarized with an adaptive threshold
    and handed to an EasyOCR reader. The first recognized string that is
    strictly longer than ``MIN_TEXT_LENGTH`` characters is reported.
    """

    # Recognized strings must be strictly longer than this to be kept.
    MIN_TEXT_LENGTH = 5
    # Neighbourhood size and subtracted constant given to cv2.adaptiveThreshold.
    THRESHOLD_BLOCK_SIZE = 11
    THRESHOLD_C = 2

    def __init__(self, languages=('ar',)):
        """Create the EasyOCR reader.

        Args:
            languages: Language codes passed to ``easyocr.Reader``. Defaults to
                Arabic, which was previously hard-coded.
        """
        self.reader = easyocr.Reader(list(languages))

    def preprocess_for_ocr(self, image):
        """Return a binarized grayscale version of ``image``.

        Args:
            image: RGB image; it is converted with ``np.array`` and interpreted
                as RGB by ``cv2.cvtColor``.

        Returns:
            The output of ``cv2.adaptiveThreshold`` (Gaussian, binary) applied
            to the grayscale image.
        """
        gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
        return cv2.adaptiveThreshold(
            gray, 255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY,
            self.THRESHOLD_BLOCK_SIZE,
            self.THRESHOLD_C,
        )

    def extract_license_plate(self, image):
        """Run OCR on ``image`` and return the first plausible plate string.

        Args:
            image: RGB image accepted by ``preprocess_for_ocr``.

        Returns:
            The text (second element) of the first OCR result whose text is
            longer than ``MIN_TEXT_LENGTH`` characters, or ``None`` when no
            result qualifies.
        """
        preprocessed = self.preprocess_for_ocr(image)

        # The thresholded array is passed to the reader as-is; wrapping it in a
        # PIL image only to convert it straight back to an array was a no-op.
        results = self.reader.readtext(preprocessed)

        for result in results:
            text = result[1]
            if len(text) > self.MIN_TEXT_LENGTH:
                return text
        return None

# Data Transformations
# Deterministic steps shared by the training and test pipelines
_resize_224 = transforms.Resize((224, 224))
_to_tensor = transforms.ToTensor()
_normalize = transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))

# Random augmentations, applied to training images only
_train_augmentations = [
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
]

# Training: resize -> augment -> tensor -> normalize
train_transforms = transforms.Compose(
    [_resize_224, *_train_augmentations, _to_tensor, _normalize]
)

# Test: same resize / tensor / normalize steps, without augmentation
test_transforms = transforms.Compose([_resize_224, _to_tensor, _normalize])

# Load Training Data
# Labels come from the directory layout under TRAIN_DIR; batches are reshuffled every epoch.
train_data = torchvision.datasets.ImageFolder(TRAIN_DIR, transform=train_transforms)
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)

# Load Test Data
# Extensions are matched case-insensitively so files such as "IMG_1.JPG" are not
# silently left out of the submission. The listing is sorted because os.listdir
# returns entries in arbitrary order, which made the prediction order vary.
TEST_IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')
test_files = [
    os.path.join(TEST_DIR, f)
    for f in sorted(os.listdir(TEST_DIR))
    if f.lower().endswith(TEST_IMAGE_EXTENSIONS)
]

# Initialize License Plate Enhancer (OCR helper used at prediction time)
plate_recognizer = LicensePlateEnhancer()

# ResNet-50 Model
# `pretrained=True` is deprecated in torchvision; the explicit weights enum is
# the supported spelling and is what the ViT cell of this notebook already uses.
# IMAGENET1K_V1 is the checkpoint the old flag selected (the saved output shows
# resnet50-0676ba61.pth being downloaded), so the starting weights are unchanged.
model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)

# Adapt the head for the number of classes
num_classes = len(train_data.classes)
in_features = model.fc.in_features
model.fc = torch.nn.Linear(in_features, num_classes)

# Move model to device
model = model.to(DEVICE)

# Optimizer and Loss Function (all parameters are fine-tuned)
optimizer = Adam(model.parameters(), lr=LEARNING_RATE)
criterion = CrossEntropyLoss()

# Training Loop
def train_model():
    """Fine-tune the global ``model`` on ``train_loader`` for ``EPOCHS`` epochs.

    After each epoch one summary line is printed containing the sum of the
    per-batch losses and the training accuracy in percent. Nothing is
    returned; ``model`` and ``optimizer`` are updated in place.
    """
    for epoch_no in range(1, EPOCHS + 1):
        model.train()
        loss_sum, hits, seen = 0.0, 0, 0

        progress = tqdm(train_loader, desc=f"Epoch {epoch_no}/{EPOCHS}")
        for batch_images, batch_labels in progress:
            batch_images = batch_images.to(DEVICE)
            batch_labels = batch_labels.to(DEVICE)

            # Standard optimisation step
            optimizer.zero_grad()
            logits = model(batch_images)
            batch_loss = criterion(logits, batch_labels)
            batch_loss.backward()
            optimizer.step()

            # Running statistics for the epoch summary
            loss_sum += batch_loss.item()
            predicted = torch.max(logits.data, 1)[1]
            seen += batch_labels.size(0)
            hits += (predicted == batch_labels).sum().item()

        accuracy = 100 * hits / seen
        print(f"Epoch {epoch_no}/{EPOCHS}, Loss: {loss_sum:.4f}, Accuracy: {accuracy:.2f}%")

# Prediction Function
def _index_training_filenames():
    """Map every file name found in a class folder of TRAIN_DIR to its class.

    If the same file name occurs in several class folders, the class that comes
    first in ``train_data.classes`` is kept.
    """
    index = {}
    for cls in train_data.classes:
        for name in os.listdir(os.path.join(TRAIN_DIR, cls)):
            index.setdefault(name, cls)
    return index


def _classify_with_model(image):
    """Return the class name the global ``model`` scores highest for ``image``."""
    tensor_image = test_transforms(image).unsqueeze(0).to(DEVICE)
    outputs = model(tensor_image)
    _, predicted_class_idx = torch.max(outputs, 1)
    return train_data.classes[predicted_class_idx.item()]


def predict_with_ocr():
    """Predict a class for every path in ``test_files``.

    For each image the OCR helper is tried first: when it returns text that is
    exactly the name of a file inside one of the training class folders, that
    class is used. Otherwise the image is classified by the global ``model``.

    Returns:
        A list of ``(file_name, predicted_class)`` tuples, one per test file,
        in the order of ``test_files``.
    """
    model.eval()
    predictions = []

    # Built on first use and then reused. Listing every class folder again for
    # every test image was loop-invariant work.
    filename_index = None

    with torch.no_grad():
        for file_path in tqdm(test_files, desc="Predicting"):
            image = Image.open(file_path).convert("RGB")

            # Try to extract license plate first
            license_plate = plate_recognizer.extract_license_plate(image)

            predicted_class = None
            if license_plate:
                # NOTE(review): the OCR text is compared with training *file names*;
                # confirm the training files are really named after plate text,
                # otherwise this branch never produces a match.
                if filename_index is None:
                    filename_index = _index_training_filenames()
                predicted_class = filename_index.get(license_plate)

            if predicted_class is None:
                # No plate text, or no matching training file: use the model
                predicted_class = _classify_with_model(image)

            predictions.append((os.path.basename(file_path), predicted_class))

    return predictions

# Main Execution
def main():
    """Train the model, predict the test set and write the submission CSV.

    The CSV has the columns "Id" and "class", holds one row per prediction and
    is written to OUTPUT_FILE without the index column.
    """
    train_model()

    rows = predict_with_ocr()
    pd.DataFrame(rows, columns=["Id", "class"]).to_csv(OUTPUT_FILE, index=False)

    print(f"Submission saved to {OUTPUT_FILE}")

if __name__ == "__main__":
    main()


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 150MB/s]
Epoch 1/10: 100%|██████████| 53/53 [00:59<00:00,  1.13s/it]


Epoch 1/10, Loss: 300.3916, Accuracy: 3.09%


Epoch 2/10: 100%|██████████| 53/53 [00:57<00:00,  1.08s/it]


Epoch 2/10, Loss: 243.8261, Accuracy: 17.04%


Epoch 3/10: 100%|██████████| 53/53 [00:57<00:00,  1.08s/it]


Epoch 3/10, Loss: 191.5872, Accuracy: 41.45%


Epoch 4/10: 100%|██████████| 53/53 [00:57<00:00,  1.08s/it]


Epoch 4/10, Loss: 143.0513, Accuracy: 61.34%


Epoch 5/10: 100%|██████████| 53/53 [00:56<00:00,  1.07s/it]


Epoch 5/10, Loss: 100.0921, Accuracy: 79.28%


Epoch 6/10: 100%|██████████| 53/53 [00:56<00:00,  1.07s/it]


Epoch 6/10, Loss: 66.8191, Accuracy: 89.96%


Epoch 7/10: 100%|██████████| 53/53 [00:56<00:00,  1.07s/it]


Epoch 7/10, Loss: 42.3251, Accuracy: 96.02%


Epoch 8/10: 100%|██████████| 53/53 [00:56<00:00,  1.07s/it]


Epoch 8/10, Loss: 25.2716, Accuracy: 98.75%


Epoch 9/10: 100%|██████████| 53/53 [00:57<00:00,  1.09s/it]


Epoch 9/10, Loss: 15.4374, Accuracy: 99.52%


Epoch 10/10: 100%|██████████| 53/53 [00:56<00:00,  1.07s/it]


Epoch 10/10, Loss: 9.8594, Accuracy: 99.70%


Predicting: 100%|██████████| 511/511 [03:27<00:00,  2.47it/s]

Submission saved to /content/drive/Shareddrives/avidea/submission4.csv



