In [3]:
pip install pandas requests pillow pytesseract opencv-python-headless numpy


Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting opencv-python-headless
  Downloading opencv_python_headless-4.10.0.84-cp37-abi3-win_amd64.whl.metadata (20 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Downloading opencv_python_headless-4.10.0.84-cp37-abi3-win_amd64.whl (38.8 MB)
   ---------------------------------------- 0.0/38.8 MB ? eta -:--:--
   ---------------------------------------- 0.1/38.8 MB 2.0 MB/s eta 0:00:20
   ---------------------------------------- 0.2/38.8 MB 2.5 MB/s eta 0:00:16
   ---------------------------------------- 0.3/38.8 MB 2.6 MB/s eta 0:00:15
   ---------------------------------------- 0.5/38.8 MB 2.7 MB/s eta 0:00:15
    --------------------------------------- 0.6/38.8 MB 2.7 MB/s eta 0:00:15
    --------------------------------------- 0.7/38.8 MB 2.8 MB/s eta 0:00:14
    --------------------------------------- 0.9/38.8 MB 2.8 MB/s eta 0:00:14
   - -------------------------------------- 

In [19]:
pip install easyocr pandas scikit-learn torch transformers


Collecting easyocr
  Downloading easyocr-1.7.1-py3-none-any.whl.metadata (11 kB)
Collecting torch
  Downloading torch-2.4.1-cp312-cp312-win_amd64.whl.metadata (27 kB)
Collecting transformers
  Downloading transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
     ---------------------------------------- 0.0/43.7 kB ? eta -:--:--
     ---------------------------------------- 43.7/43.7 kB 2.2 MB/s eta 0:00:00
Collecting torchvision>=0.5 (from easyocr)
  Downloading torchvision-0.19.1-cp312-cp312-win_amd64.whl.metadata (6.1 kB)
Collecting python-bidi (from easyocr)
  Downloading python_bidi-0.6.0-cp312-none-win_amd64.whl.metadata (4.7 kB)
Collecting Shapely (from easyocr)
  Downloading shapely-2.0.6-cp312-cp312-win_amd64.whl.metadata (7.2 kB)
Collecting pyclipper (from easyocr)
  Downloading pyclipper-1.3.0.post5-cp312-cp312-win_amd64.whl.metadata (9.2 kB)
Collecting ninja (from easyocr)
  Downloading ninja-1.11.1.1-py2.py3-none-win_amd64.whl.metadata (5.4 kB)
Collecting huggingface-hub<1

In [21]:
import easyocr
import pandas as pd
import os
import re
import logging
from sklearn.metrics import f1_score
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel

# Unit vocabularies shared by several entities. They are kept as tuples so
# every entity below receives its own independent set object.
_LENGTH_UNITS = ('centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard')
_WEIGHT_UNITS = ('gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton')
_VOLTAGE_UNITS = ('kilovolt', 'millivolt', 'volt')
_WATTAGE_UNITS = ('kilowatt', 'watt')
_VOLUME_UNITS = (
    'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre',
    'fluid ounce', 'gallon', 'imperial gallon', 'litre', 'microlitre',
    'millilitre', 'pint', 'quart',
)

# Maps each entity name to the set of unit names accepted for it.
entity_unit_map = {
    'width': set(_LENGTH_UNITS),
    'depth': set(_LENGTH_UNITS),
    'height': set(_LENGTH_UNITS),
    'item_weight': set(_WEIGHT_UNITS),
    'maximum_weight_recommendation': set(_WEIGHT_UNITS),
    'voltage': set(_VOLTAGE_UNITS),
    'wattage': set(_WATTAGE_UNITS),
    'item_volume': set(_VOLUME_UNITS),
}

# Every unit accepted by at least one entity.
allowed_units = set().union(*entity_unit_map.values())

# Root logger: INFO and above, each record prefixed with timestamp and level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-level EasyOCR reader for English text; extract_text() uses it.
reader = easyocr.Reader(['en'])

def extract_text(image_path):
    """
    Run the module-level EasyOCR ``reader`` on one image and return its text.

    The detected fragments are joined with single spaces. Any exception
    raised along the way is logged and an empty string is returned instead,
    so a single unreadable image does not abort the caller.
    """
    try:
        logging.info(f"Extracting text from {image_path}")
        # detail=0 is passed so the result can be joined directly as strings.
        fragments = reader.readtext(image_path, detail=0)
        joined = " ".join(fragments)
        # Only a 100-character preview is logged to keep the log readable.
        logging.info(f"Extracted text: {joined[:100]}...")
    except Exception as err:
        logging.error(f"Error extracting text from {image_path}: {str(err)}")
        return ""
    return joined

# (alias alternation, canonical unit) pairs applied to lower-cased OCR text.
# The canonical spellings must match the names used in entity_unit_map,
# otherwise the value search in clean_text() can never find them.
_UNIT_ALIASES = (
    (r"gms|grams", "gram"),
    (r"kgs|kilograms", "kilogram"),
    (r"lbs|pounds", "pound"),
    (r"cm|centimeters?|centimetres", "centimetre"),
    (r"inches", "inch"),
    (r"litres|ltrs?", "litre"),
)


def clean_text(ocr_text, entity_name):
    """
    Clean OCR text and extract a "<number> <unit>" value for the given entity.

    The text is lower-cased, known unit aliases are rewritten to the
    canonical names used in ``entity_unit_map``, and the text is searched for
    a number followed by one of the units allowed for ``entity_name``.

    Args:
        ocr_text: Raw OCR output. Anything that is not a non-empty string
            yields an empty result.
        entity_name: Key into ``entity_unit_map`` selecting the allowed units.
            Unknown names yield an empty result.

    Returns:
        ``"<number> <unit>"`` for the match that starts earliest in the text,
        or ``""`` when nothing matches.
    """
    logging.info(f"Cleaning text for entity: {entity_name}")
    if not isinstance(ocr_text, str) or not ocr_text:
        logging.info("Cleaned value: None")
        return ""

    text = ocr_text.lower()  # Convert text to lowercase for uniformity

    # Letter look-arounds are used instead of \b so an alias glued to the
    # number ("12cm", "500gms") is still recognised; \b sees no boundary
    # between a digit and a letter.
    for alias_pattern, canonical in _UNIT_ALIASES:
        text = re.sub(rf"(?<![a-z])(?:{alias_pattern})(?![a-z])", canonical, text)

    # Keep the earliest match in the text rather than the first unit the set
    # happens to yield, so the result does not depend on set iteration order.
    best = None  # (start offset, formatted value)
    for unit in entity_unit_map.get(entity_name, ()):
        # re.escape keeps multi-word units literal; the optional "s" accepts
        # simple plurals, and the trailing guard rejects longer words that
        # merely start with the unit name.
        match = re.search(rf"(\d+(?:\.\d+)?)\s*{re.escape(unit)}s?(?![a-z])", text)
        if match is None:
            continue
        candidate = (match.start(), f"{match.group(1)} {unit}")
        if best is None or candidate < best:
            best = candidate

    value = best[1] if best else None
    logging.info(f"Cleaned value: {value}")
    return value if value else ""

class CustomDataset(Dataset):
    """
    Map-style dataset that tokenizes one text per item.

    Each item is ``(input_ids, attention_mask, label)``. The tokenizer is
    called with ``padding='max_length'`` and ``truncation=True``, so it is
    expected to return tensors whose first dimension is a batch of one; that
    dimension is removed before the item is returned.
    """

    # The special methods need double underscores: with single underscores
    # Python treats them as ordinary methods, so the constructor would accept
    # no arguments and len()/indexing would not work.
    def __init__(self, texts, labels, tokenizer, max_len):
        """
        Args:
            texts: Indexable sequence of input texts.
            labels: Indexable sequence of labels, aligned with ``texts``.
            tokenizer: Callable taking ``(text, max_length=, padding=,
                truncation=, return_tensors=)`` and returning a mapping with
                ``'input_ids'`` and ``'attention_mask'``.
            max_len: Length every example is padded or truncated to.

        Raises:
            ValueError: If ``texts`` and ``labels`` differ in length.
        """
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"({len(texts)} != {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, label)`` for example ``idx``."""
        # str() turns numpy string scalars into plain Python strings.
        text = str(self.texts[idx])
        label = self.labels[idx]
        tokens = self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # squeeze(0) drops only the leading batch dimension; a bare squeeze()
        # would also collapse the sequence dimension when max_len == 1.
        input_ids = tokens['input_ids'].squeeze(0)
        attention_mask = tokens['attention_mask'].squeeze(0)
        return input_ids, attention_mask, label

class MLPModel(nn.Module):
    """
    Two-layer MLP classifier on top of frozen BERT [CLS] embeddings.

    ``forward`` reads the module-level ``bert_model``, which must be defined
    before the first forward pass. BERT runs under ``torch.no_grad()``, so
    only ``fc1`` and ``fc2`` receive gradients.
    """

    # __init__ needs double underscores (here and in the super() call);
    # with single underscores the layers are never created.
    def __init__(self, input_dim, hidden_dim, output_dim):
        """
        Args:
            input_dim: Size of the embedding fed to the first linear layer.
            hidden_dim: Width of the hidden layer.
            output_dim: Number of output classes.
        """
        super(MLPModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()
        # Not applied in forward(); kept so callers can turn logits into
        # probabilities with model.softmax(logits).
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask):
        """
        Return unnormalised class scores (logits), one row per input example.

        Logits are returned instead of softmax probabilities because
        nn.CrossEntropyLoss applies log-softmax itself. The argmax, and so the
        predicted class, is the same either way.
        """
        # BERT model to extract features
        with torch.no_grad():
            outputs = bert_model(input_ids, attention_mask=attention_mask)
            last_hidden_state = outputs.last_hidden_state[:, 0, :]  # Use [CLS] token embedding
        x = self.relu(self.fc1(last_hidden_state))
        return self.fc2(x)

def preprocess_images(image_folder, dataset):
    """
    Preprocess images by extracting and cleaning OCR output for each row in the dataset.

    Args:
        image_folder: Directory holding the image files.
        dataset: DataFrame with ``image_link`` and ``entity_name`` columns.
            The file name is taken as the part of ``image_link`` after the
            last ``/``.

    Returns:
        A list with one entry per row, in row order: the cleaned value, or
        ``""`` when the link is not a string or the image file is missing.
    """
    predictions = []
    total = len(dataset)
    rows = zip(dataset['image_link'], dataset['entity_name'])

    # Progress is counted by position, not by index label, so it stays
    # correct for filtered frames and works with non-integer indexes.
    for position, (image_link, entity_name) in enumerate(rows, start=1):
        logging.info(f"Processing row {position}/{total}")

        # A missing link (e.g. NaN) has no .split(); record an empty result.
        if not isinstance(image_link, str):
            logging.error(f"Invalid image link: {image_link!r}")
            predictions.append("")
            continue

        image_path = os.path.join(image_folder, image_link.split('/')[-1])

        if not os.path.exists(image_path):
            logging.error(f"Image not found: {image_path}")
            predictions.append("")
            continue

        # Apply OCR and clean the text
        extracted_text = extract_text(image_path)
        predictions.append(clean_text(extracted_text, entity_name))

    return predictions

# End-to-end run: OCR the training images, encode the labels, train the MLP
# on frozen BERT features, and report a weighted F1 score on a 20% hold-out.
try:
    # Load datasets
    logging.info("Loading training data")
    train_df = pd.read_csv("dataset/train.csv")

    # Process training data
    logging.info("Preprocessing training images")
    train_predictions = preprocess_images("images", train_df)
    train_df['predictions'] = train_predictions

    # Prepare data for MLP
    logging.info("Preparing data for MLP")
    texts = train_df['predictions'].values

    le = LabelEncoder()
    labels = le.fit_transform(train_df['ground_truth'].values)  # Encode labels to integers

    # Train-test split
    X_train, X_val, y_train, y_val = train_test_split(texts, labels, test_size=0.2, random_state=42)

    # Seed torch as well so weight init and shuffling repeat across runs.
    torch.manual_seed(42)

    # Tokenizer setup
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    # Must stay a module-level name: MLPModel.forward reads `bert_model`.
    bert_model = BertModel.from_pretrained('bert-base-uncased')  # Initialize BERT model
    bert_model.eval()  # BERT is a frozen feature extractor here

    # Create dataset and dataloader
    train_dataset = CustomDataset(X_train, y_train, tokenizer=tokenizer, max_len=512)
    val_dataset = CustomDataset(X_val, y_val, tokenizer=tokenizer, max_len=512)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

    # Initialize model
    input_dim = 768  # BERT hidden dimension
    hidden_dim = 50
    output_dim = len(le.classes_)
    model = MLPModel(input_dim, hidden_dim, output_dim)

    # Define loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Train model
    logging.info("Training MLP model")
    for epoch in range(10):  # Number of epochs
        model.train()
        running_loss = 0.0
        num_batches = 0

        # The batch variable has its own name so the encoded `labels` array
        # above is not overwritten.
        for input_ids, attention_mask, batch_labels in train_loader:
            # CrossEntropyLoss needs int64 class indices.
            batch_labels = torch.as_tensor(batch_labels).long()

            # Forward pass
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, batch_labels)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            logging.warning(f"Epoch {epoch + 1}: no training batches")
            continue
        # Mean loss over the epoch rather than the last batch only.
        logging.info(f"Epoch {epoch + 1}: Loss = {running_loss / num_batches}")

    # Evaluate model
    model.eval()
    val_preds = []
    val_true = []
    with torch.no_grad():
        for input_ids, attention_mask, batch_labels in val_loader:
            outputs = model(input_ids, attention_mask)
            predicted = outputs.argmax(dim=1)
            val_preds.extend(predicted.tolist())
            val_true.extend(torch.as_tensor(batch_labels).tolist())

    val_f1 = f1_score(val_true, val_preds, average='weighted')
    logging.info(f"Validation F1 Score: {val_f1}")

except Exception as e:
    # Log with the traceback and re-raise: swallowing the error would leave
    # later cells running against missing or half-built state.
    logging.exception(f"An error occurred: {str(e)}")
    raise

Progress: |██████████████████████████████████████████████████| 100.0% Complete

  net.load_state_dict(copyStateDict(torch.load(trained_model, map_location=device)))
  state_dict = torch.load(model_path, map_location=device)
