In [35]:
#!pip install pytesseract
#!pip install easyocr

In [69]:
import re
#import constants
import os
import pandas as pd
import multiprocessing
import time
from time import time as timer
#from tqdm import tqdm
import numpy as np
from pathlib import Path
from functools import partial
#import requests
import urllib
from PIL import Image
import easyocr
import cv2

In [70]:
def download_image(image_link, save_folder, retries=3, delay=3):
    """Download ``image_link`` into ``save_folder``, with retries and a fallback.

    The file keeps the last path component of the link as its name. Nothing is
    done when ``image_link`` is not a string or when the target file already
    exists. If every attempt fails, ``create_placeholder_image`` is asked to
    write a stand-in image at the same path.

    Args:
        image_link: URL of the image; non-string values are ignored.
        save_folder: Directory to store the image in (created when missing).
        retries: Number of download attempts.
        delay: Seconds to wait between two failed attempts.

    Returns:
        None.
    """
    # `import urllib` alone does not guarantee that the `request` submodule is
    # loaded; import it explicitly so `urllib.request` always resolves.
    import urllib.request

    if not isinstance(image_link, str):
        return

    filename = Path(image_link).name
    image_save_path = os.path.join(save_folder, filename)

    if os.path.exists(image_save_path):
        return

    # Without the folder every attempt (and the placeholder) would fail.
    if save_folder:
        os.makedirs(save_folder, exist_ok=True)

    for attempt in range(retries):
        try:
            urllib.request.urlretrieve(image_link, image_save_path)
            return
        except Exception as e:
            print(e)
            # Waiting is only useful when another attempt follows.
            if attempt < retries - 1:
                time.sleep(delay)

    create_placeholder_image(image_save_path) #Create a black placeholder image for invalid links/images


In [71]:
def create_placeholder_image(image_save_path):
    """Write a 100x100 black RGB image to ``image_save_path``.

    This is a best-effort helper: a failure to build or save the image is
    reported on stdout instead of being raised, so the caller keeps running.

    Args:
        image_save_path: Destination path of the placeholder file.

    Returns:
        None.
    """
    try:
        placeholder_image = Image.new('RGB', (100, 100), color='black')
        placeholder_image.save(image_save_path)
    except Exception as e:
        # Report instead of hiding the problem; later steps that read this
        # path would otherwise fail with no hint about the cause.
        print(f"Could not create placeholder image at {image_save_path}: {e}")


In [72]:
# STEP 1 : Normalization
# Min-max normalization of the colour image (no adaptive threshold involved)

def step_1(image_path):
    """Stretch the image at ``image_path`` to the 0-255 range and overwrite it.

    Any exception raised while reading, normalizing or writing is printed and
    not propagated.
    """
    try:
        # Load the colour image from disk
        source_img = cv2.imread(image_path)

        # Min-max settings: lowest value maps to 0, highest to 255
        settings = dict(alpha=0, beta=255, norm_type=cv2.NORM_MINMAX)
        stretched_img = cv2.normalize(source_img, None, **settings)

        # The result replaces the input file
        cv2.imwrite(image_path, stretched_img)
    except Exception as err:
        print(err)


In [64]:
# #STEP 3 (disabled) : IMAGE SCALING (shrinks images wider than 1024 px and saves them tagged as 300 DPI)
# def set_image_dpi(image_path):
    
#     try : 
#         # Open the image
#         im = Image.open(image_path)

#         # Resize the image
#         length_x, width_y = im.size
#         factor = min(1, float(1024.0 / length_x))
#         size = int(factor * length_x), int(factor * width_y)
#         im_resized = im.resize(size, Image.Resampling.LANCZOS)

#         # Writing output back to the input file source
#         im_resized.save(image_path, dpi=(300, 300))
    
#     except Exception as e :
#         print(e)

In [73]:
# STEP 4 : Noise Removal - removing small dots/patches with high intensity compared to the rest of the image
def remove_noise(image_path):
    """Denoise the colour image at ``image_path`` and overwrite the file.

    Raises:
        ValueError: if the image cannot be loaded from ``image_path``.
    """
    noisy_img = cv2.imread(image_path)

    # imread signals an unreadable file by returning None
    if noisy_img is None:
        raise ValueError(f"Image could not be loaded from {image_path}.")

    # Positional arguments after the image, per OpenCV's documented signature
    # (TODO confirm against the installed version):
    # dst, h, hColor, templateWindowSize, searchWindowSize
    cleaned_img = cv2.fastNlMeansDenoisingColored(noisy_img, None, 4, 0, 11, 19)

    # The denoised result replaces the input file
    cv2.imwrite(image_path, cleaned_img)


In [74]:
# STEP 5 : Grayscale (+ Threshold, currently switched off)
# Only the grayscale conversion runs; the Otsu threshold lines below are commented out.

def thresholding(image_path):
    """Overwrite the image at ``image_path`` with its grayscale version."""
    colour_img = cv2.imread(image_path)  # Load the image from disk
    # Convert BGR to grayscale and store the result over the input file
    cv2.imwrite(image_path, cv2.cvtColor(colour_img, cv2.COLOR_BGR2GRAY))

    # Disabled follow-up step, kept for reference:
    #threshold_img = cv2.threshold(grayscale_img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]  # Apply Otsu's threshold
    #cv2.imwrite(image_path, threshold_img)  # Save the thresholded image


In [79]:
# Function to deskew an image
def deskew_image(image_path):
    """Rotate the image at ``image_path`` by its median line angle, in place.

    Line segments are detected with Canny edges and a probabilistic Hough
    transform; the median of their angles is used as the rotation angle. The
    file is left untouched when the image cannot be read or when no line
    segment is found. Errors during rotation or writing are printed and not
    raised, so this step stays best-effort.

    Args:
        image_path: Path of the image to read and overwrite.

    Returns:
        None.
    """
    image = cv2.imread(image_path)

    # imread returns None for unreadable files; edge detection would crash on it
    if image is None:
        print(f"Image could not be loaded from {image_path}.")
        return

    # Apply edge detection (Canny)
    edges = cv2.Canny(image, 50, 150, apertureSize=3)

    # Apply Hough Line Transform to detect lines in the image
    lines = cv2.HoughLinesP(edges, 1, np.pi / 180, 100, minLineLength=100, maxLineGap=10)

    # No detected segment means there is nothing to estimate the skew from
    if lines is None or len(lines) == 0:
        return

    try:
        # Angle (in degrees) of every detected segment
        angles = [
            np.degrees(np.arctan2(y2 - y1, x2 - x1))
            for x1, y1, x2, y2 in (line[0] for line in lines)
        ]

        # Find the median angle (to handle outliers)
        median_angle = np.median(angles)

        # Rotate the image around its centre to deskew it
        (h, w) = image.shape[:2]
        center = (w // 2, h // 2)
        M = cv2.getRotationMatrix2D(center, median_angle, 1.0)
        deskewed_image = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)

        cv2.imwrite(image_path, deskewed_image)
    except Exception as e:
        # Best-effort step: report the problem and keep the original file
        print(e)

In [44]:
# # ADITI CODE

# import spacy

# nlp = spacy.load("en_core_web_sm")

# allowed_units = {
#     'width': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
#     'depth': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
#     'height': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
#     'item_weight': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
#     'maximum_weight_recommendation': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
#     'voltage': {'kilovolt', 'millivolt', 'volt'},
#     'wattage': {'kilowatt', 'watt'},
#     'item_volume': {'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre', 'fluid ounce', 'gallon', 
#                     'imperial gallon', 'litre', 'microlitre', 'millilitre', 'pint', 'quart'}
# }

# unit_prefixes = {
#     'cm': 'centimetre', 'mm': 'millimetre', 'm': 'metre', 'kg': 'kilogram',
#     'g': 'gram', 'mg': 'milligram', 'µg': 'microgram', 'l': 'litre',
#     'ml': 'millilitre', 'cl': 'centilitre', 'dl': 'decilitre', 'µl': 'microlitre',
#     'oz': 'ounce', 'lb': 'pound', 'ft': 'foot', 'in': 'inch', 'yd': 'yard',
#     'gal': 'gallon', 'pt': 'pint', 'qt': 'quart', 'kv': 'kilovolt',
#     'mv': 'millivolt', 'v': 'volt', 'kw': 'kilowatt', 'w': 'watt'
# }

# spl_plurals = {
#     'inches': 'inch', 'feet': 'foot', 'pounds': 'pound', 'ounces': 'ounce',
#     'liters': 'litre', 'cubic feet': 'cubic foot', 'cubic inches': 'cubic inch',
#     'fluid ounces': 'fluid ounce', 'meters': 'metre', 'grams': 'gram'
# }


# def normalize_unit(unit, entity_name):
#     unit = unit.lower()
#     # check for spl plurals, then remove s for normal plurals then give out whatever entity init
#     if unit in spl_plurals:
#         normalized = spl_plurals[unit]
#         if normalized in allowed_units.get(entity_name, set()):
#             return normalized
    
#     singular = unit.rstrip('s')
#     if singular in allowed_units.get(entity_name, set()):
#         return singular
    
#     for prefix, full_form in unit_prefixes.items():
#         if unit.startswith(prefix.lower()) and full_form in allowed_units.get(entity_name, set()):
#             return full_form
    
#     return ""

# # gets number + unit -> gives unit to normalize
# def extract_measurement(text, entity_name):
#     doc = nlp(text)
    
#     quantity = None
#     unit = None
    
#     for token in doc:
#         if token.like_num and not quantity:
#             quantity = token.text
#         elif not unit:
#             unitpart = token.text.lower()
#             normalized_unit = normalize_unit(unitpart, entity_name)
#             if normalized_unit:
#                 unit = normalized_unit
    
    
#     if quantity and unit:
#         result = f"{quantity} {unit}"
#         return result
    
#     return ""

# # # dummy
# # test_cases = [
# #     ([(189, 75), (469, 75), (469, 165), (189, 165)], '200 pounds', 'item_weight'),
# #     ([(100, 100), (200, 100), (200, 200)], '150 inches', 'height'),
# #     ([(50, 50), (150, 50), (150, 150)], '2.5 liters', 'item_volume'),
# #     ([(75, 75), (175, 75), (175, 175)], '500 inch', 'item_volume'), 
# #     ([(25, 25), (125, 25), (125, 125)], '6 ft', 'width'),
# #     ([(10, 10), (110, 10), (110, 110)], '1000 millivolts', 'voltage')
# # ]

# # # Process test cases
# # for bbox, text, entity in test_cases:
# #     prediction = extract_measurement(text, entity)
# #     print(f"Final Prediction: {prediction}")

In [76]:
import re
import spacy

# Load the spaCy pipeline "en_core_web_sm" once, when this cell runs, and keep
# it in the module-level name `nlp`. The model package has to be installed
# beforehand (e.g. `python -m spacy download en_core_web_sm`).
nlp = spacy.load("en_core_web_sm")

# Entity-to-unit mappings
# Entities that measure the same kind of quantity accept the same units, so
# each family is listed once and copied into a separate set per entity.
_LENGTH_UNITS = ("centimetre", "foot", "millimetre", "metre", "inch", "yard")
_WEIGHT_UNITS = ("milligram", "kilogram", "microgram", "gram", "ounce", "ton", "pound")
_VOLTAGE_UNITS = ("millivolt", "kilovolt", "volt")
_WATTAGE_UNITS = ("kilowatt", "watt")
_VOLUME_UNITS = (
    "cubic foot", "microlitre", "cup", "fluid ounce", "centilitre",
    "imperial gallon", "pint", "decilitre", "litre", "millilitre",
    "quart", "cubic inch", "gallon",
)

entity_unit_map = {
    entity: set(units)
    for entity, units in (
        ("width", _LENGTH_UNITS),
        ("depth", _LENGTH_UNITS),
        ("height", _LENGTH_UNITS),
        ("item_weight", _WEIGHT_UNITS),
        ("maximum_weight_recommendation", _WEIGHT_UNITS),
        ("voltage", _VOLTAGE_UNITS),
        ("wattage", _WATTAGE_UNITS),
        ("item_volume", _VOLUME_UNITS),
    )
}

# Common unit prefixes and their full forms
unit_prefixes = dict((
    # length
    ('cm', 'centimetre'), ('mm', 'millimetre'), ('m', 'metre'),
    # weight
    ('kg', 'kilogram'), ('g', 'gram'), ('mg', 'milligram'), ('µg', 'microgram'),
    # metric volume
    ('l', 'litre'), ('ml', 'millilitre'), ('cl', 'centilitre'),
    ('dl', 'decilitre'), ('µl', 'microlitre'),
    # imperial weight and length
    ('oz', 'ounce'), ('lb', 'pound'), ('ft', 'foot'), ('in', 'inch'), ('yd', 'yard'),
    # imperial volume
    ('gal', 'gallon'), ('pt', 'pint'), ('qt', 'quart'),
    # electrical
    ('kv', 'kilovolt'), ('mv', 'millivolt'), ('v', 'volt'),
    ('kw', 'kilowatt'), ('w', 'watt'),
))

# Normalize the unit to match the full form and correct plural cases

# Irregular or multi-word plurals that dropping a trailing "s" cannot fix.
_SPECIAL_PLURALS = {
    'inches': 'inch',
    'feet': 'foot',
    'pounds': 'pound',
    'ounces': 'ounce',
    'liters': 'litre',
    'cubic feet': 'cubic foot',
    'cubic inches': 'cubic inch',
    'fluid ounces': 'fluid ounce',
    'meters': 'metre',
    'grams': 'gram'
}

# American "-er" endings mapped to the "-re" spelling used in entity_unit_map.
_US_TO_UK_SUFFIXES = (('meter', 'metre'), ('liter', 'litre'))


def normalize_unit(unit, entity_name):
    """Map a raw unit string to the canonical unit name allowed for an entity.

    The lookup is case-insensitive and tries, in order: the special plural
    table, the unit with trailing "s" characters removed, the same unit with
    an American "-meter"/"-liter" ending respelled as "-metre"/"-litre"
    (e.g. "centimeters" -> "centimetre"), and finally the abbreviation table
    ``unit_prefixes`` (e.g. "kg" -> "kilogram").

    Args:
        unit: Unit text as found in the OCR output.
        entity_name: Key of ``entity_unit_map`` selecting the allowed units.

    Returns:
        The canonical unit name, or "" when the unit is unknown or not
        allowed for ``entity_name`` (including an unknown ``entity_name``).
    """
    allowed_units = entity_unit_map.get(entity_name, set())
    unit = unit.lower().strip()

    # Handle special plural cases (a miss yields None, which is never allowed)
    special = _SPECIAL_PLURALS.get(unit)
    if special in allowed_units:
        return special

    # Remove plural 's' and check
    unit = unit.rstrip('s')
    if unit in allowed_units:
        return unit

    # American spelling of metre/litre based units
    for us_suffix, uk_suffix in _US_TO_UK_SUFFIXES:
        if unit.endswith(us_suffix):
            respelled = unit[:-len(us_suffix)] + uk_suffix
            if respelled in allowed_units:
                return respelled

    # Check for unit prefixes (e.g., 'kg' -> 'kilogram')
    full_form = unit_prefixes.get(unit)
    if full_form in allowed_units:
        return full_form

    return ""

# Extract quantities and units using regex
def extract_units_and_quantities(text):
    """Find every "<number><unit>" pair in ``text``.

    A number may be followed by its unit directly or after whitespace. Commas
    inside a number are interpreted as follows:

    * groups of exactly three digits ("1,000", "12,345.6") are thousands
      separators and are removed;
    * any other single comma ("151,5") is a decimal comma and becomes ".".

    Args:
        text: Text to scan, e.g. the joined OCR output of an image.

    Returns:
        A list of ``(quantity, unit)`` string tuples in order of appearance;
        ``quantity`` always uses "." as the decimal separator.
    """
    thousands_number = r'\d{1,3}(?:,\d{3})+(?:\.\d+)?'
    plain_number = r'\d+(?:[.,]\d+)?'
    # Regex to capture numbers followed by units (with or without spaces)
    pattern = rf'({thousands_number}|{plain_number})\s*([a-zA-Zµ]+)'

    matches = []
    for quantity, unit in re.findall(pattern, text):
        if re.fullmatch(thousands_number, quantity):
            quantity = quantity.replace(',', '')
        else:
            quantity = quantity.replace(',', '.')
        matches.append((quantity, unit))
    return matches

# Extract quantity and unit for a given entity
def extract_quantity_for_entity(text, entity_name):
    """Return the first measurement in ``text`` whose unit fits ``entity_name``.

    Candidate pairs come from ``extract_units_and_quantities``; each unit is
    passed through ``normalize_unit`` and the first one it accepts wins. The
    text is only scanned with the regex helper, no NLP pipeline is run on it.

    Args:
        text: Text to search, e.g. the OCR output of one image.
        entity_name: Entity whose allowed units decide which pair is valid.

    Returns:
        "<quantity> <canonical unit>", or "" when no pair qualifies.
    """
    for quantity, unit in extract_units_and_quantities(text):
        # Normalize the unit and check if it's valid for the given entity
        normalized_unit = normalize_unit(unit, entity_name)
        if normalized_unit:
            return f"{quantity} {normalized_unit}"

    return ""

# Turn the text into a labelled prediction for one entity
def process_text(text, entity_name):
    """Return "<entity_name>: <prediction>" for ``text``, or "" without a match.

    The prediction itself is whatever ``extract_quantity_for_entity`` returns
    for the same arguments.
    """
    found = extract_quantity_for_entity(text, entity_name)
    return f"{entity_name}: {found}" if found else ""


In [85]:
# ORIGINAL WORKING PADDLE-----------------------------------------
# For every data row of ./dataset/test.csv: download the image, preprocess it
# in place, run PaddleOCR on it and append [index, prediction] to test_out.csv.

import csv
from paddleocr import PaddleOCR, draw_ocr
import logging

# Build the OCR engine once; creating it per row reloads the models every time.
# `show_log` is the PaddleOCR option name (the former `show_logs` was a
# misspelling that had no effect); the logger level is kept as a second guard.
logging.getLogger("ppocr").setLevel(logging.ERROR)
ocr = PaddleOCR(use_gpu = True, use_angle_cls=True, lang='en', show_log = False) # Initialize PaddleOCR

image_dir = "./images/"
os.makedirs(image_dir, exist_ok=True)

count = 0
# newline='' is what the csv module expects for both reading and writing.
# The output file is opened once, in append mode, instead of once per row.
with open("./dataset/test.csv", 'r', newline='') as in_fh, \
        open("test_out.csv", 'a', newline='') as out_fh :
    reader = csv.reader(in_fh)
    writer = csv.writer(out_fh)

    for count, row in enumerate(reader) :
        if count == 0 :
            continue  # header row

        index, link, product_code, reqd_entity = row[0], row[1], row[2], row[3]

        download_image(link, image_dir)
        # Same file name rule as download_image, so both always agree on the path
        img_path = os.path.join(image_dir, Path(link).name)

        print(img_path)
        # Performing the preprocessing
        step_1(img_path)
        # STEP 3 (set_image_dpi) is not called: its definition is commented out
        # in this notebook, so the call raises NameError on a fresh kernel.
        # Restore the definition before re-enabling the call.
        remove_noise(img_path)
        thresholding(img_path)
        deskew_image(img_path)

        result = ocr.ocr(img_path)  # Perform OCR on the image

        # Collect the recognised words, one per line
        text_input = ""
        for line in result or [] :
            if not line :
                continue  # no text detected on this page
            for word_info in line :
                word = word_info[1][0]  # Extract the word
                text_input += word + "\n"

        print(text_input)

        # Extract the prediction for the entity requested in this row
        output = process_text(text_input, reqd_entity)
        writer.writerow([index, output])
        # Keep finished rows on disk even if a later image stops the run
        out_fh.flush()

./images/314oOH7ICvS.jpg
30cm/
60cm
cm
151,5

./images/314oOH7ICvS.jpg
151,5cm
30 cm
cm

./images/314p2NKHdLL.jpg
REBAR TYING MACHINE

./images/314p2NKHdLL.jpg
REBAR TYING MACHINE



In [46]:
# # ORIGINAL WORKING PADDLE-----------------------------------------

# import csv
# from paddleocr import PaddleOCR, draw_ocr
# import logging

# count = 0
# with open("./dataset/train.csv", 'r') as fh :
#     reader = csv.reader(fh)
#     count = 0
#     for row in reader :
#         if count != 0 and 4 <= count <= 7:
#             link = row[0]
#             reqd_entity = row[2]
#             exp_out = row[3]
            
#             download_image(link, "./images/")
#             image_name = link.split("/")[5]
#             img_path = './images/' + image_name
            
#             # Performing the preprocessing
#             step_1(img_path)
#             set_image_dpi(img_path)
#             remove_noise(img_path)
#             thresholding(img_path)
#             deskew_image(img_path)
    
#             logging.getLogger("ppocr").setLevel(logging.ERROR)
#             ocr = PaddleOCR(use_gpu = True, use_angle_cls=True, lang='en', show_logs = False) # Initialize PaddleOCR            
#             result = ocr.ocr(img_path)  # Perform OCR on the image
            

#             # Print the result
#             print(f"\n__________________{img_path}____________________\n")
#             print(f"\n__________________{reqd_entity}____________________\n")
#             print(f"\n__________________{exp_out}____________________\n")
            
#             text_input = ''
#             for line in result:
#                 for word_info in line:
#                     word = word_info[1][0]  # Extract the word
#                     #text_input += word + "\n"
                    
#                     words_to_pass = []
#                     if word.isalpha() :
#                         pass
                    
#                     elif word.isalnum() :
#                         words_in_word = word.split()
#                         for j in range(len(words_in_word)) :
#                             i = words_in_word[j]
                            
#                             # like 200ml
#                             if i.isalnum() :
#                                 words_to_pass.append(i)
                                
#                             # like 200, ml
#                             elif i.isalpha() :
#                                 try :
#                                     s = i + words_in_word[j + 1]
#                                 except :
#                                     pass
                            
                                
                            
                                
                        
#             print(text_input)
            
#             # Process the input text for all entities in entity_unit_map and print only relevant ones
#             output = process_text(text_input, reqd_entity)
#             # print(output)
# #             if output:
# #                 print(output)
# #             else : 
# #                 ''
            
#         count += 1

In [49]:
# ocr = PaddleOCR(use_angle_cls=True, lang='en', show_logs = False) # Initialize PaddleOCR
# img_path = "/home/suppra/Desktop/Amazon_ML/student_resource 3/images/hazy.jpeg"# Path to your image

# result = ocr.ocr(img_path)  # Perform OCR on the image


# # Print the result
# print(f"\n__________________{img_path}____________________\n")
# for line in result:
#     for word_info in line:
#         word = word_info[1][0]  # Extract the word
#         print(word)