In [86]:
from transformers import pipeline
import pandas as pd
import re
from tqdm import tqdm

In [2]:
qa_pipeline = pipeline('question-answering', model="deepset/roberta-base-squad2-distilled", device='cuda')
context = ""
question = "What is the %% mentioned in the text?"



In [4]:
texts = pd.read_excel("../dataset/extracted_texts.xlsx")
texts.head()

Unnamed: 0,Extracted Text
0,"1Q"" 7.1"" 9.8"" { 10"" 11.8"" 14.5"" 7.9"" 11.8"" 1530DI"
1,45cml17.72in 70cm/27 .69in 1 .56in
2,24*36inch 20*30inch 16*24inch 12*18inch 08*12i...
3,PRODUCT SPECIFICATIONS QUANTITY: X2 X2 1= 2.75...
4,ARVEdc PaoR IELARY VEDICEE ConJoldo fay HauT ...


In [87]:
def process_text(answer, entity):
    # Clean up the output of the LLM
    entity_unit_map = {
        "width": {"centimetre", "foot", "millimetre", "metre", "inch", "yard"},
        "depth": {"centimetre", "foot", "millimetre", "metre", "inch", "yard"},
        "height": {"centimetre", "foot", "millimetre", "metre", "inch", "yard"},
        "item_weight": {
            "milligram",
            "kilogram",
            "microgram",
            "gram",
            "ounce",
            "ton",
            "pound",
        },
        "voltage": {"millivolt", "kilovolt", "volt"},
        "wattage": {"kilowatt", "watt"},
        "item_volume": {
            "cubic foot",
            "microlitre",
            "cup",
            "fluid ounce",
            "centilitre",
            "imperial gallon",
            "pint",
            "decilitre",
            "litre",
            "millilitre",
            "quart",
            "cubic inch",
            "gallon",
        },
        "maximum_weight_recommendation": {
            "milligram",
            "kilogram",
            "microgram",
            "gram",
            "ounce",
            "ton",
            "pound",
        },
    }
    unit_short_form = {
        "cm": "centimetre",
        "mm": "millimetre",
        "m": "metre",
        "in": "inch",
        "ft": "foot",
        "yd": "yard",
        "g": "gram",
        "kg": "kilogram",
        "mg": "milligram",
        "lb": "pound",
        "oz": "ounce",
        "ton": "ton",
        "ug": "microgram",
        "lbs": "pound",
        "9": "gram",
        "ozs": "ounce",
        "mv": "millivolt",
        "kv": "kilovolt",
        "v": "volt",
        "kw": "kilowatt",
        "w": "watt",
        "cf": "cubic foot",
        "ul": "microlitre",
        "fl oz": "fluid ounce",
        "cl": "centilitre",
        "gal": "imperial gallon",
        "pt": "pint",
        "dl": "decilitre",
        "l": "litre",
        "ml": "millilitre",
        "qt": "quart",
        "cu in": "cubic inch",
        "gals": "gallon",
        "c in": "cubic inch",
        "cu ft": "cubic foot",
    }
    entity_unit = entity_unit_map[entity]
    regex_map = {
        "item_weight": r"[0-9IJO]+(\.[0-9IJO]+)?\s*(g|kg|mg|ug|oz|ton|lb)",
        "width": r"[0-9IJO]+(\.[0-9IJO]+)?\s*(cm|mm|m|in|ft|yd)",
        "depth": r"[0-9IJO]+(\.[0-9IJO]+)?\s*(cm|mm|m|in|ft|yd)",
        "height": r"[0-9IJO]+(\.[0-9IJO]+)?\s*(cm|mm|m|in|ft|yd)",
        "voltage": r"[0-9IJO]+(\.[0-9IJO]+)?\s*(mv|kv|v)",
        "wattage": r"[0-9IJO]+(\.[0-9IJO]+)?\s*(kw|w)",
        "item_volume": r"[0-9IJO]+(\.[0-9IJO]+)?\s*(cf|ul|fl oz|cl|gal|pt|dl|l|ml|qt|cu in|gals|c in|cu ft)",
        "maximum_weight_recommendation": r"[0-9IJO]+(\.[0-9IJO]+)?\s*(g|kg|mg|ug|oz|ton|lb)",
    }
    # Remove igh which will cause false positives
    answer = answer.replace("igh", "")
    # Match the regex ignoring case
    match = re.finditer(regex_map[entity], answer, re.IGNORECASE)
    # Process each match
    for m in match:
        # Extract the number and the unit
        # Extract the matched value
        value = m.group(0)
        # Extract the unit
        unit = re.search(
            r"\s*(cm|mm|m|in|ft|yd|g|kg|mg|ug|oz|ton|lb)", value, re.IGNORECASE
        )
        if not unit:
            continue
        unit = unit.group(0).strip()
        sstr = unit
        unit = unit.lower()
        if unit == "feet":
            unit = "foot"
        # Convert the unit to full form
        if unit in unit_short_form:
            unit = unit_short_form[unit]
        # Check if the unit is valid
        if unit in entity_unit:
            entity_value = value.replace(sstr, "").strip()
            # Replace I with 1 and O with 0
            entity_value = (
                entity_value.replace("I", "1")
                .replace("i", "1")
                .replace("J", "1")
                .replace("j", "1")
                .replace("O", "0")
                .replace("o", "0")
            )
            entity_value = float(entity_value)
            return str(entity_value) + " " + unit
    return None

In [80]:
for text in texts['Extracted Text']:
    context = text
    c_question = question.replace("%%", "item_weight")
    result = qa_pipeline(question=c_question, context=context)
    print(f"Question: {c_question}\nText: {text}\nAnswer: {result['answer']}")
    print(f"Processed Answer: {process_text(result['answer'], 'item_weight')}")
    print("")

Question: What is the item_weight mentioned in the text?
Text: 1Q" 7.1" 9.8" { 10" 11.8" 14.5" 7.9" 11.8" 1530DI
Answer: 1530DI
Processed Answer: None

Question: What is the item_weight mentioned in the text?
Text: 45cml17.72in 70cm/27 .69in 1 .56in
Answer: 70cm/27 .69in 1 .56in
Processed Answer: None

Question: What is the item_weight mentioned in the text?
Text: 24*36inch 20*30inch 16*24inch 12*18inch 08*12inch TYouVIGOT AUASTL FOR TERROR IFYOL"EGJI TAKE CARRIE J0 TF TROM ATASTEFO IFRKOR TNT CARRIn TonIn FrOV Horbot CARRIF MecietJekROT CARRIF CARRIF CARRIF CARRIF
Answer: 24*36inch 20*30inch
Processed Answer: None

Question: What is the item_weight mentioned in the text?
Text: PRODUCT SPECIFICATIONS QUANTITY: X2 X2 1= 2.75inch MATERLAL: Bamboo SIZE: 2.75" MATERLAL: 01 V Glass 2295" 1 8' CAPACITY: h160Z COLOR: Clear 2.95inch
Answer: 2.75inch
Processed Answer: None

Question: What is the item_weight mentioned in the text?
Text: ARVEdc PaoR  IELARY VEDICEE ConJoldo fay HauT @8 SneoxhMrar

In [81]:
# edge_cases = [
#     "2 x Mint Buttons 95g",
#     "1 Kg 22242",
#     "ktikin I5kG",
#     "2ml/o.O6oz",
#     "6x17g",
#     "J g",
#     "10Og",
#     "0.24 LBS",
#     "36 9",
#     "5KG WATERPROOF LOAD BEARING",
#     "MiLB/454 G DJARK ROAST BLUE 1007",
# ]
# for case in edge_cases:
#     print(f"Edge Case: {case}\nProcessed Answer: {process_text(case, 'item_weight')}")
#     print("")

Edge Case: 2 x Mint Buttons 95g
Processed Answer: 95.0 gram

Edge Case: 1 Kg 22242
Processed Answer: 1.0 kilogram

Edge Case: ktikin I5kG
Processed Answer: 15.0 kilogram

Edge Case: 2ml/o.O6oz
Processed Answer: 0.06 ounce

Edge Case: 6x17g
Processed Answer: 17.0 gram

Edge Case: J g
Processed Answer: 1.0 gram

Edge Case: 10Og
Processed Answer: 100.0 gram

Edge Case: 0.24 LBS
Processed Answer: 0.24 pound

Edge Case: 36 9
Processed Answer: None

Edge Case: 5KG WATERPROOF LOAD BEARING
Processed Answer: 5.0 kilogram

Edge Case: MiLB/454 G DJARK ROAST BLUE 1007
Processed Answer: 1.0 pound



In [83]:
train_data = pd.read_csv("../dataset/new_train.csv")
train_data.head()

Unnamed: 0.1,Unnamed: 0,image_link,group_id,entity_name,entity_value,extracted_text
0,0,https://m.media-amazon.com/images/I/61I9XdN6OF...,748919,item_weight,500.0 gram,PROPOS' NATUREJ INGREDIENT MENAGER MULTI-USAGE...
1,1,https://m.media-amazon.com/images/I/71gSRbyXmo...,916768,item_volume,1.0 cup,LLa eelE=_ 62 R RRIFICH LEBENSMITTELECHT Cwwv ...
2,2,https://m.media-amazon.com/images/I/61BZ4zrjZX...,459516,item_weight,0.709 gram,COMPOSITION Serving Size: Tablet (0.709 g) Eac...
3,3,https://m.media-amazon.com/images/I/612mrlqiI4...,459516,item_weight,0.709 gram,3 3 1 1 I I IW! # 5833 1 3 1 1 1 H 0 L 1 W # I...
4,4,https://m.media-amazon.com/images/I/617Tl40LOX...,731432,item_weight,1400 milligram,Horbaach' HIGH StRE NGTH PSYLLIUM HUSK PLANTAG...


In [122]:
TP, FP, FN, TN = 0, 0, 0, 0
for index, row in tqdm(train_data.iterrows(), total=len(train_data)):
    text = row["extracted_text"]
    if pd.isna(text):
        continue
    entity_name = row["entity_name"]
    context = text
    c_question = question.replace("%%", entity_name)
    result = qa_pipeline([{"question": c_question, "context": context}])
    OUT = process_text(result["answer"], entity_name)
    GT = row["entity_value"]
    # True Positive
    if GT != "" and OUT != "" and GT == OUT:
        TP += 1
    # False Positive
    if (GT == "" and OUT != "") or (GT != "" and OUT != "" and GT != OUT):
        FP += 1
    # False Negative
    if GT != "" and OUT == "":
        FN += 1
    # True Negative
    if GT == "" and OUT == "":
        TN += 1

100%|██████████| 6782/6782 [02:47<00:00, 40.47it/s]


In [92]:
precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1 = 2 * precision * recall / (precision + recall)
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1: {f1}")

Precision: 0.19142645971914266
Recall: 1.0
F1: 0.32133995037220847


In [93]:
print(f"TP: {TP}")
print(f"FP: {FP}")
print(f"FN: {FN}")
print(f"TN: {TN}")

TP: 1295
FP: 5470
FN: 0
TN: 0
