In [66]:
import os
from collections import defaultdict

from transformers import pipeline

# Model configuration.
# Set the NER_MODEL_CHECKPOINT environment variable to use your own checkpoint
# without editing the notebook; the literal below is only the fallback used
# when the variable is unset.
# Alternative local checkpoint kept for reference:
#   /home/tanluuuuuuu/Desktop/luunvt/direct_indirect/models/model_from_2023-11-09/roberta-base_20:53:28/best_f1
model_checkpoint = os.environ.get(
    "NER_MODEL_CHECKPOINT",
    "/home/tanluuuuuuu/Downloads/roberta-base-v4-20231108T033610Z-001/roberta-base-v4/best_f1",
)

# Device ordinal passed to the pipeline. Defaults to 0 as before; set
# NER_DEVICE=-1 to run on CPU when no GPU is available.
ner_device = int(os.environ.get("NER_DEVICE", "0"))

# Token-classification pipeline shared by every cell below. The later cells
# read the `entity_group`, `score`, `word`, `start` and `end` keys of each
# result it returns.
token_classifier = pipeline(
    "ner",
    model=model_checkpoint,
    aggregation_strategy="simple",
    device=ner_device,
)

In [2]:
def preprocess_description(description, words_need_removed = []):
    """Space-separate words and symbols in ``description`` and drop unwanted words.

    The text is split into tokens: each maximal run of ASCII letters/digits
    is one token, and every other non-whitespace character (punctuation,
    emoji, accented letters, ...) becomes a token of its own. Tokens are
    joined with exactly one space, so the result never has leading, trailing
    or repeated spaces.

    Args:
        description: Raw text, e.g. one bullet point of a product description.
        words_need_removed: Words (e.g. a brand name) to drop from the result.
            Matching is case-insensitive and per token, so an entry containing
            a space can never match. The default list is never mutated.

    Returns:
        The normalised string; ``""`` when the input holds no tokens.
    """
    tokens = []
    current_word = []  # characters of the alphanumeric run being collected

    for letter in description:
        # Deliberately ASCII-only: anything outside [A-Za-z0-9] is a separator.
        if ('a' <= letter <= 'z') or ('A' <= letter <= 'Z') or ('0' <= letter <= '9'):
            current_word.append(letter)
            continue

        # A non-alphanumeric character ends the current word, if any.
        if current_word:
            tokens.append(''.join(current_word))
            current_word = []

        # Whitespace only separates tokens; any other symbol is kept as its
        # own token. Skipping whitespace here is what prevents empty tokens
        # (and therefore doubled or trailing spaces) in the output.
        if not letter.isspace():
            tokens.append(letter)

    if current_word:
        tokens.append(''.join(current_word))

    # Remove words from string: brand name,...
    removed = {word.lower() for word in words_need_removed}
    return ' '.join(token for token in tokens if token.lower() not in removed)

In [39]:
import json

# Replace this description
description = '''
No More Sweats - Unlike your memory foam cushion. Our advanced honeycomb grid design cooling gel coccyx seat cushion provides coolness while you are sitting, and it has a breathable cover, so you sit comfortable and cool
'''

# Entity texts that are skipped regardless of their score.
IGNORED_WORDS = {'durable', 'strong', 'heavy-duty', 'heavy duty', 'stability', 'versatile', 'comfortable'}
# Minimum pipeline score for an entity to be reported.
MIN_SCORE = 0.99

# entity group -> {"<word> <score>": None}. The inner dict is used as an
# insertion-ordered set: it removes duplicates like a set would, but keeps
# entries in the order they appear in the text, so the printed JSON is the
# same on every run (set iteration order of strings is not stable across
# kernel sessions).
high_score_ans = defaultdict(dict)
bullet_points = description.split("\n")
for bullet_point in bullet_points:
    bullet_point = preprocess_description(bullet_point)
    if bullet_point == "":
        continue
    print(bullet_point)

    results = token_classifier(bullet_point)
    for res in results:
        word = res['word'].lower().strip()
        if word in IGNORED_WORDS or res['score'] < MIN_SCORE:
            continue
        # The score is appended to the text so it shows up in the summary.
        high_score_ans[res['entity_group']][word + " " + str(res['score'])] = None

# Plain lists so the summary can be serialised as JSON.
new_high_score_ans = {group: list(entries) for group, entries in high_score_ans.items()}

print("-"*100)
print(json.dumps(new_high_score_ans, sort_keys=True, indent=4))

No More Sweats - Unlike your memory foam cushion . Our advanced honeycomb grid design cooling gel coccyx seat cushion provides coolness while you are sitting , and it has a breathable cover , so you sit comfortable and cool
----------------------------------------------------------------------------------------------------
{
    "MAT": [
        "coccyx 0.9999512"
    ],
    "NMAT": [
        "memory foam 0.9999725"
    ],
    "PROPERTY": [
        "honeycomb grid design 0.9998339",
        "breathable cover 0.99994206"
    ]
}


In [40]:
import json

# Replace this description
description = '''
♥️ Rattan Material: PE
'''

# Entity texts that are skipped regardless of their score.
IGNORED_WORDS = {'durable', 'strong', 'heavy-duty', 'heavy duty', 'stability', 'versatile', 'comfortable'}
# Minimum pipeline score for an entity to be reported.
MIN_SCORE = 0.99

# entity group -> {word: None}. The inner dict is used as an insertion-ordered
# set: it removes duplicates like a set would, but keeps entries in the order
# they appear in the text, so the printed JSON is the same on every run (set
# iteration order of strings is not stable across kernel sessions).
high_score_ans = defaultdict(dict)
bullet_points = description.split("\n")
for bullet_point in bullet_points:
    bullet_point = preprocess_description(bullet_point)
    if bullet_point == "":
        continue
    print(bullet_point)

    results = token_classifier(bullet_point)
    for res in results:
        word = res['word'].lower().strip()
        if word in IGNORED_WORDS or res['score'] < MIN_SCORE:
            continue
        # Only the entity text is stored here; append
        # " " + str(res['score']) to the key to inspect scores as well.
        high_score_ans[res['entity_group']][word] = None

# Plain lists so the summary can be serialised as JSON.
new_high_score_ans = {group: list(entries) for group, entries in high_score_ans.items()}

print("-"*100)
print(json.dumps(new_high_score_ans, sort_keys=True, indent=4))

♥ ️ Rattan Material : PE
----------------------------------------------------------------------------------------------------
{
    "MAT": [
        "rattan",
        "pe"
    ]
}


In [41]:
import json
from spacy import displacy

# Replace this description
description = '''
💪 PREMIUM QUALITY - The CAPHAUS Kettlebells are made with virgin iron ore, not scrap, and formed into a strong, balanced, single-piece casting with a flat wobble-free base. Made with a clean, consistent surface and durable powder-coat finish
💪 PRECISION CASTING - The CAPHAUS Kettlebell is cast in one solid piece, this results in a stronger, more durable handle. Precision casting creates kettlebells with no seams, burrs, welded handles or rough spots
💪 FLAT BASE - Kettlebells need to be created with a flat base for easy storage and use during floor workouts, kettlebells without flat bases will roll around which makes it hard to store and use during movements like pushups, rows and renegade rows
💪 SMOOTH HANDLES - Easy to grip and hold onto but smooth so there is no discomfort in your hands during grueling workouts. Kettlebells with rough handles will chafe your hands and make it hard to complete a workout.
💪 AVAILABLE IN – 9lb/4kg; 13lb/6kg; 18lb/8kg; 26lb/12kg; 35lb/16kg; 44lb/20kg; 53lb/24kg; 62lb/28kg; 70lb/32kg; 80lb/36kg; 88lb/40kg. Marked in KGs and LBs

'''

# Highlight colour for each entity label handed to displacy.
colors = {
    "MAT": "#EEE",
    "NMAT": "#FFD",
    "DIMENSION": "#EFD",
    "WEIGHT": "#173",
    "TARGET_USER": "#379",
    "PROPERTY": "#FCF",
    "COLOR": "#FEF",
    "SHAPE": "#C9C",
    "SIZE": "#B8E",
}
# Labels to render: every label that has a colour, in the same order.
ner_tags = list(colors)
options = {"ents": ner_tags, "colors": colors}

# Reset here as in the summary cells; nothing in this cell reads it.
high_score_ans = defaultdict(set)
bullet_points = description.split("\n")
for bullet_point in bullet_points:
    bullet_point = preprocess_description(bullet_point)
    if bullet_point == "":
        continue

    results = token_classifier(bullet_point)
    # Keep every entity whose score is not below 0.99. The start/end offsets
    # come from the pipeline and index into the preprocessed text, which is
    # also the text handed to displacy.
    list_ents = [
        {'label': ent['entity_group'], 'start': ent['start'], 'end': ent['end']}
        for ent in results
        if not (ent['score'] < 0.99)
    ]
    doc_manual = {'text': bullet_point, 'ents': list_ents}
    # manual=True: render from the dict above instead of a spaCy Doc.
    displacy.render(doc_manual, style='ent', manual=True, options=options)

In [72]:
import json
from spacy import displacy

# Replace this description
description = '''
STURDY AND DURABLE : Our pet - friendly furniture constructed of durable , high - grade decorative steel mesh ( that serve as ventilation holes ) and innovative composite plastic - making this crate strong and stable
'''

# Highlight colour for each entity label handed to displacy.
colors = {
    "MAT": "#EEE",
    "NMAT": "#FFD",
    "DIMENSION": "#EFD",
    "WEIGHT": "#173",
    "TARGET_USER": "#379",
    "PROPERTY": "#FCF",
    "COLOR": "#FEF",
    "SHAPE": "#C9C",
    "SIZE": "#B8E",
}
# Labels to render: every label that has a colour, in the same order.
ner_tags = list(colors)
options = {"ents": ner_tags, "colors": colors}

# Reset here as in the summary cells; nothing in this cell reads it.
high_score_ans = defaultdict(set)
bullet_points = description.split("\n")
for bullet_point in bullet_points:
    # NOTE: preprocess_description is not applied in this cell; each line is
    # sent to the model exactly as written (the sample text above already has
    # spaces around its punctuation).
    if bullet_point == "":
        continue

    results = token_classifier(bullet_point)
    # Show the raw pipeline output before rendering.
    display(results)

    # Keep every entity whose score is not below 0.9 (a looser cut-off than
    # the 0.99 used in the other cells).
    list_ents = [
        {'label': ent['entity_group'], 'start': ent['start'], 'end': ent['end']}
        for ent in results
        if not (ent['score'] < 0.9)
    ]
    doc_manual = {'text': bullet_point, 'ents': list_ents}
    # manual=True: render from the dict above instead of a spaCy Doc.
    displacy.render(doc_manual, style='ent', manual=True, options=options)
    print()

[{'entity_group': 'TARGET_USER',
  'score': 0.9998142,
  'word': ' pet',
  'start': 25,
  'end': 28},
 {'entity_group': 'MAT',
  'score': 0.99997246,
  'word': ' steel',
  'start': 99,
  'end': 104},
 {'entity_group': 'PROPERTY',
  'score': 0.99990344,
  'word': ' mesh',
  'start': 105,
  'end': 109},
 {'entity_group': 'MAT',
  'score': 0.99996614,
  'word': ' composite plastic',
  'start': 161,
  'end': 178}]


