In [1]:
import pandas as pd
import json
import re
from nltk.tokenize import RegexpTokenizer

In [2]:
def __tokenize(text: str) -> str:
        """tokenize a text into tokens and join them on ' '

        :param text: text to be tokenized
        :type text: str
        :return: preprocessed string
        :rtype: str
        """
        # for now simple tokenizer keeping only word characters with length > 1
        tokenizer = RegexpTokenizer(r'\w+')
        token_list = tokenizer.tokenize(text)

        return ' '.join([w.lower() for w in token_list if len(w) >= 1])

def process_text_data(df, text_columns):
        """
        process text data
        :param text_columns: list[str]
        :return: self object
        :rtype: TProcessorIngredient
        """
        if df is not None:
            #df = df.dropna()
            df = df.fillna('')
            df[text_columns] = df[text_columns].astype(str)
            df['features'] = df[text_columns].apply(lambda x: ' '.join(x), axis=1)

            df['features'] = df['features']\
                .str.lower()\
                .str.replace('ø? ?[0-9]*(,|\\.)*[0-9]+ ?(l|ml|cl|g|mg|cm|gr|er) ', ' ')\
                .str.replace('\'s', ' ')\
                .str.replace('(7 up|7-up)', '7up')\
                .str.replace('<br> ?(-|\\+)?', ' ')

            df['features'] = \
                df['features'].str.split(' ')

            df['features'] = \
                [[re.sub('\W+', ' ', each) for each in t] for t in df['features']]

            df['features'] = \
                [[re.sub('[0-9]+ ?$', ' ', each) for each in t] for t in df['features']]

            df['features'] = \
                [[word.strip() for word in s] for s in df['features']]

            df['features'] = \
                [[word for word in s if len(word) >= 2] for s in df['features']]

            df['features'] = \
                df['features'].apply(lambda s: list(filter(None, s)))

            df['features'] = \
                [' '.join(s) for s in df['features'].values]

            # df['features'] = [spell_correction.naive_correction(word) for word in df['features'].values]

            df['features_token'] = df['features'].apply(__tokenize)

        return df
    
def doccano_to_defined_format(train_path):
    f = open(train_path, 'r', encoding='utf-8')
    lines = []
    for line in f:
        line = json.loads(line)
        
        text = line['text']
        token_offsets = line['labels']
        tokens = []
        for token_off in token_offsets:
            token = text[token_off[0]:token_off[1]].strip()
            tokens.append([token, [token_off[0], token_off[0]+len(token), token_off[-1]]])
        lines.append([text, tokens])
    return lines



def mapping_entities_to_unprocessed_data(row):
    text = row.original_name
    tags = row.tag
    tags = sorted(row.tag, key=lambda x: x[1][0])
    entities = []
    for tag in tags:
        tag_tokens = tag[0].split(' ')
        tag_loc = tag[1]
        start_loc = 0
        end_loc = entities[-1][1] if len(entities)>0 else 0
        flag = 1
        try:
            for token in tag_tokens:
                # baguettebrot - baguette + brot
                patterns = re.compile( r'\b' + token + r'\w*\b')
                start_loc_new, end_loc = patterns.search(text, end_loc).span()
                if flag:
                    start_loc=start_loc_new
                    flag =0

            entities.append([start_loc, end_loc, tag_loc[-1]])
            if(end_loc-start_loc < tag_loc[1]-tag_loc[0]):
                print("MATCHED TOKEN SORTER THAN ORIGINAL: ",
                      text, tag,
                      '(', start_loc, ',', end_loc, ')')
        except:
            continue
            print("MIDDLE STRING ING", text, tag)
        
    
    matched = 0
    if(len(entities) == len(tags)):
        matched = 1
        
    start_len = len(set([item[0] for item in entities]))
    end_len = len(set([item[1] for item in entities]))
    ent_len = len(entities) 
    if ((start_len != ent_len) | (end_len != ent_len)):
        print("DUPLICATES PROBLEM: ", text)
        print(tags)
        print(' |'.join([' |'.join([text[entity[0]:entity[1]],
                                    str(entity[0]),
                                    str(entity[1])]) for entity in entities]))
        print('')
        
    return pd.Series({'text':row.naam + ' ' + row.omschrijving, 'entities':entities, 'match_all': matched})

def get_file_path(lst):
    return '/'.join(lst)
 

In [3]:
def save_to_file(df, file_name):
    f = open(filename, 'w')
    for _, row in df[['text', 'entities']].rename(columns ={'entities':'labels'}).iterrows():
        tmp = json.dumps(row.to_dict(), ensure_ascii=False)
        f.write(tmp)
        f.write("\n")

In [4]:
meta_data = {
    "path_ner_data_folder": "ner_datasets",
    "path_annotated_data_file" : "FULL_SET_CORRECTED.json1",
    "path_data_folder": "data",
    "path_prod_file": "product_DE.csv"
}

In [29]:
df = pd.read_csv('data/verified_original_data.csv')

In [30]:
df['origin_lpn'] = df['origin_lpn'].str.strip()

In [41]:
df = df.drop_duplicates(subset='origin_lpn')

In [43]:
df_train = pd.DataFrame(doccano_to_defined_format('ner_datasets/TRAIN_FINAL_CORRECTED.json1'), columns=['origin_lpn', 'tag'])
df_train = df_train.drop_duplicates(subset='origin_lpn')

In [25]:
df.drop(['processed_text', 'tag'], axis=1).merge(df_train, on='origin_lpn')#.to_csv('data/verified_original_data_new.csv', index=False)

Unnamed: 0,product_id,naam,omschrijving,lower_lpn,origin_lpn,tag
0,5874771,Sprite,1.0 l,sprite 1.0 l,Sprite 1.0 l,"[[Sprite, [0, 6, Brand]]]"
1,5874771,Sprite,1.0 l,sprite 1.0 l,Sprite 1.0 l,"[[Sprite, [0, 6, Brand]]]"
2,18768851,Sprite,"0,33 l","sprite 0,33 l","Sprite 0,33 l","[[Sprite, [0, 6, Brand]]]"
3,18768851,Sprite,"0,33 l","sprite 0,33 l","Sprite 0,33 l","[[Sprite, [0, 6, Brand]]]"
4,10929132,Sprite,"0,5 l","sprite 0,5 l","Sprite 0,5 l","[[Sprite, [0, 6, Brand]]]"
...,...,...,...,...,...,...
4134,273394278,"Pizza Fritz [Klein, Ø 20cm]","mit Gyros, Peperoni, Zwiebeln und Thunfisch","pizza fritz [klein, ø 20cm] mit gyros, peperon...","Pizza Fritz [Klein, Ø 20cm] mit Gyros, Peperon...","[[Pizza, [0, 5, Ing]], [Gyros, [32, 37, Ing]],..."
4135,281334468,Ben & Jerry's Salted Caramel Brownie Topped [5...,Das Salted Caramel Brownie Topped besteht aus ...,ben & jerry's salted caramel brownie topped [5...,Ben & Jerry's Salted Caramel Brownie Topped [5...,"[[Ben & Jerry, [0, 11, Brand]], [Salted Carame..."
4136,240625374,Kiddy Box,• 3 Chicken Nuggets <br> • Pommes frites <br> ...,kiddy box • 3 chicken nuggets <br> • pommes fr...,Kiddy Box • 3 Chicken Nuggets <br> • Pommes fr...,"[[Chicken Nuggets, [14, 29, Ing]], [Pommes fri..."
4137,66798435,Ben & Jerry's Chocolate Fudge Brownie 500ml,sündhaft cemige Schokoladenkuchen-Eiscreme,ben & jerry's chocolate fudge brownie 500ml sü...,Ben & Jerry's Chocolate Fudge Brownie 500ml sü...,"[[Ben & Jerry, [0, 11, Brand]], [Chocolate Fud..."


In [24]:
df_train

Unnamed: 0,origin_lpn,tag
0,"Sprite 1,0l","[[Sprite, [0, 6, Brand]]]"
1,"Sprite 0,33l","[[Sprite, [0, 6, Brand]]]"
2,Sprite 1.5l,"[[Sprite, [0, 6, Brand]]]"
3,Sprite 1l,"[[Sprite, [0, 6, Brand]]]"
4,"Sprite 0,5l","[[Sprite, [0, 6, Brand]]]"
...,...,...
10321,Ben & Jerry's Chocolate Fudge Brownie 500ml sü...,"[[Ben & Jerry, [0, 11, Brand]], [Chocolate Fud..."
10322,"Pizza Ramazotti [Klein, Ø 20cm] mit Salami, Sc...","[[Pizza, [0, 5, Ing]], [Salami, [36, 42, Ing]]..."
10323,"Warsteiner Pils 0,5l","[[Warsteiner, [0, 10, Brand]], [Pils, [11, 15,..."
10324,"Warsteiner Pils 0,3l","[[Warsteiner, [0, 10, Brand]], [Pils, [11, 15,..."


In [10]:
data_annotation = doccano_to_defined_format(get_file_path(
    [meta_data['path_ner_data_folder'], 
     meta_data['path_annotated_data_file']]))
df_annotation = pd.DataFrame(data_annotation, columns = ['processed_text', 'tag'])
print(df_annotation.shape)
df_annotation = df_annotation.drop_duplicates(subset='processed_text')
print(df_annotation.shape)

(3999, 2)
(2754, 2)


In [13]:
df_original = pd.read_csv(get_file_path(
    [meta_data['path_data_folder'], 
     meta_data['path_prod_file']])).fillna('')
df_original.shape

(9459170, 3)

In [14]:
df_original['original_name'] = df_original.apply(lambda x: x.naam + ' ' + x.omschrijving, axis=1).str.lower() 
df_original = df_original.drop_duplicates(subset='original_name')
df_original.shape

(4828107, 4)

In [18]:
df_original = process_text_data(df_original, ['original_name'])
df_original.shape

(4828107, 6)

In [49]:
df_final[['product_id', 'naam', 'omschrijving', 'original_name', 'processed_text', 'tag']].to_csv('data/verified_original_data.csv', index=False)

In [22]:
df_final = df_annotation.merge(df_original, left_on='processed_text', right_on='features_token')
df_final.shape

(12977, 8)

In [29]:
df_result = df_final.apply(mapping_entities_to_unprocessed_data , axis=1)
df_result.shape

(12977, 3)

In [30]:
df_result

Unnamed: 0,text,entities,match_all
0,"Sprite 1,0l","[[0, 6, Brand]]",1
1,"Sprite 0,33l","[[0, 6, Brand]]",1
2,Sprite 1.5l,"[[0, 6, Brand]]",1
3,Sprite 1l,"[[0, 6, Brand]]",1
4,"Sprite 0,5l","[[0, 6, Brand]]",1
...,...,...,...
12972,Ben & Jerry's Strawberry Cheesecake 500ml Erdb...,"[[0, 11, Brand], [14, 35, Flavor], [115, 126, ...",1
12973,Ben & Jerry's Strawberry Cheesecake 100ml Erdb...,"[[0, 11, Brand], [14, 35, Flavor], [115, 126, ...",1
12974,Ben & Jerry's Strawberry Cheesecake 500ml Erdb...,"[[0, 11, Brand], [14, 35, Flavor], [114, 125, ...",1
12975,"Ouzo Spezial-Teller mit Gyros, Schnitzel, Hähn...","[[0, 4, Ing], [24, 29, Ing], [31, 40, Ing], [4...",1


In [25]:
save_to_file(df_result, 'doccano_format_2.jsonl')

NameError: name 'filename' is not defined

[['Sprite 1,0l ', {'entities': [[0, 6, 'Brand']]}],
 ['Sprite 0,33l ', {'entities': [[0, 6, 'Brand']]}],
 ['Sprite 1.5l ', {'entities': [[0, 6, 'Brand']]}],
 ['Sprite 1l ', {'entities': [[0, 6, 'Brand']]}],
 ['Sprite 0,5l ', {'entities': [[0, 6, 'Brand']]}],
 ['Sprite 0,2l ', {'entities': [[0, 6, 'Brand']]}],
 ['Sprite 1.0 l', {'entities': [[0, 6, 'Brand']]}],
 ['Sprite 1,5l ', {'entities': [[0, 6, 'Brand']]}],
 ['Sprite - 1,0 l ', {'entities': [[0, 6, 'Brand']]}],
 ['Sprite 0,33 l', {'entities': [[0, 6, 'Brand']]}],
 ['Sprite 0.33l ', {'entities': [[0, 6, 'Brand']]}],
 ['Sprite 1,5 l ', {'entities': [[0, 6, 'Brand']]}],
 ['Sprite 1.0l ', {'entities': [[0, 6, 'Brand']]}],
 ['Sprite ', {'entities': [[0, 6, 'Brand']]}],
 ['1l Sprite ', {'entities': [[3, 9, 'Brand']]}],
 ['Sprite 0,3l ', {'entities': [[0, 6, 'Brand']]}],
 ['Sprite 1.00 l ', {'entities': [[0, 6, 'Brand']]}],
 ['Sprite 0,5 l', {'entities': [[0, 6, 'Brand']]}],
 ['Sprite 0.5l ', {'entities': [[0, 6, 'Brand']]}],
 ['Sprite 1,

In [31]:
df_result.apply(lambda )

Unnamed: 0,text,entities,match_all
0,"Sprite 1,0l","[[0, 6, Brand]]",1
1,"Sprite 0,33l","[[0, 6, Brand]]",1
2,Sprite 1.5l,"[[0, 6, Brand]]",1
3,Sprite 1l,"[[0, 6, Brand]]",1
4,"Sprite 0,5l","[[0, 6, Brand]]",1
...,...,...,...
12972,Ben & Jerry's Strawberry Cheesecake 500ml Erdb...,"[[0, 11, Brand], [14, 35, Flavor], [115, 126, ...",1
12973,Ben & Jerry's Strawberry Cheesecake 100ml Erdb...,"[[0, 11, Brand], [14, 35, Flavor], [115, 126, ...",1
12974,Ben & Jerry's Strawberry Cheesecake 500ml Erdb...,"[[0, 11, Brand], [14, 35, Flavor], [114, 125, ...",1
12975,"Ouzo Spezial-Teller mit Gyros, Schnitzel, Hähn...","[[0, 4, Ing], [24, 29, Ing], [31, 40, Ing], [4...",1
