In [1]:
import csv
import exrex # !pip install exrex
import json 
import pandas as pd
import numpy as np
import re 
import spacy

from tqdm import tqdm
from spacy.tokens import DocBin
from sklearn.model_selection import train_test_split

# ebablbe auto-completion
%config Completer.use_jedi = False

## Load data

In [40]:
def generate_entity(texts, patterns={}):   
    
    def entity_from_patterns(text, patterns, pattern_matchs):        
        entities = {}
#         print('-'*50)  
        for pattern in patterns:
            entities = entity_from_pattern(text, pattern, entities, pattern_matchs)
        entities = [value for _, value in sorted(entities.items(), key=lambda item: item[0][0])]
        return entities

    def entity_from_pattern(text, pattern, entities, pattern_matchs):
        expression = pattern['expression']
        label = pattern['label']
        for match in re.finditer(expression, text):
            start, end = match.span()    
            add_key = True
            if start-1 >= 0 and text[start-1]!=' ':
                add_key = False
            if end < len(text) and text[end]!=' ':
                add_key = False                
                
#             drop_keys = []
#             for key, value in entities.items():
#                 start_, end_ = key
#                 if start<start_ and end>end_:
#                     drop_keys.append(key)
#                 if start>start_ and end<end_:
#                     add_key = False
#             for key in drop_keys:
#                 entities.pop(key)

            if add_key: 
#                 print(text, expression)
                pattern_matchs[expression] = pattern_matchs[expression] + 1
                entities[(start, end)] = (start, end, label)
        return entities
        
    examples = []
    pattern_matchs = {pattern['expression']:0 for pattern in patterns}
    for text in texts:
        entities = entity_from_patterns(text, patterns, pattern_matchs)        
        example = (text, {"entities": entities})
        examples.append(example)
    pattern_matchs = [(key, value) for key, value in sorted(pattern_matchs.items(), key=lambda item: -item[1])]
#     pattern_matchs = [(key, value) for key, value in pattern_matchs if value>0]
        
    return examples, pattern_matchs

def show_entity(text_entities, pattern_matchs):
    for example in text_entities:
        print('-'*50)
        text, annotations = example
        print(text) 
        print(*[(text[start:end], start, end, label) for start, end, label in annotations['entities']], sep='\n')
#     print('='*50)
#     print(*pattern_matchs, sep='\n')
    
options = {'colors': {'ACTION':"#56D7C4", 'PART_NUM':"#92E0AA"} }

def show_text(text, annotations=None, options=options, show_detail=True, standardize_text=None):
    if standardize_text is not None:
        text = standardize_text(text)
    doc = nlp(text)
    print('='*100)
    if show_detail:
        print(doc.text) 
    spacy.displacy.render(doc, style='ent', options=options)
    if show_detail and annotations is not None and len(annotations['entities'])>0:
        print('-'*25, 'acutal entities', '-'*25)
        print(*[(text[start:end], start, end, label) for start, end, label in annotations['entities']], sep='\n')   
    if show_detail and len(doc.ents)>0:
        print('-'*25, 'predict entities', '-'*25)
        print(*[(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents], sep='\n')      

def show_texts(data, indexes=None, options=options, show_detail=True, standardize_text=None):
    def show_text_(data):
        if isinstance(data, str):
            show_text(data, None, options=options, show_detail=show_detail, standardize_text=standardize_text)
        else:
            text, annotations = data
            show_text(text, annotations, options=options, show_detail=show_detail, standardize_text=standardize_text)  
            
    if indexes is not None:
        for i in indexes:
            show_text_(data[i])   
    else:
        for one_data in data: 
            show_text_(one_data)      
            
def standardize_text1(text):
    text = text.strip().lower()
    # add space between punctuation
    text = re.sub(r'([.\\!?,\'()\[\]"|;])', r' \1 ', text)
#     # remove characters except for English letters and some punctuations
#     text = re.sub(r"[^A-Za-z\.\-\?\!\,\#\@\% ]", "", text)
    # remove extra spaces
    text = re.sub(r'[" "]+', " ", text)
    return text            

In [3]:
df_case = pd.read_pickle('./data/df_case.pkl') 
df_train_test = pd.read_pickle('./data/df_train_test.pkl') 

df_case

Unnamed: 0,Case_ID,Work_Order_Number,Created_On,Service_Account,Due_Date,Action_Booking_status,Action_Date,Currently_Worked_by,Owner,Product_Line,...,Response_Time,Work_Order_Type,Postal_Code,Active_Booking,Follow_Up_Type,Business_Segment,Follow_Up_Note,Reschedule_Note,id,notes
1,5060815932,WO-012952625,2021-02-03 10:38:00,William Torres,2021-04-26 15:00:00,Additional Parts Required,2021-07-03 11:45:00,Kerby Singleton,Nancy Quesada,Commercial Desktop PC Premium L6,...,NCD,Break Fix,30305,PC-Kerby Singleton,Additional Parts Required,Computing,,#NORMAL Part escalated,1,#NORMAL Part escalated
2,5060815932,WO-012952625,2021-02-03 10:38:00,William Torres,2021-04-26 15:00:00,Additional Parts Required,2021-07-03 11:45:00,Kerby Singleton,Nancy Quesada,Commercial Desktop PC Premium L6,...,NCD,Break Fix,30305,PC-Kerby Singleton,Additional Parts Required,Computing,,Please cancel backordered part. Customer has c...,2,Please cancel backordered part. Customer has c...
3,5061272897,WO-013009621,2021-02-09 13:39:00,LAS VEGAS METRO POLICE DEPT,2021-02-24 19:00:00,Additional Parts Required,2021-07-16 15:30:00,,Avinash Shukla,Commercial Premium Notebooks,...,NCD,Break Fix,89101,PC-Joe Walper,Additional Parts Required,Computing,,According to bryan this laptop shows repair co...,3,According to bryan this laptop shows repair co...
4,5061272897,WO-013009621,2021-02-09 13:39:00,LAS VEGAS METRO POLICE DEPT,2021-02-24 19:00:00,Additional Parts Required,2021-07-16 15:30:00,,Avinash Shukla,Commercial Premium Notebooks,...,NCD,Break Fix,89101,PC-Joe Walper,Additional Parts Required,Computing,,Back order part,4,Back order part
5,5061278521,WO-013010632-4,2021-06-16 12:17:00,MR. REGINALD JOSEPH,2021-07-15 14:42:00,Reschedule,2021-07-15 13:09:00,,JOCILYN ANDREA SANDI DURAN,Commercial Premium Notebooks,...,NCD,Break Fix,20785,USA-PR-USYWWDWN_E_A_EEG,Reschedule,Computing,,,5,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7035,5069292833,WO-014560602,2021-07-16 16:23:00,Kaiser Foundation Health Plan Inc.,2021-07-19 15:09:00,Additional Parts Required,2021-07-16 16:00:00,,6 CSPAE166,A4 PageWide SMB & Enterprise managed,...,Not available,Break Fix,91786,Anthony Garcia,Additional Parts Required,pMPS,,Please order the following part. Qty. 1 B5L04...,7035,Please order the following part. Qty. 1 B5L04...
7036,5069293232,WO-014560595,2021-07-16 16:21:00,CDW - St. Lukes Hospital (Government),2021-07-20 16:26:00,Request Reassign,,,1 CSPAE131,A4 Laserjet SMB,...,Not available,Break Fix,55805,Brock Tharp,Request Reassign,pMPS,,Please reassign to lms,7036,Please reassign to lms
7037,5069293974,WO-014560699,2021-07-16 17:06:00,LITHIA NISSAN OF FRESNO,2021-07-19 18:00:00,Request Reassign,2021-07-19 13:00:00,Aaron Stavinsky,Luis Alejandro Vargas,A4 Laserjet Enterprise,...,NCD,Preventative Maintenance,93710,IS-Aaron Stavinsky,Request Reassign,Printing,,dahlia.garcia@hp.com,7037,dahlia.garcia@hp.com
7038,5069294340,WO-014560697,2021-07-16 17:04:00,"Best Buy co., Inc.",2021-07-20 18:00:00,,,,Subir Das,Retail Solutions Core,...,NCD,Break Fix,95678,PR-David Popadiuc,Revisit,Computing,Please order 842275-001 and have it ship to HF...,,7038,Please order 842275-001 and have it ship to HF...


In [4]:
with open('./data/part_num_patterns.json') as json_file:
    part_num_patterns = json.load(json_file)
    
with open('./data/common_words_patterns.json') as json_file:
    common_words_patterns = json.load(json_file)
    
with open('./data/texts.json') as json_file:
    texts = json.load(json_file)    
    
with open('./data/train_texts.json') as json_file:
    train_texts = json.load(json_file)
    
with open('./data/test_texts.json') as json_file:
    test_texts = json.load(json_file)    
    
with open('./data/train_text_entities.json') as json_file:
    train_text_entities = json.load(json_file)
    
with open('./data/test_text_entities.json') as json_file:
    test_text_entities = json.load(json_file)    
    

## Train the model

In [5]:
# !python -m spacy train config_gpu_aug.cfg --output ./output/gpu_aug --paths.train ./data/train_aug.spacy --paths.dev ./data/test.spacy --gpu-id 0

## Evaluate

###  Check test data

In [6]:
# !python -m spacy evaluate output/gpu_aug/model-best ./data/test.spacy --gpu-id 0

2021-08-25 11:18:12.394972: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
[38;5;4mℹ Using GPU: 0[0m
[1m

TOK     100.00
NER P   97.15 
NER R   100.00
NER F   98.55 
SPEED   4207  

[1m

               P        R       F
PART_NUM   97.15   100.00   98.55



In [7]:
!python -m spacy evaluate output/gpu_aug/model-best ./data/test_aug.spacy --gpu-id 0

2021-08-25 11:18:32.519661: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
[38;5;4mℹ Using GPU: 0[0m
[1m

TOK     100.00
NER P   98.23 
NER R   99.93 
NER F   99.07 
SPEED   4491  

[1m

               P       R       F
PART_NUM   98.23   99.93   99.07



In [8]:
nlp = spacy.load("output/gpu_aug/model-best") 

In [10]:
indexes = np.random.randint(0, len(test_text_entities), 10)
show_texts(test_text_entities, indexes) 

b5l04-67906 , 1 , c2b9es44 l0r16a , 1 , c2b9es44


------------------------- acutal entities -------------------------
('b5l04-67906', 0, 11, 'PART_NUM')
------------------------- predict entities -------------------------
('b5l04-67906', 0, 11, 'PART_NUM')
please order part rm2-0906-000cn , qty 1 and please order part rm3-8461-000cn , qty 1 then please ship to customer: kim skeeba , cooper tire , 8000 quarry rd , alburtis pa 18011 . 6/23 - part has not shipped shipped scheduledl with customer


------------------------- acutal entities -------------------------
('rm2-0906-000cn', 18, 32, 'PART_NUM')
('rm3-8461-000cn', 63, 77, 'PART_NUM')
------------------------- predict entities -------------------------
('rm2-0906-000cn', 18, 32, 'PART_NUM')
('rm3-8461-000cn', 63, 77, 'PART_NUM')
#normal please reassign this work order to technician jessie brennan




ce265a , 1 , c2dhes39


no news


jc95-01943a , 1 , c2aaes26 . #normal


------------------------- acutal entities -------------------------
('jc95-01943a', 0, 11, 'PART_NUM')
------------------------- predict entities -------------------------
('jc95-01943a', 0, 11, 'PART_NUM')
07/21 - cu unreachable-tech called cu 2x no answer-monitoring


lex-40x7220 , 1 , c2eges22


------------------------- acutal entities -------------------------
('lex-40x7220', 0, 11, 'PART_NUM')
------------------------- predict entities -------------------------
('lex-40x7220', 0, 11, 'PART_NUM')
655 paper mill rd newark de 19711 mail code de5-013-00-05 please order part #l15522-601 please send it to russell lester 655 paper mill rd newark de 19711 mail code de5-013-00-05


talked to client . the laptop won ' t be back for another week . owner took it on vacation . sent email last night and called today . no answer . left vmail . received e-mail saying his client is away for 2 weeks . proponing for 2 weeks . 


In [11]:
show_texts(test_text_entities, indexes, show_detail=False) 





















### Check Error Data

In [12]:
def get_error_data(data):
    error_data = []
    for i in range(len(data)):
        text, annotations = data[i]
        doc = nlp(text)
        entities = annotations['entities']
        if len(entities) != len(doc.ents):
            error_data.append(data[i])
            print(i, len(error_data))
        else:
            exit_flag = False
            actual_entities = {(start, end, label): label for start, end, label in entities}
            predict_entities = {(ent.start_char, ent.end_char, ent.label_): ent.label_ 
                                for ent in doc.ents}
            
            for key, _ in actual_entities.items():                
                if key not in predict_entities:
                    print(actual_entities, predict_entities, sep='\n') 
                    error_data.append(data[i])
                    print(i, len(error_data))
                    break
            if exit_flag: continue
            for key, _ in predict_entities.items():
                if key not in actual_entities:
                    print(actual_entities, predict_entities)
                    error_data.append(data[i])
                    print(i, len(error_data))
                    break
                
    return error_data

error_data = get_error_data(test_text_entities)

56 1
64 2
89 3
95 4
151 5
176 6
181 7
194 8
204 9
243 10
488 11
504 12
519 13
583 14
687 15
689 16
921 17
980 18
1066 19
1206 20
1336 21
1380 22
1422 23
1431 24
1500 25


In [13]:
print(len(error_data))

25


In [14]:
show_texts(error_data)  

please order rm2-1278-000cn ( qty 1 ) and send to hfpu in lockport , il . 3gy10a-cnncmc6337-hp laserjet managed e60165dn printer case: 5067812295 wo-014252961 po119240397 0645153938 neovia 3930 cedar creek dr joliet , il . 60436 larry heagle 1815-768-3282 alt: 779-702-0498 rm2-1278-000cn assembly-door link e601xx 51771292


------------------------- acutal entities -------------------------
('rm2-1278-000cn', 13, 27, 'PART_NUM')
('rm2-1278-000cn', 274, 288, 'PART_NUM')
------------------------- predict entities -------------------------
('rm2-1278-000cn', 13, 27, 'PART_NUM')
('1815-768-3282', 242, 255, 'PART_NUM')
('779-702-0498', 261, 273, 'PART_NUM')
('rm2-1278-000cn', 274, 288, 'PART_NUM')
please send maintenance kit fixing assembly ( 110v ) rm2-5476-000 1 . #normal part not available checking part #


------------------------- predict entities -------------------------
('rm2-5476-000', 53, 65, 'PART_NUM')
please order part# j8j93–67901 nbd to address on work order


------------------------- predict entities -------------------------
('j8j93–67901 nbd', 19, 34, 'PART_NUM')
order 1ea b5l47-67018 b5l47-67019 . part on b-o


------------------------- acutal entities -------------------------
('b5l47-67018', 10, 21, 'PART_NUM')
('b5l47-67019', 22, 33, 'PART_NUM')
------------------------- predict entities -------------------------
('b5l47-67018', 10, 21, 'PART_NUM')
('b5l47-67019', 22, 33, 'PART_NUM')
('b-o', 44, 47, 'PART_NUM')
#normal printer have error code 50 . 3f . 11 high temperature error 1 contacted tech assist for a new action plan . to replace rl1-4005-000 fuser power supply assembly


------------------------- predict entities -------------------------
('rl1-4005-000', 127, 139, 'PART_NUM')
#normal customer requests service at this time order :917725-855 and l43407-001 and 213349-001 for next day ship to customers site attn kim gordin thank you


------------------------- acutal entities -------------------------
('l43407-001', 69, 79, 'PART_NUM')
('213349-001', 84, 94, 'PART_NUM')
------------------------- predict entities -------------------------
('917725-855', 54, 64, 'PART_NUM')
('l43407-001', 69, 79, 'PART_NUM')
('213349-001', 84, 94, 'PART_NUM')
need to order nib kit and ecu ( jz09-67077 & g1w39-67001 ) ship to san antonio hal


------------------------- acutal entities -------------------------
('g1w39-67001', 45, 56, 'PART_NUM')
------------------------- predict entities -------------------------
('jz09-67077', 32, 42, 'PART_NUM')
('g1w39-67001', 45, 56, 'PART_NUM')
i ' m ordering the following parts for this customer rm26418-000cn qty=1 ship to 5200 west greens road houston texas 77066 . #normal


------------------------- predict entities -------------------------
('rm26418-000cn', 53, 66, 'PART_NUM')
need revisit for 7/2 and please order 0609-001558 qty 1 nbd to 83005433


------------------------- predict entities -------------------------
('0609-001558', 38, 49, 'PART_NUM')
please send the following part to this address james jones c/o imd lamar mosley army community hospital 1585 3rd st . building 285 fort polk , louisiana 71459-5110 rm2-1231-000cn . spoke with the customer as i was about to leave and he said that they have put the printer on a new hci and it is working properly said the hci that came with a printer was physically damaged . customer wants an hci sent to the location i am providing the


------------------------- acutal entities -------------------------
('rm2-1231-000cn', 164, 178, 'PART_NUM')
------------------------- predict entities -------------------------
('71459-5110', 153, 163, 'PART_NUM')
('rm2-1231-000cn', 164, 178, 'PART_NUM')
please revisit for tomorrow from 8:00 am to 10:00 am and order pn ' s 831757-001and 801360-001 nbd to hal location in knoxville tennessee


------------------------- acutal entities -------------------------
('801360-001', 84, 94, 'PART_NUM')
------------------------- predict entities -------------------------
('831757-001and', 70, 83, 'PART_NUM')
('801360-001', 84, 94, 'PART_NUM')
cannot wait for rob to call me back for the other part . will add it after . please order parts: duplex solenoid lex-40x8301 controller board lex-40x9253 ship to hfpu  ( 83004794 ) 16241 s farrell road , lockport , il . 60441 set for friday so i can get the . lex88932563-70157ghh1dy96-lexmark mx511dhe mono mfp printer case: 5068538396 wo-014405153 fw: nh63 . sb . n638 10 . 73 . 39 . 111 w01991 events printer huison sanchez paper jam , open rear door 230 . 03 affecting only multiple users 7/6 costco wholesale


------------------------- acutal entities -------------------------
('lex-40x8301', 113, 124, 'PART_NUM')
('lex-40x9253', 142, 153, 'PART_NUM')
------------------------- predict entities -------------------------
('lex-40x8301', 113, 124, 'PART_NUM')
('lex-40x9253', 142, 153, 'PART_NUM')
('lex88932563-70157ghh1dy96-lexmark', 260, 293, 'PART_NUM')
7/8- please create follow up and assign to myself for revisit reimage failed re image computer follow up with lissett 619-507-8334 to make sure it was succesful job id:11625681039975


------------------------- predict entities -------------------------
('619-507-8334', 118, 130, 'PART_NUM')
7/7/21--1020--please order parts below: g1w39-69001 / mech-kit-v1-pwe-586mfp-ww g1w39-67952 / kit-assy/feeder tray 3 please ship to client address below: norton community hospital attn:  eryk nayagam -- is 100 15th st nw norton , va 24273 just go pro


------------------------- acutal entities -------------------------
('g1w39-69001', 40, 51, 'PART_NUM')
('g1w39-67952', 80, 91, 'PART_NUM')
------------------------- predict entities -------------------------
('g1w39-69001', 40, 51, 'PART_NUM')
('mech-kit-v1', 54, 65, 'PART_NUM')
('g1w39-67952', 80, 91, 'PART_NUM')
please order lcd cable l32713-001 and backlight cable l3291-001 . #normal 7/15 please order lcd cable l32713-001 and backlight cable l3291-001 replaced display and system board but back light is still dim , verified parts and issue with support , signed for jn kp 7/14 parts have not shipped per fedex


------------------------- acutal entities -------------------------
('l32713-001', 23, 33, 'PART_NUM')
('l32713-001', 102, 112, 'PART_NUM')
------------------------- predict entities -------------------------
('l32713-001', 23, 33, 'PART_NUM')
('l3291-001', 54, 63, 'PART_NUM')
('l32713-001', 102, 112, 'PART_NUM')
('l3291-001', 133, 142, 'PART_NUM')
please order two ( 2 ) of this part number: jc31-00163b ship both to hfpu  ( 83004794 ) 16241 s farrell road , lockport , il . 60441 . x3a76a-cnc1n4r00z-hp color laserjet managed flow mfp e87640-e87660z - base product 40-60 ppm a3 case: 5068847832 wo-014468318 po119462189 0645244017 neovia 3930 cedar creek dr joliet , il . 60436 larry heagle  1815-768-3282 alt: 779-702-0498 jc9


------------------------- acutal entities -------------------------
('jc31-00163b', 44, 55, 'PART_NUM')
------------------------- predict entities -------------------------
('jc31-00163b', 44, 55, 'PART_NUM')
('1815-768-3282', 345, 358, 'PART_NUM')
('779-702-0498', 364, 376, 'PART_NUM')
shloime called to request reschedule . please order parts rm2-6456-010cn/top cover - rm2-7914-000cn/lv psu to fedex hal 51 20th st brooklyn accordingly . thanks 521743302818 to 51 20 lifter drive part delivery ready mid day logistics 15min error msg and


------------------------- predict entities -------------------------
('rm2-6456-010cn', 58, 72, 'PART_NUM')
('rm2-7914-000cn', 85, 99, 'PART_NUM')
please order part number f2g76-67901as issue has returned . 


------------------------- predict entities -------------------------
('f2g76-67901as', 25, 38, 'PART_NUM')
please order part: rm2-9335-000cn            high-voltage power supply ship to hfpu  ( 83004794 ) 16241 s farrell road , lockport , il . 60441 set for friday . . m0p33a-cnmclb10h3-hp laserjet managed e60055dn prntr case: 5069082646 wo-014513867 po119522639 0645270000 international paper 4160 campus dr aurora , il . 60504-4172 krystal brown 1816-351-283 krystal . brown@ipaper . com rm2-0866-020cn assy-cassette


------------------------- acutal entities -------------------------
('rm2-0866-020cn', 384, 398, 'PART_NUM')
------------------------- predict entities -------------------------
('60504-4172', 317, 327, 'PART_NUM')
('1816-351-283', 342, 354, 'PART_NUM')
('rm2-0866-020cn', 384, 398, 'PART_NUM')
#normal please order part # rm2-5452-000 . please send this call to doug houseknecht


------------------------- predict entities -------------------------
('rm2-5452-000', 28, 40, 'PART_NUM')
please order the following part to the customer site attn: jim cox l41606-011 hard drive . please order these additional parts to to the portage hold for pick up . l4606-011 hard drive 5851-7913 formatter . please order this one again as the 1st one is lost at fedex . 


------------------------- acutal entities -------------------------
('l41606-011', 67, 77, 'PART_NUM')
('5851-7913', 185, 194, 'PART_NUM')
------------------------- predict entities -------------------------
('l41606-011', 67, 77, 'PART_NUM')
('l4606-011', 164, 173, 'PART_NUM')
('5851-7913', 185, 194, 'PART_NUM')
please send parts to 800 main st . paterson nj 07503 attn: nick lavcanski ( 973 ) 202-1231 jc98-05425a qty:2


------------------------- acutal entities -------------------------
('jc98-05425a', 91, 102, 'PART_NUM')
------------------------- predict entities -------------------------
('202-1231', 82, 90, 'PART_NUM')
('jc98-05425a', 91, 102, 'PART_NUM')
please fedex a t030-67901 , t0b28-67901 and a t0b27-67901 to: 16633 schoenborn st north hills , ca 91343


------------------------- acutal entities -------------------------
('t0b28-67901', 28, 39, 'PART_NUM')
('t0b27-67901', 46, 57, 'PART_NUM')
------------------------- predict entities -------------------------
('t030-67901', 15, 25, 'PART_NUM')
('t0b28-67901', 28, 39, 'PART_NUM')
('t0b27-67901', 46, 57, 'PART_NUM')
please add these parts and order replacements to my trunk stock: either one each: pn rm2-1275-000 pn rm2-6772_000 or one pnj7z98-67902 . wells fargo 07/20/2021 9:32 pm cst - parts confirmed as shipped . closed the ticket . 07/16/2021 8:51 pm cst - call is completed - on break until my replacement trunk stock ships and i have the tracking numbers . 07/16/2021 4:48 pm cst *fixed* tray 2 jamming


------------------------- predict entities -------------------------
('rm2-1275-000', 85, 97, 'PART_NUM')
lex-40x-0100 , 2 , c2etes26


------------------------- predict entities -------------------------
('lex-40x-0100', 0, 12, 'PART_NUM')


### Some Features and Issues

#### Miss some part-numbers 

In [122]:
sample_parts = ['M14302-161', '926537-001', 'M46915-D01', '140314-FB2', '176253-8B0', 
               '442010-ABC', 'L07896-DB1', '168757-B21', 'FE-15520-01', 'RM1-7867-000CN', 
               '5066-4719', '5066-3872-1', '3HZ90-30001', 'T6M22-30001', '98544-04105', 
               'C8109-69018', 'L26480-DB1', '632427-001B', 'CN463-69003', 'RM1-1298BULK', 
               '8-752-078-46', 'X-4035-119-1', 'TBLB3002-A04', '4822-701-15319', '08-2212C-33W', 
               '19-40065-011', 'ATT09850-66532', '98564-66572-SQ', '897-250144AA', '738-576-94', 
               '1FQ6-0001', '1818-7469-LGSA', '1820-6361M', '1818-6868-SAM', '3138-107-96160', 
               '686858R-999', 'MD30C-A2', '29-26196-00', 'DEL-7D092', 'LEX-40X2384', 'LEX-40X2831H', 
               'SAM-6107001172', 'SHA-1625DS51', 'XER-006R01275', 'BRO-TN115BK', 'DEL-310-5811', 
               'DEL-PK496', 'KYO-1702F97US0', 'LA95-CA', 'KYO-DK-310H', 'EPS-C31CD38A9921', 
               'KYO-F994091H', 'EPS-CEPS-003G', 'LEX-24015SA', 'LEX-62D1X00', 'LEX-72K0DV0', 
               'LEX-801HC', 'LEX-C5220CSH', 'TRO-78-24619-001', 'LEX-C544X1CG', 'HPE-749797-001', 
               'SFT-P-895', 'LEX-E260A21A-C', 'RIC-D0296509', 'SEA-ST1000LM049', 'LEX-12A8400-PC', 
               'HPE-JL258A', 'RIC-400507', 'FUJ-FPCPR362AQ', 'TOS-6LE8296100', 'XER-003K04980', 
               'XER-116111500', 'ATS-2711FXSC-901', 'BRO-TN336BK', 'DEL-310-5807', 'HPI-CF258A-M', 
               'LEX-T101-0000000', 'DEL-9PN5P', 'SAM-BA92-08880A', 'APC-RBC7', 'KMH-4448-121', 
               'SXC-C7ABTTAAB', 'TRO-04621201', 'VEF-28924-04-R', 'WFP-HPM527Z', 'KYO-1702LK0UN2', 
               'APC-SUA1500RM2U', 'KYO-302NM18021', 'HPE-JH329-61001', 'KYO-1T02GA0US0', 'LEB-04X4674',
               'KYO-302GR93034', 'DEL-V1RX3', 'LEX-40X0593', 'TOS-6LH3460800', 'RIC-AE020171', 
               'XER-59K60140', 'BRO-LM5140001', 'KMH-A00JA56600', 'KYO-302F906240', 'BRO-LEM084001', 
               'XER-116-2035', 'BRO-BU100CLH', '3X-LK465-A2', '4G1-3999-030CN', 'H3980-60002BULK', 
               'SHA-VHI0MFP000', 'TOS-C017839000', 'JC66-01190A', 'XER-3335DNI', 'XER-3635MFP-S', 
               'XER-4622DN', 'XER-7750-Z2', 'DEL-D1320-W3', 'DEL-P637D', 'LEX-C540X35G', 
               'SHA-CPLTM8190F', 'SHACFRM-1380DS53', 'XER-401-0855-0', '90-0077', 'BRO-LF6710001', 
               'Q3938-68001-PCA', 'C314X-49A-90002', '20ER-84530KC', 'RM2-2903-BULK', 'CE506-67919-BU', 
               'CH971-91596', 'DEL-FM235', 'JC44-00210E', 'SHACFRM-1509DS52', 'SHADHAI-5043FCPZ', 
               'SHAGCAB-1102FCB1', '000-50-02-008', '504000226-DD', 'SHAVHPLG217L5A-1', 'SHAVHPOSH03Y02-1', 
               'SHADUNT-8962FCP1', 'SHAKI-OK0010FCPZ', 'SHAQSW-P0506FCZZ', 'SHAVHPGP1A73AR-1', 'SHARH-IX0055FCPZ', 
               '358-000001', 'SHAVHPGP1A71L3-18', 'B23-0994', '504000226-D', '5066-0731', 
               'E0-CABLE-01', 'RM1-8508-010-BU', 'B3M77-67902-BU', 'CA02626-E029FJ', 'CF065-67901-BU', 
               'C2H57-67901-BULK', 'LN08-A2', '693709-001B', 'SS467-67001', 'LEX-70C0D20', 
               'SHAQSW-M0518FCPZ', 'SHALX-BZ0994FCPZ', 'LN08X-TF', 'SHAGCAB-1507FCAZ']

sample_parts = [part.lower() for part in sample_parts]

In [123]:
def get_texts(parts, standardize_text=standardize_text1):
    texts = [standardize_text(f'Please order qty 1 {part} NBD to my HAL in Lenexa. Thank you.') for part in parts]
    return texts

def check_sample_parts(parts):  
    texts = get_texts(parts)
    docs = nlp.pipe(texts)
    texts = []
    error_texts = []
    errpr_parts = []
    for doc, part in zip(docs, parts):
        texts.append(doc.text)
        if len(doc.ents) !=1 or doc.ents[0].text != part.lower():
            error_texts.append(doc.text)
            errpr_parts.append(part)
    return texts, error_texts, errpr_parts
        
parts = ['000-50-02-008']
texts, error_texts, errpr_parts = check_sample_parts(parts)
show_texts(texts, show_detail=True) 

please order qty 1 000-50-02-008 nbd to my hal in lenexa . thank you . 


------------------------- predict entities -------------------------
('000-50-02-008', 19, 32, 'PART_NUM')


In [124]:
parts = sample_parts
texts, error_texts, errpr_parts = check_sample_parts(parts)
print(f'find {len(error_texts)}/{len(parts)} errors')
show_texts(error_texts, show_detail=True)  

find 1/160 errors
please order qty 1 lex-c544x1cg nbd to my hal in lenexa . thank you . 


------------------------- predict entities -------------------------
('lex-c544x1cg nbd', 19, 35, 'PART_NUM')


In [45]:
error_text_entities, error_pattern_matchs = generate_entity(error_texts, patterns=part_num_patterns)
show_entity(error_text_entities, error_pattern_matchs)

--------------------------------------------------
please order qty 1 lex-c544x1cg nbd to my hal in lenexa . thank you . 
('lex-c544x1cg', 19, 31, 'PART_NUM')
--------------------------------------------------
please order qty 1 c314x+49a-90002 nbd to my hal in lenexa . thank you . 



I think that the issues are mainly caused by lacking of these parts in training data. 

There are 2 solutions:

1. merge the results of regular expressions and the model.
2. use data augmentation to generate a lot of random part numbers based on the rules.

####  Find potentail part number

In [48]:
parts = ['FE-15520-02', 'FE-15520-012']
texts, error_texts, errpr_parts = check_sample_parts(parts)
print(f'find {len(error_texts)}/{len(parts)} errors')
show_texts(texts)

find 0/2 errors
please order qty 1 fe-15520-02 nbd to my hal in lenexa . thank you . 


------------------------- predict entities -------------------------
('fe-15520-02', 19, 30, 'PART_NUM')
please order qty 1 fe-15520-012 nbd to my hal in lenexa . thank you . 


------------------------- predict entities -------------------------
('fe-15520-012', 19, 31, 'PART_NUM')


In [42]:
some_text_entities, some_pattern_matchs = generate_entity(texts, patterns=part_num_patterns)
show_entity(some_text_entities, some_pattern_matchs)

--------------------------------------------------
please order qty 1 fe-15520-02 nbd to my hal in lenexa . thank you . 
('fe-15520-02', 19, 30, 'PART_NUM')
--------------------------------------------------
please order qty 1 fe-15520-012 nbd to my hal in lenexa . thank you . 



## Output Results

In [77]:
texts = [standardize_text1(note) for note in df_case.notes]
# nlp.pipe(texts) is better, but can't see the progress
docs = [nlp(text) for text in tqdm(texts)]

100%|██████████| 7039/7039 [07:31<00:00, 15.59it/s]


In [84]:
def get_part_nums(docs):
    part_nums = []
    for doc in tqdm(docs):
        doc_part_nums = [ent.text.upper() for ent in doc.ents if  ' ' not in ent.text]
        part_nums.append(', '.join(doc_part_nums))            
    return part_nums 

part_nums = get_part_nums(docs)
df_case['Part_Number'] = part_nums

100%|██████████| 7039/7039 [00:00<00:00, 146778.22it/s]


In [86]:
print(len(part_nums))
print(part_nums[14], sep='\n') 
df_case.loc[15]['Part_Number']

7039
CH538-67076, T8W16-67006, CH538-67018, CK839-67005, CN727-67023, W6B55-67007


'CH538-67076, T8W16-67006, CH538-67018, CK839-67005, CN727-67023, W6B55-67007'

In [93]:
def fill_common_words(texts, df, patterns=common_words_patterns):
    expressions = []
    for pattern in tqdm(patterns):
        expression = pattern['expression']  
        expressions.append(expression)
        match_counts = []
        
        for text in texts:
            match_count = 0
            for match in re.finditer(expression, text):
                start, end = match.span()
                add_key = True
                if start-1 >= 0 and text[start-1]!=' ':
                    add_key = False
                if end < len(text) and text[end]!=' ':
                    add_key = False                 
                if add_key: 
                    match_count = match_count + 1 
            match_counts.append(' ' if match_count==0 else str(match_count))            
        df[expression] = match_counts    
    return df[['notes'] + expressions] 

df_common_words = fill_common_words(texts, df_case)
df_common_words

100%|██████████| 19/19 [00:00<00:00, 89.61it/s]


Unnamed: 0,notes,order,recommended,hfpu,ship,fedex,hal,site,send,location,pickup,sent,requested,reorder,street,hfp,shipped,delivery,recommend,warehouse
1,#NORMAL Part escalated,,,,,,,,,,,,,,,,,,,
2,Please cancel backordered part. Customer has c...,,,,,,,,,,,,,,,,,,,
3,According to bryan this laptop shows repair co...,,,,,,,,,,,,,,,,,,,
4,Back order part,1,,,,,,,,,,,,,,,,,,
5,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7035,Please order the following part. Qty. 1 B5L04...,1,,,,,,,,,,,,,,,,,,
7036,Please reassign to lms,,,,,,,,,,,,,,,,,,,
7037,dahlia.garcia@hp.com,,,,,,,,,,,,,,,,,,,
7038,Please order 842275-001 and have it ship to HF...,1,,1,1,1,,,,,,,,,,,,,,


In [96]:
df_case.to_excel('./data/df_case.xlsx', sheet_name='notes')

In [97]:
some_texts = ['please order part# j8j93–67901 nbd PART_NUM to address on work order']
some_text_entities, some_pattern_matchs = generate_entity(some_texts, patterns=part_num_patterns)
some_text_entities

[('please order part# j8j93–67901 nbd PART_NUM to address on work order',
  {'entities': []})]

In [101]:
print(*[pattern['expression'] for pattern in part_num_patterns], sep='\n')

[a-zA-Z]\d\d\d\d\d-\d\d\d
\d\d\d\d\d\d-\d\d\d
[a-zA-Z]\d\d\d\d\d-[a-zA-Z]\d\d
\d\d\d\d\d\d-[a-zA-Z][a-zA-Z]\d
\d\d\d\d\d\d-\d[a-zA-Z]\d
\d\d\d\d\d\d-[a-zA-Z][a-zA-Z][a-zA-Z]
[a-zA-Z]\d\d\d\d\d-[a-zA-Z][a-zA-Z]\d
\d\d\d\d\d\d-[a-zA-Z]\d\d
[a-zA-Z][a-zA-Z]-\d\d\d\d\d-\d\d
[a-zA-Z][a-zA-Z]\d-\d\d\d\d-\d\d\d[a-zA-Z][a-zA-Z]
\d\d\d\d-\d\d\d\d
\d\d\d\d-\d\d\d\d-\d
\d[a-zA-Z][a-zA-Z]\d\d-\d\d\d\d\d
[a-zA-Z]\d[a-zA-Z]\d\d-\d\d\d\d\d
\d\d\d\d\d-\d\d\d\d\d
[a-zA-Z]\d\d\d\d-\d\d\d\d\d
\d\d\d\d\d\d-\d\d\d[a-zA-Z]
[a-zA-Z][a-zA-Z]\d\d\d-\d\d\d\d\d
[a-zA-Z][a-zA-Z]\d-\d\d\d\d[a-zA-Z][a-zA-Z][a-zA-Z][a-zA-Z]
\d-\d\d\d-\d\d\d-\d\d
[a-zA-Z]-\d\d\d\d-\d\d\d-\d
[a-zA-Z][a-zA-Z][a-zA-Z][a-zA-Z]\d\d\d\d-[a-zA-Z]\d\d
\d\d\d\d-\d\d\d-\d\d\d\d\d
\d\d-\d\d\d\d[a-zA-Z]-\d\d[a-zA-Z]
\d\d-\d\d\d\d\d-\d\d\d
[a-zA-Z][a-zA-Z][a-zA-Z]\d\d\d\d\d-\d\d\d\d\d
\d\d\d\d\d-\d\d\d\d\d-[a-zA-Z][a-zA-Z]
\d\d\d-\d\d\d\d\d\d[a-zA-Z][a-zA-Z]
\d\d\d-\d\d\d-\d\d
\d[a-zA-Z][a-zA-Z]\d-\d\d\d\d
\d\d\d\d-\d\d\d\d-[a-zA-Z][a-zA-Z][a-zA-

In [111]:
re.findall(r'[a-zA-Z]\d[a-zA-Z]\d\d-\d\d\d\d\d', some_texts[0])

[]

In [120]:
re.findall(r'[a-zA-Z]\d[a-zA-Z]\d\d–\d\d\d\d\d', some_texts[0]) 

['j8j93–67901']

In [106]:
some_texts[0]

'please order part# j8j93–67901 nbd PART_NUM to address on work order'

In [121]:
'-' == '–'

False