    Restore the extracted event text from the raw sentence, e.g. event text: 'the chocolate be the best', restore text: 'the chocolate are the best', raw sentence text: 'however the chocolate are the best'.

    Our goal is finding ASER event's corresponding sub-sentence. Restore strategy is matching head and tail words, and also checking if there exist a proportionate number of overlapped words.

    Through continuous iterations with error analysis and adding rules, the final matching success rate is over 95%.


# Config

In [79]:
import joblib
import tqdm
import random
from textblob import TextBlob
from mlconjug3 import Conjugator
import re

# get verb tense

def get_verb_forms(conjugator, verb_word: str):
    """
    compute all forms of a verb.

    :param
    :return: tuple, (present, third-person present, past, present continuous, present perfect)
    """
    present, third_person_present, past, present_continuous, present_perfect = '', '', '', '', ''

    try:
        for form in conjugator.conjugate(verb_word):
            if len(form)==4:
                _, tense, sub, verb = form
                if sub in ('we','they','you'):
                    continue
                if tense == 'indicative present' and sub == 'I':
                    present = verb
                if tense == 'indicative present' and sub == 'he/she/it':
                    third_person_present = verb
                if tense == 'indicative past tense' and sub == 'I':
                    past = verb
                if tense == 'indicative present continuous' and sub == 'I':
                    present_continuous = verb
                if tense == 'indicative present perfect' and sub == 'I':
                    present_perfect = verb
            else: # 3
                continue
        stat = 0
    except:
        #print(traceback.format_exc(()))
        stat = -1

    return (present, third_person_present, past, present_continuous, present_perfect), stat

source_dir='./'

conjugator = Conjugator(language='en')

In [2]:
VERSION = 'v2'

aser = joblib.load(source_dir+'Data/amazon_food_review_aser_event_0_100000.%s'%VERSION)

In [5]:
aser[129784]

{'sentences': ['I sent these as part of my donation to our service members and I have gotten back some very positive thanks for this thoughtful gift',
  "It's no secret that Coffee is a favorite drink in military deployed environments and just about everybody sends different types of coffee to our men and women in uniform",
  'I wanted to think outside the box and send something a bit different so I sent the best coffee accompanying treat and this was the big hit',
  'I am not reviewing this item from how much I like it, I am reviewing this item on how many service members this treat helped',
  'A little goes a long way and I say this went a long way',
  "Plus you can't beat the price with free shipping to our service members"],
 'aser': [[i have get back some positive thanks for gift],
  [it be no secret,
   coffee be a favorite drink in environment just,
   coffee send different type of coffee man woman],
  [i send],
  [i be not review this item,
   i like it,
   i be review this ite

In [6]:
# event_sent_map={} # {event:sentence} 
# for text_id, info in aser.items():
#     for s,event_list in zip(info['sentences'],info['aser']):
#         for e in event_list:
#             event_sent_map[e]=s

# event might appear twice in aser result. Use list not dict

event_sent_list=[] # (event,sentence)
for text_id, info in aser.items():
    for s,event_list in zip(info['sentences'],info['aser']):
        for e in event_list:
            event_sent_list.append((text_id,e,s))

In [7]:
len(event_sent_list)

727023

# UnitTest

In [11]:
# # unit test
# event_text=aser[534543]['aser'][0][1].__repr__()
# sent=aser[534543]['sentences'][0]

# hw=event_text.split(' ')[0]
# tw=event_text.split(' ')[-1]

# start_ind=0
# end_ind=-1
# for i,w in enumerate(sent.split(' ')):
#     if w==hw:
#         start_ind=i
#     if w==tw:
#         end_ind=i

# event_raw=' '.join(sent.split(' ')[start_ind:end_ind+1])

# print(event_raw)

In [28]:
event_text='now i be glad'
sent="I thought I was buying less than I was, but now I'm glad I got more than I thought because now I'm going to make all my own flatbreads"

for i in range(1):
    hw=event_text.split(' ')[0]
    hw_pos=event.pos_tags[0]
    tw=event_text.split(' ')[-1]
    tw_pos=event.pos_tags[-1]

    start_ind=0
    end_ind=-1
    for i,sw in enumerate(sent.split(' ')):
        # (1) head word and tail word matching
        if sw==hw:
            start_ind=i
        if sw==tw:
            end_ind=i
        # (2) upper/lower case matching       
        if sw.lower()==hw:
            start_ind=i
        if sw.lower()==tw:
            end_ind=i
        # (3) plural matching
        if 'NN' in hw_pos:
            blob = TextBlob(hw)
            tmp=[word.pluralize() for word in blob.words]
            hw_plural = tmp[0] if len(tmp)>0 else ''
            if sw==hw_plural:
                start_ind=i
        if 'NN' in tw_pos:
            blob = TextBlob(tw)
            tmp=[word.pluralize() for word in blob.words]
            tw_plural = tmp[0] if len(tmp)>0 else ''
            if sw==tw_plural:
                end_ind=i
                
        # (4) verb-tense matching
        if 'VB' in hw_pos and hw in verb_tense:
            tmp=verb_tense[hw]
            for t in tmp:
                if sw==t:
                    start_ind=i
                    break
        if 'VB' in tw_pos and tw in verb_tense:
            tmp=verb_tense[tw]
            for t in tmp:
                if sw==t:
                    end_ind=i
                    break

    event_raw=' '.join(sent.split(' ')[start_ind:end_ind+1])


print(event_raw)




# Loop

## Config

In [8]:
# get all verbs from events
verbs=set()

for text_id,event, sent in tqdm.tqdm(event_sent_list):

    for w, pt in zip(event.__repr__().split(' '),event.pos_tags):

        if 'VB' in pt:

            verbs.add(w)

verbs=list(verbs)
print(len(verbs))

100%|██████████| 727023/727023 [00:01<00:00, 415393.78it/s]

5155





In [9]:
verb_tense={}
fail=[]
for v in tqdm.tqdm(verbs):
    tmp,status=get_verb_forms(conjugator,v)
    if status==-1:
        fail.append(v)
    else:
        verb_tense[v]=tmp
print(len(fail))

100%|██████████| 5155/5155 [00:24<00:00, 208.94it/s]

2589





    From fail cases, we find most of them have typo-errors. No need to bother.

In [50]:
personal_pronoun={
    'i': ['i','I','me'],
    'you': ['you','your'],
    'he': ['he','him','his'],
    'she': ['she','her'],
    'it': ['it','its'],
    'we': ['we','us','our'],
    'they': ['they','them','their'],
}

In [61]:
def clean(sentence:str):
    res=sentence.replace("'ve",' have').replace("'ll",' will').replace("'re",' are').replace("'m",' am').replace("'d",' would').replace("n't",' not')

    res=res.replace("he's",'he has').replace("she's",'she has').replace("it's",'it is')

    res=res.replace("He's",'He has').replace("She's",'She has').replace("It's",'It is')

    res=res.replace("<br />",' ').replace("<br/>",' ')

    res=res.replace("(",' ').replace(")",' ') # this apple (brand is blablabla)

    res=res.replace(";",'.').replace(":",'.') # E.g.: I was excited to find these two teas because they are the perfect compromise: just enough caffeine to keep me going, but not enough to make my heart pound
    
    res=res.replace('"',' ') # E.g.: I do not care for the "Creamy Peanut Butter"

    return res

## Round1

In [15]:
# 72.3%

event_processed=[] # (event, raw_event_text, sent)
event_unprocessed=[] # (event,'', sent)

for text_id, event, sent in tqdm.tqdm(event_sent_list[:30]):
    event_text=event.__repr__()
    hw=event_text.split(' ')[0]
    hw_pos=event.pos_tags[0]
    tw=event_text.split(' ')[-1]
    tw_pos=event.pos_tags[-1]

    start_ind=0
    end_ind=-1
    sub_sents=sent.split(',')
    for ss in sub_sents:
        ss_cut=ss.split(' ')
        for i,sw in enumerate(ss_cut):
            # (1) head word and tail word matching
            if sw==hw:
                start_ind=i
            if sw==tw:
                end_ind=i
            # (2) upper/lower case matching       
            if sw.lower()==hw:
                start_ind=i
            if sw.lower()==tw:
                end_ind=i
            # (3) plural matching
            if 'NN' in hw_pos:
                blob = TextBlob(hw)
                tmp=[word.pluralize() for word in blob.words]
                hw_plural = tmp[0] if len(tmp)>0 else ''
                if sw==hw_plural:
                    start_ind=i
            if 'NN' in tw_pos:
                blob = TextBlob(tw)
                tmp=[word.pluralize() for word in blob.words]
                tw_plural = tmp[0] if len(tmp)>0 else ''
                if sw==tw_plural:
                    end_ind=i
            # (4) verb-tense matching
            if 'VB' in hw_pos and hw in verb_tense:
                tmp=verb_tense[hw]
                for t in tmp:
                    if sw==t:
                        start_ind=i
                        break
            if 'VB' in tw_pos and tw in verb_tense:
                tmp=verb_tense[tw]
                for t in tmp:
                    if sw==t:
                        end_ind=i
                        break
            


        event_raw_text=' '.join(ss_cut[start_ind:end_ind+1])

        if len(event_raw_text)>0:
            event_processed.append((text_id, event,event_raw_text,sent))
            break # no need to search next sub-sentence
    if len(event_raw_text)==0:
        event_unprocessed.append((text_id, event,'',sent))


# tracking
print("Total events: %d, processed events: %d, ratio: %.2f%%"%(len(event_sent_map), len(event_processed), 100*len(event_processed)/len(event_sent_map)))

print("Unprocessed Samples: ")

# __tmp=random.sample([(t,e,s) for t,e,_,s in event_unprocessed],10)
# for t,e,s in __tmp:
#     print("textID %s: %s == %s"%(t,e.__repr__(),s))
#     print('\n')

100%|██████████| 30/30 [00:00<00:00, 857.84it/s]

Total events: 727023, processed events: 24, ratio: 0.00%
Unprocessed Samples: 





## Round2

In [None]:
# upgrade the final matching mechanism

# update word-matching rule: personal pronoun;

event_processed=[] # (event, raw_event_text, sent)
event_unprocessed=[] # (event,'', sent)

for text_id, event, sent in tqdm.tqdm(event_sent_list[:1000]):
    event_text=event.__repr__()
    hw=event_text.split(' ')[0]
    hw_pos=event.pos_tags[0]
    tw=event_text.split(' ')[-1]
    tw_pos=event.pos_tags[-1]

    sub_sents=sent.split(',')

    match_res=[]

    for ss in sub_sents:
        ss_cut=ss.split(' ')

        start_inds=set()
        end_inds=set()

        for i,sw in enumerate(ss_cut):
            # (1) head word and tail word matching
            if sw==hw:
                start_ind=i
                start_inds.add(start_ind)
            if sw==tw:
                end_ind=i
                end_inds.add(end_ind)
            # (2) upper/lower case matching       
            if sw.lower()==hw:
                start_ind=i
                start_inds.add(start_ind)

            if sw.lower()==tw:
                end_ind=i
                end_inds.add(end_ind)

            # (3) plural matching
            if 'NN' in hw_pos:
                blob = TextBlob(hw)
                tmp=[word.pluralize() for word in blob.words]
                hw_plural = tmp[0] if len(tmp)>0 else ''
                if sw==hw_plural:
                    start_ind=i
                    start_inds.add(start_ind)

            if 'NN' in tw_pos:
                blob = TextBlob(tw)
                tmp=[word.pluralize() for word in blob.words]
                tw_plural = tmp[0] if len(tmp)>0 else ''
                if sw==tw_plural:
                    end_ind=i
                    end_inds.add(end_ind)

            # (4) verb-tense matching
            if 'VB' in hw_pos and hw in verb_tense:
                tmp=verb_tense[hw]
                for t in tmp:
                    if sw==t:
                        start_ind=i
                        start_inds.add(start_ind)

            if 'VB' in tw_pos and tw in verb_tense:
                tmp=verb_tense[tw]
                for t in tmp:
                    if sw==t:
                        end_ind=i
                        end_inds.add(end_ind)

            # (5) personal pronoun matching
            if hw in personal_pronoun:
                tmp=personal_pronoun[hw]
                for t in tmp:
                    if sw==t:
                        start_ind=i
                        start_inds.add(start_ind)

            if tw in personal_pronoun:
                tmp=personal_pronoun[tw]
                for t in tmp:
                    if sw==t:
                        end_ind=i
                        end_inds.add(end_ind)

        
        match_res.append((ss_cut, start_inds, end_inds))

    # determine the final start and end indices
    # loop over all possible combinations of start and end indices and find the one with the maximum overlap inside
    event_inner_cut=event_text.split(' ')[1:-1]
    start_ind=-1
    end_ind=-1
    best_ss_cnt=[]
    inner_match_cnt=0
    for ss_cut, start_inds, end_inds in match_res:
        for s in start_inds:
            for e in end_inds:
                if s<e:
                    cnt=0
                    for _x in event_inner_cut:
                        if _x in ss_cut[s+1:e]:
                            cnt+=1
                    if cnt>=inner_match_cnt:
                        inner_match_cnt=cnt
                        start_ind=s
                        end_ind=e
                        best_ss_cnt=ss_cut

    event_raw_text=' '.join(best_ss_cnt[start_ind:end_ind+1])

    if len(event_raw_text)>0:
        event_processed.append((text_id, event, event_raw_text, sent))
    if len(event_raw_text)==0:
        event_unprocessed.append((text_id, event,'',sent))


# tracking
print("Total events: %d, processed events: %d, ratio: %.2f%%"%(len(event_sent_map), len(event_processed), 100*len(event_processed)/len(event_sent_map)))

print("Unprocessed Samples: ")

__tmp=random.sample([(t,e,s) for t,e,_,s in event_unprocessed],10)
for t,e,s in __tmp:
    print("textID %s: %s == %s"%(t,e.__repr__(),s))
    print('\n')

## Round3

In [None]:
# update: clean sentence

event_processed=[] # (event, raw_event_text, sent)
event_unprocessed=[] # (event,'', sent)

for text_id, event, sent in tqdm.tqdm(event_sent_list[:1000]):
    event_text=event.__repr__()
    hw=event_text.split(' ')[0]
    hw_pos=event.pos_tags[0]
    tw=event_text.split(' ')[-1]
    tw_pos=event.pos_tags[-1]

    sub_sents=clean(sent).split(',')

    match_res=[]

    for ss in sub_sents:
        ss_cut=ss.split(' ')

        start_inds=set()
        end_inds=set()

        for i,sw in enumerate(ss_cut):
            sw=sw.lower()

            # (1) head word and tail word matching
            if sw==hw:
                start_ind=i
                start_inds.add(start_ind)
            if sw==tw:
                end_ind=i
                end_inds.add(end_ind)
            # (2) upper/lower case matching       
            # if sw.lower()==hw:
            #     start_ind=i
            #     start_inds.add(start_ind)

            # if sw.lower()==tw:
            #     end_ind=i
            #     end_inds.add(end_ind)

            # (3) plural matching
            if 'NN' in hw_pos:
                blob = TextBlob(hw)
                tmp=[word.pluralize() for word in blob.words]
                hw_plural = tmp[0] if len(tmp)>0 else ''
                if sw==hw_plural:
                    start_ind=i
                    start_inds.add(start_ind)

            if 'NN' in tw_pos:
                blob = TextBlob(tw)
                tmp=[word.pluralize() for word in blob.words]
                tw_plural = tmp[0] if len(tmp)>0 else ''
                if sw==tw_plural:
                    end_ind=i
                    end_inds.add(end_ind)

            # (4) verb-tense matching
            if 'VB' in hw_pos and hw in verb_tense:
                tmp=verb_tense[hw]
                for t in tmp:
                    if sw==t:
                        start_ind=i
                        start_inds.add(start_ind)

            if 'VB' in tw_pos and tw in verb_tense:
                tmp=verb_tense[tw]
                for t in tmp:
                    if sw==t:
                        end_ind=i
                        end_inds.add(end_ind)

            # (5) personal pronoun matching
            if hw in personal_pronoun:
                tmp=personal_pronoun[hw]
                for t in tmp:
                    if sw==t:
                        start_ind=i
                        start_inds.add(start_ind)

            if tw in personal_pronoun:
                tmp=personal_pronoun[tw]
                for t in tmp:
                    if sw==t:
                        end_ind=i
                        end_inds.add(end_ind)

        
        match_res.append((ss_cut, start_inds, end_inds))

    # determine the final start and end indices
    # loop over all possible combinations of start and end indices and find the one with the maximum overlap inside
    event_inner_cut=event_text.split(' ')[1:-1]
    start_ind=-1
    end_ind=-1
    best_ss_cnt=[]
    inner_match_cnt=0
    for ss_cut, start_inds, end_inds in match_res:
        for s in start_inds:
            for e in end_inds:
                if s<e:
                    cnt=0
                    for _x in event_inner_cut:
                        if _x in ss_cut[s+1:e]:
                            cnt+=1
                    if cnt>=inner_match_cnt:
                        inner_match_cnt=cnt
                        start_ind=s
                        end_ind=e
                        best_ss_cnt=ss_cut

    event_raw_text=' '.join(best_ss_cnt[start_ind:end_ind+1])

    if len(event_raw_text)>0:
        event_processed.append((text_id, event, event_raw_text, sent))
    if len(event_raw_text)==0:
        event_unprocessed.append((text_id, event,'',sent))


# tracking
print("Total events: %d, processed events: %d, ratio: %.2f%%"%(len(event_sent_map), len(event_processed), 100*len(event_processed)/len(event_sent_map)))

print("Unprocessed Samples: ")

__tmp=random.sample([(t,e,s) for t,e,_,s in event_unprocessed],10)
for t,e,s in __tmp:
    print("textID %s: %s == %s"%(t,e.__repr__(),s))
    print('\n')

## Round4

In [None]:
# update: process ;:

event_processed=[] # (event, raw_event_text, sent)
event_unprocessed=[] # (event,'', sent)

for text_id, event, sent in tqdm.tqdm(event_sent_list[:1000]):
    event_text=event.__repr__()
    hw=event_text.split(' ')[0]
    hw_pos=event.pos_tags[0]
    tw=event_text.split(' ')[-1]
    tw_pos=event.pos_tags[-1]

    sub_sents=re.split('\.|,', clean(sent))

    match_res=[]

    for ss in sub_sents:
        ss_cut=ss.split(' ')

        start_inds=set()
        end_inds=set()

        for i,sw in enumerate(ss_cut):
            sw=sw.lower()

            # (1) head word and tail word matching
            if sw==hw:
                start_ind=i
                start_inds.add(start_ind)
            if sw==tw:
                end_ind=i
                end_inds.add(end_ind)
            # (2) upper/lower case matching       
            # if sw.lower()==hw:
            #     start_ind=i
            #     start_inds.add(start_ind)

            # if sw.lower()==tw:
            #     end_ind=i
            #     end_inds.add(end_ind)

            # (3) plural matching
            if 'NN' in hw_pos:
                blob = TextBlob(hw)
                tmp=[word.pluralize() for word in blob.words]
                hw_plural = tmp[0] if len(tmp)>0 else ''
                if sw==hw_plural:
                    start_ind=i
                    start_inds.add(start_ind)

            if 'NN' in tw_pos:
                blob = TextBlob(tw)
                tmp=[word.pluralize() for word in blob.words]
                tw_plural = tmp[0] if len(tmp)>0 else ''
                if sw==tw_plural:
                    end_ind=i
                    end_inds.add(end_ind)

            # (4) verb-tense matching
            if 'VB' in hw_pos and hw in verb_tense:
                tmp=verb_tense[hw]
                for t in tmp:
                    if sw==t:
                        start_ind=i
                        start_inds.add(start_ind)

            if 'VB' in tw_pos and tw in verb_tense:
                tmp=verb_tense[tw]
                for t in tmp:
                    if sw==t:
                        end_ind=i
                        end_inds.add(end_ind)

            # (5) personal pronoun matching
            if hw in personal_pronoun:
                tmp=personal_pronoun[hw]
                for t in tmp:
                    if sw==t:
                        start_ind=i
                        start_inds.add(start_ind)

            if tw in personal_pronoun:
                tmp=personal_pronoun[tw]
                for t in tmp:
                    if sw==t:
                        end_ind=i
                        end_inds.add(end_ind)

        
        match_res.append((ss_cut, start_inds, end_inds))

    # determine the final start and end indices
    # loop over all possible combinations of start and end indices and find the one with the maximum overlap inside
    event_inner_cut=event_text.split(' ')[1:-1]
    start_ind=-1
    end_ind=-1
    best_ss_cnt=[]
    inner_match_cnt=0
    for ss_cut, start_inds, end_inds in match_res:
        for s in start_inds:
            for e in end_inds:
                if s<e:
                    cnt=0
                    for _x in event_inner_cut:
                        if _x in ss_cut[s+1:e]:
                            cnt+=1
                    if cnt>=inner_match_cnt:
                        inner_match_cnt=cnt
                        start_ind=s
                        end_ind=e
                        best_ss_cnt=ss_cut

    event_raw_text=' '.join(best_ss_cnt[start_ind:end_ind+1])

    if len(event_raw_text)>0:
        event_processed.append((text_id, event, event_raw_text, sent))
    if len(event_raw_text)==0:
        event_unprocessed.append((text_id, event,'',sent))


# tracking
print("Total events: %d, processed events: %d, ratio: %.2f%%"%(len(event_sent_map), len(event_processed), 100*len(event_processed)/len(event_sent_map)))

print("Unprocessed Samples: ")

__tmp=random.sample([(t,e,s) for t,e,_,s in event_unprocessed],10)
for t,e,s in __tmp:
    print("textID %s: %s == %s"%(t,e.__repr__(),s))
    print('\n')

    96.6% covering rate, good enough.

# Pipeline

In [None]:
import joblib
import tqdm
import random
from textblob import TextBlob
from mlconjug3 import Conjugator
import re

# get verb tense

def get_verb_forms(conjugator, verb_word: str):
    """
    compute all forms of a verb.

    :param
    :return: tuple, (present, third-person present, past, present continuous, present perfect)
    """
    present, third_person_present, past, present_continuous, present_perfect = '', '', '', '', ''

    try:
        for form in conjugator.conjugate(verb_word):
            if len(form)==4:
                _, tense, sub, verb = form
                if sub in ('we','they','you'):
                    continue
                if tense == 'indicative present' and sub == 'I':
                    present = verb
                if tense == 'indicative present' and sub == 'he/she/it':
                    third_person_present = verb
                if tense == 'indicative past tense' and sub == 'I':
                    past = verb
                if tense == 'indicative present continuous' and sub == 'I':
                    present_continuous = verb
                if tense == 'indicative present perfect' and sub == 'I':
                    present_perfect = verb
            else: # 3
                continue
        stat = 0
    except:
        #print(traceback.format_exc(()))
        stat = -1

    return (present, third_person_present, past, present_continuous, present_perfect), stat

source_dir='./'

conjugator = Conjugator(language='en')

personal_pronoun={
    'i': ['i','I','me'],
    'you': ['you','your'],
    'he': ['he','him','his'],
    'she': ['she','her'],
    'it': ['it','its'],
    'we': ['we','us','our'],
    'they': ['they','them','their'],
}

def clean(sentence:str):
    res=sentence.replace("'ve",' have').replace("'ll",' will').replace("'re",' are').replace("'m",' am').replace("'d",' would').replace("n't",' not')

    res=res.replace("he's",'he has').replace("she's",'she has').replace("it's",'it is')

    res=res.replace("He's",'He has').replace("She's",'She has').replace("It's",'It is')

    res=res.replace("<br />",' ').replace("<br/>",' ')

    res=res.replace("(",' ').replace(")",' ') # this apple (brand is blablabla)

    res=res.replace(";",'.').replace(":",'.') # E.g.: I was excited to find these two teas because they are the perfect compromise: just enough caffeine to keep me going, but not enough to make my heart pound
    
    res=res.replace('"',' ') # E.g.: I do not care for the "Creamy Peanut Butter"

    return res

In [83]:
VERSION = 'v2'

aser={}

tags=['0_100000','100000_200000','200000_300000','300000_400000','400000_500000','500000_600000']
for tag in tags:
    print('Processing %s ......'%tag)
    aser={**aser,**joblib.load(source_dir+'Data/amazon_food_review_aser_event_%s.%s'%(tag,VERSION))
                               }

Processing 0_100000 ......
Processing 100000_200000 ......
Processing 200000_300000 ......
Processing 300000_400000 ......
Processing 400000_500000 ......
Processing 500000_600000 ......


In [None]:
event_sent_list=[] # (text_id, event,sentence)
for text_id, info in aser.items():
    for s,event_list in zip(info['sentences'],info['aser']):
        for e in event_list:
            event_sent_list.append((text_id,e,s))

# get all verbs from events
verbs=set()

for text_id,event, sent in tqdm.tqdm(event_sent_list):

    for w, pt in zip(event.__repr__().split(' '),event.pos_tags):

        if 'VB' in pt:

            verbs.add(w)

verbs=list(verbs)
print("Total verbs: %d"%len(verbs))

verb_tense={}
fail=[]
for v in tqdm.tqdm(verbs):
    tmp,status=get_verb_forms(conjugator,v)
    if status==-1:
        fail.append(v)
    else:
        verb_tense[v]=tmp
print("Failing verbs: %d"%len(fail))


In [None]:
# update: process ;:

event_processed=[] # (text_id, event, raw_event_text, sub_sent, sent)
event_unprocessed=[] # (text_id, event,'', '', sent)

for text_id, event, sent in tqdm.tqdm(event_sent_list):
    event_text=event.__repr__()
    hw=event_text.split(' ')[0]
    hw_pos=event.pos_tags[0]
    tw=event_text.split(' ')[-1]
    tw_pos=event.pos_tags[-1]

    sub_sents=re.split('\.|,', clean(sent))

    match_res=[]

    for ss in sub_sents:
        ss_cut=ss.split(' ')

        start_inds=set()
        end_inds=set()

        for i,sw in enumerate(ss_cut):
            sw=sw.lower()

            # (1) head word and tail word matching
            if sw==hw:
                start_ind=i
                start_inds.add(start_ind)
            if sw==tw:
                end_ind=i
                end_inds.add(end_ind)
            # (2) upper/lower case matching       
            # if sw.lower()==hw:
            #     start_ind=i
            #     start_inds.add(start_ind)

            # if sw.lower()==tw:
            #     end_ind=i
            #     end_inds.add(end_ind)

            # (3) plural matching
            if 'NN' in hw_pos:
                blob = TextBlob(hw)
                tmp=[word.pluralize() for word in blob.words]
                hw_plural = tmp[0] if len(tmp)>0 else ''
                if sw==hw_plural:
                    start_ind=i
                    start_inds.add(start_ind)

            if 'NN' in tw_pos:
                blob = TextBlob(tw)
                tmp=[word.pluralize() for word in blob.words]
                tw_plural = tmp[0] if len(tmp)>0 else ''
                if sw==tw_plural:
                    end_ind=i
                    end_inds.add(end_ind)

            # (4) verb-tense matching
            if 'VB' in hw_pos and hw in verb_tense:
                tmp=verb_tense[hw]
                for t in tmp:
                    if sw==t:
                        start_ind=i
                        start_inds.add(start_ind)

            if 'VB' in tw_pos and tw in verb_tense:
                tmp=verb_tense[tw]
                for t in tmp:
                    if sw==t:
                        end_ind=i
                        end_inds.add(end_ind)

            # (5) personal pronoun matching
            if hw in personal_pronoun:
                tmp=personal_pronoun[hw]
                for t in tmp:
                    if sw==t:
                        start_ind=i
                        start_inds.add(start_ind)

            if tw in personal_pronoun:
                tmp=personal_pronoun[tw]
                for t in tmp:
                    if sw==t:
                        end_ind=i
                        end_inds.add(end_ind)

        
        match_res.append((ss_cut, start_inds, end_inds))

    # determine the final start and end indices
    # loop over all possible combinations of start and end indices and find the one with the maximum overlap inside
    event_inner_cut=event_text.split(' ')[1:-1]
    start_ind=-1
    end_ind=-1
    best_ss_cnt=[]
    inner_match_cnt=0
    for ss_cut, start_inds, end_inds in match_res:
        for s in start_inds:
            for e in end_inds:
                if s<e:
                    cnt=0
                    for _x in event_inner_cut:
                        if _x in ss_cut[s+1:e]:
                            cnt+=1
                    if cnt>=inner_match_cnt:
                        inner_match_cnt=cnt
                        start_ind=s
                        end_ind=e
                        best_ss_cnt=ss_cut

    event_raw_text=' '.join(best_ss_cnt[start_ind:end_ind+1])

    if len(event_raw_text)>0:
        event_processed.append((text_id, event, event_raw_text, ' '.join(best_ss_cnt), sent))
    if len(event_raw_text)==0:
        event_unprocessed.append((text_id, event,'','', sent))


# tracking
print("Total events: %d, processed events: %d, ratio: %.2f%%"%(len(event_sent_list), len(event_processed), 100*len(event_processed)/len(event_sent_map)))

# print("Unprocessed Samples: ")

# __tmp=random.sample([(t,e,s) for t,e,_,s in event_unprocessed],10)
# for t,e,s in __tmp:
#     print("textID %s: %s == %s"%(t,e.__repr__(),s))
#     print('\n')

joblib.dump((event_processed, event_unprocessed),'Data/amazon_food_review_aser_event_restore_set')

In [87]:
print("Total events: %d, processed events: %d, ratio: %.2f%%"%(len(event_sent_list), len(event_processed), 100*len(event_processed)/len(event_sent_list)))


Total events: 4136851, processed events: 3962088, ratio: 95.78%


In [None]:
# {text_id:
#   {event_text: [event_raw_text, sub_sent]}
# }

event_processed, event_unprocessed = joblib.load('Data/amazon_food_review_aser_event_restore_set')


event_processed_map={}

for text_id, event, event_raw_text, sub_sent, sent in event_processed:
    if text_id not in event_processed_map:
        event_processed_map[text_id]={}
    event_processed_map[text_id][event.__repr__()]=[event_raw_text, sub_sent]

In [104]:
len(event_processed_map)

554170

In [105]:
joblib.dump(event_processed_map,'Data/amazon_food_review_aser_event_restore_map')

['Data/amazon_food_review_aser_event_restore_map']

In [100]:
len(event_processed)

3962088

In [None]:
event_processed_map[227801]

In [None]:
# # generate 100 samples for testing 
# cnt=0
# testset={}
# for  text_id, doc_restores in event_processed_map.items():
#     if cnt>100:
#         break
#     testset[text_id]=doc_restores
#     cnt+=1
# joblib.dump(testset,source_dir+'Data/amazon_food_review_aser_event_restore_testset100')

# generate 100 samples for testing 
test_aser={}
test_restore={}
cnt=0
for text_id, info in aser.items():
    if cnt>100:
        break
    if text_id in event_processed_map:
        test_aser[text_id]=info
        test_restore[text_id]=event_processed_map[text_id]
        cnt+=1

joblib.dump((test_aser,test_restore),source_dir+'Data/amazon_food_review_aser_event_restore_testset100.v2')

In [None]:
res=[]
for text_id, info in event_processed_map.items():
    for event, sent_list in info.items():
        res.append(sent_list[0])
    if len(res)>100000:
        break
joblib.dump(res, source_dir+'Data/test001')

# Inquire

        Compute `Restore` given list of sentences and ASER events of a reviewID.

In [None]:
import joblib
import tqdm
import random
from textblob import TextBlob
from mlconjug3 import Conjugator
import re

# get verb tense

def get_verb_forms(conjugator, verb_word: str):
    """
    compute all forms of a verb.

    :param
    :return: tuple, (present, third-person present, past, present continuous, present perfect)
    """
    present, third_person_present, past, present_continuous, present_perfect = '', '', '', '', ''

    try:
        for form in conjugator.conjugate(verb_word):
            if len(form)==4:
                _, tense, sub, verb = form
                if sub in ('we','they','you'):
                    continue
                if tense == 'indicative present' and sub == 'I':
                    present = verb
                if tense == 'indicative present' and sub == 'he/she/it':
                    third_person_present = verb
                if tense == 'indicative past tense' and sub == 'I':
                    past = verb
                if tense == 'indicative present continuous' and sub == 'I':
                    present_continuous = verb
                if tense == 'indicative present perfect' and sub == 'I':
                    present_perfect = verb
            else: # 3
                continue
        stat = 0
    except:
        #print(traceback.format_exc(()))
        stat = -1

    return (present, third_person_present, past, present_continuous, present_perfect), stat

source_dir='./'

conjugator = Conjugator(language='en')

personal_pronoun={
    'i': ['i','I','me'],
    'you': ['you','your'],
    'he': ['he','him','his'],
    'she': ['she','her'],
    'it': ['it','its'],
    'we': ['we','us','our'],
    'they': ['they','them','their'],
}

def clean(sentence:str):
    res=sentence.replace("'ve",' have').replace("'ll",' will').replace("'re",' are').replace("'m",' am').replace("'d",' would').replace("n't",' not')

    res=res.replace("he's",'he has').replace("she's",'she has').replace("it's",'it is')

    res=res.replace("He's",'He has').replace("She's",'She has').replace("It's",'It is')

    res=res.replace("<br />",' ').replace("<br/>",' ')

    res=res.replace("(",' ').replace(")",' ') # this apple (brand is blablabla)

    res=res.replace(";",'.').replace(":",'.') # E.g.: I was excited to find these two teas because they are the perfect compromise: just enough caffeine to keep me going, but not enough to make my heart pound
    
    res=res.replace('"',' ') # E.g.: I do not care for the "Creamy Peanut Butter"

    return res



In [None]:
text_id=-1

sentences= # info['sentences']

aser= # info['aser']

event_sent_list=[]
for s,event_list in zip(sentences,aser):
    for e in event_list:
        event_sent_list.append((text_id,e,s))
        
# get all verbs from events
verbs=set()

for text_id,event, sent in tqdm.tqdm(event_sent_list):

    for w, pt in zip(event.__repr__().split(' '),event.pos_tags):

        if 'VB' in pt:

            verbs.add(w)

verbs=list(verbs)
print("Total verbs: %d"%len(verbs))

verb_tense={}
fail=[]
for v in tqdm.tqdm(verbs):
    tmp,status=get_verb_forms(conjugator,v)
    if status==-1:
        fail.append(v)
    else:
        verb_tense[v]=tmp
print("Failing verbs: %d"%len(fail))



In [None]:
# update: process ;:

event_processed=[] # (text_id, event, raw_event_text, sub_sent, 
#sent)
event_unprocessed=[] # (text_id, event,'', '', sent)

for text_id, event, sent in tqdm.tqdm(event_sent_list):
    event_text=event.__repr__()
    hw=event_text.split(' ')[0]
    hw_pos=event.pos_tags[0]
    tw=event_text.split(' ')[-1]
    tw_pos=event.pos_tags[-1]

    sub_sents=re.split('\.|,', clean(sent))

    match_res=[]

    for ss in sub_sents:
        ss_cut=ss.split(' ')

        start_inds=set()
        end_inds=set()

        for i,sw in enumerate(ss_cut):
            sw=sw.lower()

            # (1) head word and tail word matching
            if sw==hw:
                start_ind=i
                start_inds.add(start_ind)
            if sw==tw:
                end_ind=i
                end_inds.add(end_ind)
            # (2) upper/lower case matching       
            # if sw.lower()==hw:
            #     start_ind=i
            #     start_inds.add(start_ind)

            # if sw.lower()==tw:
            #     end_ind=i
            #     end_inds.add(end_ind)

            # (3) plural matching
            if 'NN' in hw_pos:
                blob = TextBlob(hw)
                tmp=[word.pluralize() for word in blob.words]
                hw_plural = tmp[0] if len(tmp)>0 else ''
                if sw==hw_plural:
                    start_ind=i
                    start_inds.add(start_ind)

            if 'NN' in tw_pos:
                blob = TextBlob(tw)
                tmp=[word.pluralize() for word in blob.words]
                tw_plural = tmp[0] if len(tmp)>0 else ''
                if sw==tw_plural:
                    end_ind=i
                    end_inds.add(end_ind)

            # (4) verb-tense matching
            if 'VB' in hw_pos and hw in verb_tense:
                tmp=verb_tense[hw]
                for t in tmp:
                    if sw==t:
                        start_ind=i
                        start_inds.add(start_ind)

            if 'VB' in tw_pos and tw in verb_tense:
                tmp=verb_tense[tw]
                for t in tmp:
                    if sw==t:
                        end_ind=i
                        end_inds.add(end_ind)

            # (5) personal pronoun matching
            if hw in personal_pronoun:
                tmp=personal_pronoun[hw]
                for t in tmp:
                    if sw==t:
                        start_ind=i
                        start_inds.add(start_ind)

            if tw in personal_pronoun:
                tmp=personal_pronoun[tw]
                for t in tmp:
                    if sw==t:
                        end_ind=i
                        end_inds.add(end_ind)

        
        match_res.append((ss_cut, start_inds, end_inds))

    # determine the final start and end indices
    # loop over all possible combinations of start 
    #and end indices and find the one with the maximum overlap 
    #inside
    event_inner_cut=event_text.split(' ')[1:-1]
    start_ind=-1
    end_ind=-1
    best_ss_cnt=[]
    inner_match_cnt=0
    for ss_cut, start_inds, end_inds in match_res:
        for s in start_inds:
            for e in end_inds:
                if s<e:
                    cnt=0
                    for _x in event_inner_cut:
                        if _x in ss_cut[s+1:e]:
                            cnt+=1
                    if cnt>=inner_match_cnt:
                        inner_match_cnt=cnt
                        start_ind=s
                        end_ind=e
                        best_ss_cnt=ss_cut

    event_raw_text=' '.join(best_ss_cnt[start_ind:end_ind+1])

    if len(event_raw_text)>0:
        event_processed.append((text_id, event, event_raw_text,
                                ' '.join(best_ss_cnt), sent))
    if len(event_raw_text)==0:
        event_unprocessed.append((text_id, event,'','', sent))


# tracking
print("Total events: %d, processed events: %d, ratio: %.2f%%"%
      (len(event_sent_list), len(event_processed), 
       100*len(event_processed)/len(event_sent_map)))

event_processed_map={}

for text_id, event, event_raw_text, sub_sent, sent \
in event_processed:
    if text_id not in event_processed_map:
        event_processed_map[text_id]={}
    event_processed_map[text_id][event.__repr__()]\
    =[event_raw_text, sub_sent]