In [1]:
# Step up one directory before importing: the `src` package import below and
# the relative `vocabulary/...` paths used by later cells are resolved from
# the working directory (presumably the project root — confirm).
# NOTE: `%cd ..` is relative, so re-running this cell moves up another level.
%cd ..
import pandas as pd
from src.fall_regex import process_sentence

c:\Users\User\OneDrive\current_projects_jk\fall_regex_nl


# Experiment 1

Full vocabulary, no post-processing.

In [2]:
# Vocabulary release evaluated in Experiment 1.
vocab_version = 'vocab_v01'

# Read the cue sheet of that workbook; the first spreadsheet column is used
# as the DataFrame index.
vocab_df = pd.read_excel(
    f"vocabulary/{vocab_version}.xlsx",
    sheet_name='fall_cue_VOCAB',
    index_col=0,
)

In [3]:
# Gold label:    FALL
# System output: FALL

sentence = 'pt is gestruikeld over zijn catherslang en heeft hierbij een schaafwond opgelopen.'

# Vocabulary regexes only; no post-processing rules are passed.
process_sentence(sentence, vocab_df["regex"], pp_rules=None)

{'fall': True,
 'vocab_found': True,
 'pp_found': None,
 'vocab_matches': [<re.Match object; span=(6, 17), match='gestruikeld'>],
 'pp_matches': []}

In [4]:
# Gold label:    NO FALL
# System output: FALL  (false positive)

sentence = 'tijdens de maaltijd was de pt luid aan het smakken.'

# Vocabulary regexes only; no post-processing rules are passed.
process_sentence(sentence, vocab_df["regex"], pp_rules=None)

{'fall': True,
 'vocab_found': True,
 'pp_found': None,
 'vocab_matches': [<re.Match object; span=(43, 50), match='smakken'>],
 'pp_matches': []}

# Experiment 2

Modified (smaller) vocabulary, no post-processing.

Based on an analysis of false positives like the one above, 19 vocabulary items for which fewer than 20% of the matches were true positives were removed (including *smakken*). See the report for details.

In [5]:
# Vocabulary release evaluated from Experiment 2 onwards.
# This rebinds `vocab_version` and `vocab_df`; cells below use the new values.
vocab_version = 'vocab_v02'

# Read the cue sheet of that workbook; the first spreadsheet column is used
# as the DataFrame index.
vocab_df = pd.read_excel(
    f"vocabulary/{vocab_version}.xlsx",
    sheet_name='fall_cue_VOCAB',
    index_col=0,
)

In [6]:
# Gold label:    NO FALL
# System output: NO FALL

sentence = 'tijdens de maaltijd was de pt luid aan het smakken.'

# Vocabulary regexes only; no post-processing rules are passed.
process_sentence(sentence, vocab_df["regex"], pp_rules=None)

{'fall': False,
 'vocab_found': False,
 'pp_found': None,
 'vocab_matches': [],
 'pp_matches': []}

In [7]:
# Gold label:    NO FALL
# System output: FALL  (false positive)

sentence = 'Mw voelt zich goed, valt goed af richting streef gewicht.'

# Vocabulary regexes only; no post-processing rules are passed.
process_sentence(sentence, vocab_df["regex"], pp_rules=None)

{'fall': True,
 'vocab_found': True,
 'pp_found': None,
 'vocab_matches': [<re.Match object; span=(20, 24), match='valt'>],
 'pp_matches': []}

# Experiment 3

Modified (smaller) vocabulary, with post-processing rules.

The post-processing rules remove some of the most frequent false positives, e.g. *in slaap vallen* ("to fall asleep") and *afvallen* ("to lose weight"). See the report for details.

In [8]:
# Post-processing rule set used in Experiment 3. The workbook is selected by a
# version string, mirroring how the vocabulary workbooks are loaded above, so
# switching rule sets is a one-line change.
pp_version = 'pp_rules_v01'

# NOTE(review): the sheet name is the same one used for the vocabulary
# workbooks — confirm it is the intended sheet of the rules workbook.
# The first spreadsheet column is used as the DataFrame index.
pp_df = pd.read_excel(
    f"vocabulary/{pp_version}.xlsx",
    sheet_name='fall_cue_VOCAB',
    index_col=0,
)

In [9]:
# Gold label:    NO FALL
# System output: NO FALL

sentence = 'Mw voelt zich goed, valt goed af richting streef gewicht.'

# Vocabulary regexes plus the post-processing rule regexes.
process_sentence(sentence, vocab_df["regex"], pp_rules=pp_df["regex"])

{'fall': False,
 'vocab_found': True,
 'pp_found': True,
 'vocab_matches': [<re.Match object; span=(20, 24), match='valt'>],
 'pp_matches': [<re.Match object; span=(20, 32), match='valt goed af'>]}

In [10]:
# Gold label:    NO FALL
# System output: NO FALL

sentence = 'Mw is zeer traag met eten en drinken valt tussendoor in slaap.'

# Vocabulary regexes plus the post-processing rule regexes.
process_sentence(sentence, vocab_df["regex"], pp_rules=pp_df["regex"])

{'fall': False,
 'vocab_found': True,
 'pp_found': True,
 'vocab_matches': [<re.Match object; span=(37, 41), match='valt'>],
 'pp_matches': [<re.Match object; span=(37, 61), match='valt tussendoor in slaap'>]}