### Read data

In [1]:
import psydata

import numpy as np
import pandas as pd

In [2]:
# Size of the random subset to analyse, and a fixed seed so that
# Restart & Run All draws the same subset every time.
N_SAMPLES = 10000
SEED = 42

# Load the sentence-level work file (already split in pipelines).
sentences = psydata.get_workfile("rapportage_sentences")

# Draw a reproducible sample; the size is capped at the number of available
# rows because sampling more rows than exist (without replacement) raises.
sentences = sentences.sample(min(N_SAMPLES, len(sentences)), random_state=SEED)

### Correct spelling

In [3]:
from psynlp.spelling import SpellChecker

In [4]:
# Spell checker constructed with no arguments; the same instance is used for every sentence.
sc = SpellChecker()

In [5]:
# Run the spell checker over every raw sentence and keep the result in a new column.
sentences['text_corrected'] = sentences['text'].apply(sc.correct)

### Find entities

In [6]:
# Entity label -> list of phrases that should be matched for that label.
# These are only some examples. The lists contain spelling variants next to the
# regular forms (e.g. 'diarree' / 'diaree'), and 'angstig' is listed twice under
# 'angst'; all entries are kept exactly as given.
entity_phrases = {
    'abdominaal': [
        'buikpijn', 'diarree', 'buikklachten', 'buikpijnklachten', 'buikkrampen',
        'diaree', 'maagpijn', 'misselijkheid', 'maagklachten', 'misselijkheidsklachten',
    ],
    'angst': [
        'angstig', 'bang', 'angstig', 'anstig', 'angstiger',
        'paniekerig', 'paniek', 'angstklachten',
    ],
    'spanning': [
        'gespannen', 'onrustig', 'prikkelbaar', 'geprikkeld', 'geagiteerd',
        'geladen', 'geaggiteerd',
    ],
    'depressie': [
        'depressie', 'depressies', 'stemmingsklachten', 'somberheidsklachten',
        'stemmingsstoornis', 'depressiviteit', 'depressieve', 'stemmingsproblematiek',
        'stemmingsprobleem', 'stemmingsproblemen', 'stemmings', 'stemmingstoornis',
    ],
    'vermoeid': [
        'vermoeid', 'versufd', 'apatisch', 'energieloos', 'initiatiefloos',
        'lamlendig', 'futloos', 'lusteloos', 'apathisch', 'inactief',
        'duf', 'suf', 'suffig', 'versuft',
    ],
}

In [7]:
from psynlp.entity import BasicEntityMatcher

In [8]:
# Entity matcher built from the entity_phrases dict (label -> list of phrases).
bem = BasicEntityMatcher(entity_phrases)

In [9]:
# Extract entities from each spell-corrected sentence; one result per row.
sentences['entities'] = sentences['text_corrected'].apply(bem.extract_entities)

### Determine context

In [10]:
from psynlp.context import ContextMatcher

In [11]:
# Context matcher constructed with no arguments.
cm = ContextMatcher()

In [None]:
# Determine the context of the entities in every sentence.
# The return value of match_context is not used anywhere: later cells read
# `e.context` from the objects stored in sentences['entities'], so the matcher
# presumably annotates those objects in place.
# NOTE(review): confirm this against psynlp's ContextMatcher.
# A plain loop replaces DataFrame.apply(axis=1): it does not build a row Series
# per sentence and does not echo a Series of unused return values as cell output.
for text, entities in zip(sentences['text_corrected'], sentences['entities']):
    cm.match_context(text, entities)

### Process output into variables

In [13]:
# Total number of matched entities: per-sentence entity counts, summed over all sentences.
num_entities = np.sum(sentences['entities'].apply(len))

print(
    "Total number of entities matched = {}".format(num_entities)
)

Total number of entities matched = 212


In [14]:
## Example: historical depression

def has_depression_historical(entities):
    """Tell whether a collection of entities mentions depression historically.

    An entity counts when its ``rule`` equals ``'depressie'`` and
    ``"HISTORICAL"`` is found in its ``context``. The ``context`` of entities
    with a different rule is never inspected.

    Parameters
    ----------
    entities : iterable
        Objects exposing a ``rule`` attribute and, for depression entities,
        a ``context`` attribute.

    Returns
    -------
    bool
        True when at least one entity qualifies, False otherwise
        (including for an empty input).
    """
    hits = 0
    for entity in entities:
        # Only entities matched by the 'depressie' rule are of interest.
        if entity.rule != 'depressie':
            continue
        # Of those, keep the ones flagged as historical.
        if "HISTORICAL" in entity.context:
            hits += 1

    return hits > 0

# One boolean per sentence: does it contain a historical depression mention?
sentences['historical_depression'] = sentences['entities'].apply(has_depression_historical)

# Summing the boolean column counts the flagged sentences.
print(
    "Total Number of mentions of historical depression = {}".format(
        sentences['historical_depression'].sum()
    )
)

Total Number of mentions of historical depression = 1


In [15]:
## Example: count only relevant entities

def count_relevant_entities(entities):
    """Count the entities whose context carries all four relevance markers.

    An entity is relevant when ``"PATIENT"``, ``"AFFIRMED"``, ``"PLAUSIBLE"``
    and ``"CURRENT"`` are all found in its ``context``.

    Parameters
    ----------
    entities : iterable
        Objects exposing a ``context`` attribute.

    Returns
    -------
    int
        Number of relevant entities (0 for an empty input).
    """
    required_markers = ("PATIENT", "AFFIRMED", "PLAUSIBLE", "CURRENT")

    return sum(
        1
        for entity in entities
        if all(marker in entity.context for marker in required_markers)
    )

# Per-sentence count of entities that pass all relevance filters.
sentences['num_relevant_entities'] = sentences['entities'].apply(count_relevant_entities)

# Grand total over all sentences.
print(
    "Total number of relevant entities = {}".format(
        sentences['num_relevant_entities'].sum()
    )
)

Total number of relevant entities = 167


In [16]:
## Further aggregate on text/patient level
# One row per text_hash:
#  - historical_depression: True when ANY sentence of the text is flagged.
#    (The previous `min` aggregator was True only when EVERY sentence of the
#    text was flagged, which contradicts the intended "any mention".)
#  - num_relevant_entities: relevant entities summed over the text's sentences.
# String aggregator names are used instead of the `min`/`sum` builtins.
texts = sentences.groupby("text_hash").agg({'historical_depression' : 'any', # any mention
                                            'num_relevant_entities' : 'sum'  # count all mentions
                                           }).reset_index()