# Automatic Labelling Examples

For more involved examples, see [dtm_analysis_examples.ipynb](./dtm_analysis_examples.ipynb).

In [1]:
from dtm_toolkit.auto_labelling import AutoLabel

## Let's create an example thesaurus

In [13]:
# Toy thesaurus for the examples below: each phrase (key) maps to the label
# (value) it belongs to -- either "sport" or "art".
# Built from ordered (phrase, label) pairs.
thesaurus = dict([
    ("soccer", "sport"),
    ("basketball", "sport"),
    ("painting", "art"),
    ("musician", "art"),
    ("tennis", "sport"),
    ("tennis player", "sport"),
    ("ronaldo", "sport"),
    ("lebron james", "sport"),
    ("james taylor", "art"),
    ("strings", "art"),
])

In [14]:
# Create the labeller from the example thesaurus. Every option is passed as an
# explicit keyword argument so the configuration used in this notebook is
# visible at a glance.
autolabel = AutoLabel(
    thesaurus,
    phrase_col=None,
    label_col=None,
    spacy_lang="en_core_web_sm",
    preprocess=False,
    n_process=None,
    batch_size=256,
)

In [15]:
"""Here we simulate a topic that pertains to sports; we hope it gets labelled correctly.
"""
# A simulated topic given as (number, term) pairs -- presumably each term's
# weight within the topic (TODO confirm against get_topic_labels).
# 'basketball' and 'tennis' are thesaurus phrases labelled "sport";
# 'strings' is a thesaurus phrase labelled "art".
topic = [
    (0.6, 'basketball'),
    (0.04, 'layup'),
    (0.02, 'referee'),
    (0.01, 'court'),
    (0.01, 'tennis'),
    (0.3, 'strings'),
    (0.02, 'run'),
]
labels = autolabel.get_topic_labels(
    topics=[topic],
    score_type="tfidf",
)
labels

[[('sport', 0.06)]]

In [16]:
"""We see that the score for the 'sport' label is marginally higher than the score for 'art'."""
# Same topic and score type, but with raw=True: the recorded output is a
# Counter holding a score for every label rather than only the top label.
autolabel.get_topic_labels(
    topics=[topic],
    raw=True,
    score_type="tfidf",
)

[Counter({'art': 0.04158883083359671, 'sport': 0.05545177444479562})]

In [17]:
"""Let's try this time with the embedding strategy which doesn't rely on exact matches, but instead
on the gloVe word embeddings of each of the top topic terms. We see that 'sport' receives a
much higher score than 'art' in this case.
"""
# No score_type is passed here, so whatever scoring get_topic_labels applies
# by default is used (the recorded run prints "Initialising gloVe embeddings...").
# raw=True again requests the per-label scores.
labels = autolabel.get_topic_labels(
    topics=[topic],
    raw=True,
)
labels

Initialising gloVe embeddings...


[Counter({'art': 0.58206457, 'sport': 0.8463534712791443})]