# IR Lab Tutorial: Lemmatization

This tutorial shows how to configure and use a custom lemmatizer in PyTerrier.

**Attention:** The scenario below is cherry-picked to explain the concept of lemmatization with a minimal example.


In [None]:
# Download the custom token-processing plugin jar into PyTerrier's local directory.
# The target directory may not exist yet on a fresh machine, so create it first;
# otherwise `wget -O` fails because it cannot open the output file.
# NOTE(review): the 1.0-SNAPSHOT build is stored under the 0.0.1 file name, presumably
# so that the "mam10eks:custom-terrier-token-processing:0.0.1" boot package requested
# in pt.init() resolves to this local jar — confirm.
!mkdir -p /root/.pyterrier
!wget https://files.webis.de/software/pyterrier-plugins/custom-terrier-token-processing-1.0-SNAPSHOT-jar-with-dependencies.jar -O /root/.pyterrier/custom-terrier-token-processing-0.0.1.jar

In [None]:
import pyterrier as pt
import pandas as pd

# Pandas display option for column width (presumably so long text columns are
# not truncated in rendered tables — confirm against the pandas version in use).
pd.set_option("display.max_colwidth", 0)

# Start PyTerrier at most once per kernel, loading the custom token-processing
# package so its term-processing classes are available on the Java side.
if not pt.started():
    pt.init(boot_packages=["mam10eks:custom-terrier-token-processing:0.0.1"])

# Import autoclass outside the guard: if PyTerrier was already started (e.g. by
# another cell or a surrounding environment), the guarded branch is skipped and
# `autoclass` would otherwise never be bound, breaking the cells below with a
# NameError. The import still happens after initialization, as before.
from jnius import autoclass

In [None]:
def lemmatize(t):
    """Return the result of ``StanfordLemmatizer.stem`` for the term ``t``.

    A fresh instance of the Java class ``org.terrier.terms.StanfordLemmatizer``
    is created through ``autoclass`` on every call.
    """
    return autoclass("org.terrier.terms.StanfordLemmatizer")().stem(t)


# Print "<term> => <lemmatize(term)>" for a handful of sample terms.
for term in ("are", "producer", "produces", "corpus", "corpora"):
    print(term + " =>", lemmatize(term))

In [None]:
def stem(t):
    """Return the result of ``PorterStemmer.stem`` for the term ``t``.

    A fresh instance of the Java class ``org.terrier.terms.PorterStemmer``
    is created through ``autoclass`` on every call.
    """
    return autoclass("org.terrier.terms.PorterStemmer")().stem(t)


# Print "<term> => <stem(term)>" for the same sample terms used for the lemmatizer.
for term in ("are", "producer", "produces", "corpus", "corpora"):
    print(term + " =>", stem(term))

In [None]:
# Toy collection of four documents; d1 and d2 mention "corpus" (singular).
documents = [
    {"docno": docno, "text": text}
    for docno, text in (
        ("d1", "A corpus is a dataset consisting of language resources."),
        (
            "d2",
            "A corpus may contain documents in a single language or multiple languages.",
        ),
        (
            "d3",
            "A semantic treebank is a collection of natural language sentences annotated with a meaning representation.",
        ),
        (
            "d4",
            "A parallel text places two translation alongside each other which is often used to train machine learning translation.",
        ),
    )
]

# A single query that uses the plural form "corpora".
topics = pd.DataFrame({"qid": ["1"], "query": ["text corpora"]})

# Relevance judgments: d1 and d2 are marked relevant (relevance 1) for query 1.
qrels = pd.DataFrame(
    {
        "qid": ["1", "1"],
        "docno": ["d1", "d2"],
        "relevance": [1, 1],
    }
)

In [None]:
# Index the toy documents with stemmer="PorterStemmer"; overwrite=True is passed
# so that an index already present at /tmp/index does not block re-running the cell.
indexer = pt.IterDictIndexer(
    "/tmp/index",
    overwrite=True,
    stemmer="PorterStemmer",
)
index_ref = indexer.index(documents)
index = pt.IndexFactory.of(index_ref)

# BM25 retrieval over the index built above.
bm25 = pt.BatchRetrieve(index, wmodel="BM25")

# Evaluate with nDCG@5; left as the bare last expression so the notebook displays it.
pt.Experiment(
    [bm25],
    topics,
    qrels,
    eval_metrics=["ndcg_cut_5"],
)

In [None]:
# Re-index the same documents, this time with stemmer="StanfordLemmatizer";
# overwrite=True is passed because /tmp/index is reused as the index location.
indexer = pt.IterDictIndexer(
    "/tmp/index",
    overwrite=True,
    stemmer="StanfordLemmatizer",
)
index_ref = indexer.index(documents)
index = pt.IndexFactory.of(index_ref)

# BM25 retrieval over the index built above.
bm25 = pt.BatchRetrieve(index, wmodel="BM25")

# Evaluate with nDCG@5; left as the bare last expression so the notebook displays it.
pt.Experiment(
    [bm25],
    topics,
    qrels,
    eval_metrics=["ndcg_cut_5"],
)