# IR Lab WiSe 2023: Stemming

This tutorial shows how to use a stemmer in PyTerrier.

**Attention:** The scenario below is cherry-picked to explain the concept of stopword lists with a minimal example.


In [3]:
import pyterrier as pt
import pandas as pd
pd.set_option('display.max_colwidth', 0)

if not pt.started():
    pt.init(boot_packages=['mam10eks:custom-terrier-token-processing:0.0.1'])
    from jnius import autoclass

In [4]:
def stem_porter(t):
    stemmer = autoclass("org.terrier.terms.PorterStemmer")()
    return stemmer.stem(t)

print('are =>', stem_porter('are'))
print('producer =>', stem_porter('producer'))
print('produces =>', stem_porter('produces'))
print('corpora =>', stem_porter('corpus'))

are => ar
producer => produc
produces => produc
corpora => corpu


In [5]:
def stem_krovetz(t):
    stemmer = autoclass("org.terrier.terms.LemurKrovetzStemmer")()
    return stemmer.stem(t)

print('are =>', stem_krovetz('are'))
print('producer =>', stem_krovetz('producer'))
print('produces =>', stem_krovetz('produces'))
print('corpora =>', stem_krovetz('corpus'))

are => are
producer => producer
produces => produce
corpora => corpus


In [8]:
documents = [
    {'docno': 'd1', 'text': 'producer'},
    {'docno': 'd2', 'text': 'produce'},
    {'docno': 'd2', 'text': 'produces'},
    {'docno': 'd4', 'text': 'tbd'},
]

topics = pd.DataFrame([
    {'qid': '1', 'query': 'produces'},
])

qrels = pd.DataFrame([
    {'qid': '1', 'docno': 'd1', 'relevance': 1},
    {'qid': '1', 'docno': 'd2', 'relevance': 1},
    {'qid': '1', 'docno': 'd3', 'relevance': 1},
])

In [9]:
indexer = pt.IterDictIndexer("/tmp/index", overwrite=True, stemmer=None)
index_ref = indexer.index(documents)
index = pt.IndexFactory.of(index_ref)

bm25 = pt.BatchRetrieve(index, wmodel="BM25")

pt.Experiment([bm25], topics, qrels, eval_metrics=['ndcg_cut_5'])

Unnamed: 0,name,ndcg_cut_5
0,BR(BM25),0.469279


In [10]:
indexer = pt.IterDictIndexer("/tmp/index", overwrite=True, stemmer='PorterStemmer')
index_ref = indexer.index(documents)
index = pt.IndexFactory.of(index_ref)

bm25 = pt.BatchRetrieve(index, wmodel="BM25")

pt.Experiment([bm25], topics, qrels, eval_metrics=['ndcg_cut_5'])

Unnamed: 0,name,ndcg_cut_5
0,BR(BM25),0.765361


In [11]:
indexer = pt.IterDictIndexer("/tmp/index", overwrite=True, stemmer='LemurKrovetzStemmer')
index_ref = indexer.index(documents)
index = pt.IndexFactory.of(index_ref)

bm25 = pt.BatchRetrieve(index, wmodel="BM25")

pt.Experiment([bm25], topics, qrels, eval_metrics=['ndcg_cut_5'])

Unnamed: 0,name,ndcg_cut_5
0,BR(BM25),0.469279
