## Read vocabs from solr 

In [8]:
from collections import defaultdict

with open('lemma/wordlemma-stat-fr.csv', 'r', encoding="utf8") as file:
    d = {line.rstrip('\n').split(',')[0]:int(line.rstrip('\n').split(',')[1]) for line in file.readlines()}
    fr_vocab_lemmas = defaultdict(lambda:0, d)
    print(fr_vocab_lemmas['a'])
    print(fr_vocab_lemmas['ferguson'])

0
417673


In [4]:
from solr_client import SolrClient
solr1 = SolrClient('localhost',8986)
solr2 = SolrClient('be-plw-tst-0050',8983)
solr2.auth("solr","SolrRocks")
solr1.auth("solr","SolrRocks")
#solr2 = SolrClient('localhost',8981)
solr2.search('textsearch', 'STARTVEILIGHEIDSSCHAKELAAR','nl')

{'numFound': 1, 'ids': ['50554053']}

# Stemmers

In [4]:
!python -m spacy download nl_core_news_sm
!python -m spacy download fr_core_news_md

Collecting nl-core-news-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/nl_core_news_sm-3.4.0/nl_core_news_sm-3.4.0-py3-none-any.whl (12.8 MB)


You should consider upgrading via the 'c:\Projects\tvh-search-labeling\venv\Scripts\python.exe -m pip install --upgrade pip' command.


Installing collected packages: nl-core-news-sm
Successfully installed nl-core-news-sm-3.4.0
[+] Download and installation successful
You can now load the package via spacy.load('nl_core_news_sm')


In [24]:
import spacy
from solr_client import SolrClient
from stemmers import Stemmer
from stemmers import SpacyStemmer
from stemmers import TruncateStem


    
solr1 = SolrClient('localhost',8986)

#French stemmers
toy_stemmer = Stemmer(solr1, index = 'textsearch', fieldtype='query', field = 'text_nlp_fr_z', name = 'FR nlp')
lightFr_stemmer = Stemmer(solr1, index = 'textsearch', fieldtype='query', field = 'text_test_fr_z', name = 'FR LightFr')
porterFr_stemmer = Stemmer(solr1, index = 'textsearch', fieldtype='fieldvalue', field = 'text_test_fr_z', name = 'FR Porter')
spacy_stemmer = SpacyStemmer('fr_core_news_md', name = 'FR spicy')

#choose etalon stemmer for operations when building vocab
etalon_stemmer = spacy_stemmer

# Dutch stemmers
porterNL_stemmer = Stemmer(solr1, index = 'textsearch', fieldtype='query', field = 'text_test_nl_z', name='NL Porter')
spacyNL_stemmer = SpacyStemmer('nl_core_news_sm', name='NL spicy')
nl_stemmer = Stemmer(solr1, index = 'textsearch', fieldtype='query', field = 'text_nlp_nl_z', name='NL nlp')

# Truncate stemmers (remove last N characters)
trunc_stemmers = [TruncateStem(i) for i in list(range(1, 7))]

print(toy_stemmer.stem_word("trous"))
print(spacy_stemmer.stem_word("long-métrage"))
print(lightFr_stemmer.stem_word("trous"))
print(porterFr_stemmer.stem_word("long-métrage"))

print(porterNL_stemmer.stem_word("accessoires"))
print(spacyNL_stemmer.stem_word("accessoires"))
print(nl_stemmer.stem_word("accessoires"))

#Stemmers to test
#stemmers=[porterNL_stemmer, spicyNL_stemmer, nl_stemmer]
stemmers=[toy_stemmer, spacy_stemmer, lightFr_stemmer, porterFr_stemmer]
#stemmers=[spacy_stemmer]
#stemmers=[]

stemmers+=trunc_stemmers

trou
long
trous
long-metrag
accessoires
accessoire
accessoire


## Weight function

function returning weight by word frequency


In [21]:

def get_word_weight(word):
    if word in fr_vocab_lemmas.keys():
        lemma = word
    else:
        lemma = etalon_stemmer.stem_word(word)
        
    if fr_vocab_lemmas[lemma] > 1000:
        return 10
    elif fr_vocab_lemmas[lemma] > 0:
        return 1
    return 0.1

print(get_word_weight("barre"))
    

1



### Over-Stemming vs Under-Stemming calculation

Two very useful metrics to quantify a stemmer with are the over and under stemming errors. Under-stemming is when two related words do not reduce to the same stem. We saw that using the previous stemmer, the word *tasting* conflated to the stem *tast* whereas the word *taste* conflated to the stem *taste*. The two words are related inflections but the stemmer doesn't reduce the two to the same.

Similarly, consider the words *red* and *ring*. These two are totally unrelated words. However, using our stemmer, these two are conflated to the same stem i.e. *r*. This is over-stemming i.e. reducing two unrelated words into a same stem.

### Paice's Method

Counting over-stemming and under-stemming errors seems to be a very useful evaluation scheme for stemmers. After all, this method doesn't need the concept of a correct stem and instead only requires that two related words conflate to a same stem and two unrelated words to different. This is precisely what Paice's method does. To illustrate the concept further, let us first create a toy stemmer.

In [25]:
lang='fr'

with open('lemma/concept_groups_{0}.txt'.format(lang), 'r',encoding='utf8') as file:
    concept_groups = [line.rstrip('\n').split(",") for line in file.readlines()]

### Calculate the metrics  𝐺𝐷𝑀𝑇  and  𝐺𝐷𝑁𝑇  for the concept group loaded above.

In [26]:

#concept_groups = concept_groups[0:10]
def GDMT(cg):
    gdmt = 0
    for g in cg:
        ng = len(g)
        gdmt += 0.5 * ng * (ng-1)
    return gdmt

def GDNT(cg):
    gdnt = 0
    W = sum([len(g) for g in cg])
    for g in cg:
        ng = len(g)
        gdnt += 0.5 * ng * (W-ng)
    return gdnt

print ("GDMT = ", GDMT(concept_groups))
print ("GDNT = " , GDNT(concept_groups))

GDMT =  1898.0
GDNT =  8967832.0


In [27]:
from collections import Counter
def GUMT(cg, stemmer, debug=False):
    gumt = 0
    for g in cg:
        if debug:
            print("For group " , repr(g) )
        stems = stemmer.stem_words(g)
        ng = len(g)
        unique_stems = Counter(stems)
        if debug:
            print("Unique stems with their instances are: " ,
             repr(dict(unique_stems)))
        umt = 0
        for unique_stem, count in unique_stems.items():
            umt += 0.5 * count * (ng-count)
        gumt += umt
        if debug:
            print("Unmerged concept pairs in this group = ", umt)

    return gumt

def UI(cg, stemmer):
    return GUMT(cg, stemmer)/GDMT(cg)

for stemmer in stemmers:
    print(stemmer.name, "Understemming Error = %.2f%%" % (UI(concept_groups, stemmer)*100))



FR nlp Understemming Error = 9.91%
FR spicy Understemming Error = 16.28%
FR LightFr Understemming Error = 11.33%
FR Porter Understemming Error = 3.58%
TruncateStem1 Understemming Error = 99.42%
TruncateStem2 Understemming Error = 98.63%
TruncateStem3 Understemming Error = 98.26%
TruncateStem4 Understemming Error = 92.31%
TruncateStem5 Understemming Error = 82.56%
TruncateStem6 Understemming Error = 67.02%


## Wrongly merged concept pairs and Overstemming Error
As discussed earlier, if a non-concept pair is merged together, it is an overstemming error. To count the number of these wrongly merged pairs, a stem group has to be constructed. Where a concept group meant the group of words that should be reduced to a same stem, a stem group is the group of words that actually get reduced to the same stem by the stemmer. For example, consider our concept group from before. It would yield the following stem groups.

In [28]:
from collections import defaultdict
from json import dumps

def stem_groups(cg, stm):
    """
    cg is the concept group
    stm is the stemmer
    """
    stem_group = defaultdict(list)
    stem_group_inverted_cg = defaultdict(list)
    for idx, g in enumerate(cg):
        for word in g:
            stem = stm.stem_word(word)
            stem_group[stem].append(word)
            #if len(stem_group_inverted_cg[stem])>0 and not str(idx) in stem_group_inverted_cg[stem]:
             #   print("overstemming case:",word,stem, ' '.join(str(stem_group_inverted_cg[stem])), str(idx)) 
            stem_group_inverted_cg[stem].append(str(idx))
    return stem_group, stem_group_inverted_cg

#print(dumps(stem_groups(concept_groups, porterFr_stemmer), indent=4))

In [29]:
from collections import Counter
def GWMT(stem_group, stem_group_inverted_cg, debug=False):
    gwmt = 0
    for sg_key in stem_group.keys():
        g = stem_group[sg_key]
        if debug:
            print("For group " , repr(g) )
        ng = len(g)
        unique_stems = Counter( stem_group_inverted_cg[sg_key])
        if debug:
            print("Unique cg with their instances are: " ,
             repr(dict(unique_stems)))
        wmt = 0
        for unique_stem, count in unique_stems.items():
            coef = get_word_weight(unique_stem) # weighted coef.
            wmt += 0.5 * count * (ng-count) * coef            
        gwmt += wmt
        if debug:
            print("Overmerged concept pairs in this group = ", wmt)

    return gwmt

def WI(cg, stm):
    stem_group, stem_group_inverted_cg = stem_groups(cg,stm)
    return GWMT(stem_group, stem_group_inverted_cg)/GDNT(cg)
for stemmer in stemmers:
    print(stemmer.name, "Overstemming Error = %.4f%%" % (WI(concept_groups,  stemmer)*100))


FR nlp Overstemming Error = 0.0019%
FR spicy Overstemming Error = 0.0041%
FR LightFr Overstemming Error = 0.0054%
FR Porter Overstemming Error = 0.0063%
TruncateStem1 Overstemming Error = 0.0031%
TruncateStem2 Overstemming Error = 0.0254%
TruncateStem3 Overstemming Error = 0.2631%
TruncateStem4 Overstemming Error = 1.7744%
TruncateStem5 Overstemming Error = 6.3259%
TruncateStem6 Overstemming Error = 14.9589%
