# Import libraries

In [1]:
import re
import pandas as pd
import pickle

# Convert the BNC C5 tagset to Universal Dependencies (UD) POS tags

In [2]:
# C5 tags that are matched by exact equality. None of these contains any of
# the substrings tested in _C5_SUBSTRING_TO_UD, so checking them first gives
# the same result as interleaving the tests.
_C5_EXACT_TO_UD = {
    "AT0": "DET",
    "CJC": "CCONJ",
    "CJS": "SCONJ",
    "CJT": "SCONJ",
    "CRD": "NUM",
    "ORD": "NUM",
    "DPS": "PRON",
    "DT0": "DET",
    "DTQ": "DET",
    "EX0": "PRON",
    "ITJ": "INTJ",
    "NN0": "NOUN",
    "NN1": "NOUN",
    "NN2": "NOUN",
    # The C5 proper-noun tag ends in a zero, like AT0/DT0/NN0. The letter-O
    # spelling is kept only so input that used it keeps working.
    "NP0": "PROPN",
    "NPO": "PROPN",
    "POS": "PART",
    "TO0": "PART",
    "XX0": "PART",
    "ZZ0": "PART",
    "UNC": "NOUN",
}

# Substring rules, tried in this order; the first match wins.
_C5_SUBSTRING_TO_UD = (
    ("AJ", "ADJ"),
    ("AV", "ADV"),
    ("PN", "PRON"),
    ("PR", "ADP"),
    ("PU", "PUNCT"),
)


def bnc_to_ud(tag):
    """Map a BNC C5 part-of-speech tag to a UD-style coarse tag.

    Args:
        tag: C5 tag string such as ``"NN1"``, ``"AJ0"`` or ``"VVB"``.

    Returns:
        The mapped tag (``"NOUN"``, ``"ADJ"``, ``"VERB"``, ...), or ``None``
        when no rule applies to ``tag``.

    Rules are applied in three steps: exact matches, then substring matches
    (so a tag that merely contains e.g. ``"AJ"`` maps to ``"ADJ"``), then
    verb tags: ``VV*`` is a lexical verb and every other ``V*`` tag is ``AUX``.
    """
    exact = _C5_EXACT_TO_UD.get(tag)
    if exact is not None:
        return exact
    for fragment, ud_tag in _C5_SUBSTRING_TO_UD:
        if fragment in tag:
            return ud_tag
    if tag.startswith("VV"):
        return "VERB"
    # Require a second character so a bare "V" is reported as unknown instead
    # of raising IndexError.
    if tag.startswith("V") and len(tag) > 1:
        return "AUX"
    return None

In [3]:
# Read the whole BNC lemma file as UTF-8 text and split it into lines;
# `lines` ends up with one list element per line, without line terminators.
with open("../corpus/BNClemma10_3_with_c5.txt", 'r', encoding='utf-8') as file:
    lines = file.read().splitlines()

In [4]:
# Lookup table filled by the parsing loop below:
# lowercased word form -> {tag returned by bnc_to_ud: lowercased lemma}.
word_lemma_dict = {}

In [5]:
# Fill word_lemma_dict from the lemma list. Judging by the parsing below, a
# usable line looks like "lemma -> <TAG>form,<TAG>form,..."; lines without
# "->" are skipped.
for line in lines:
    pieces = line.split("->")
    if len(pieces) < 2:
        continue
    lemma = pieces[0].strip().lower()
    for form in pieces[1].split(","):
        fields = form.split(">")
        raw_tag = fields[0].replace('<', '').strip()
        tag = bnc_to_ud(raw_tag)
        word = fields[1].strip().lower()
        # A later entry with the same word and tag overwrites the earlier one.
        word_lemma_dict.setdefault(word, {})[tag] = lemma

In [6]:
# Number of distinct word forms collected in the table.
len(word_lemma_dict)

23035

In [None]:
# Serialize the word form -> {tag: lemma} table to disk with pickle.
with open('../models/lemma_dict.pkl', 'wb') as f:
    pickle.dump(word_lemma_dict, f)

# Rule-based lemmatizer for plural noun cases

In [7]:
# Irregular-plural tables already unpickled in this session, keyed by path.
_NOUN_EXCEPTION_CACHE = {}


def _load_noun_exceptions(path='../models/noun_exception.pkl'):
    """Return the irregular-plural table stored at *path*, reading it once."""
    if path not in _NOUN_EXCEPTION_CACHE:
        # NOTE: pickle must only be used on trusted files. The default path is
        # the one this notebook writes from noun_exceptions.csv.
        with open(path, 'rb') as handle:
            _NOUN_EXCEPTION_CACHE[path] = pickle.load(handle)
    return _NOUN_EXCEPTION_CACHE[path]


def inflect_noun_singular(word, irregular_dict=None):
    """Return a rule-based singular form of a (plural) noun.

    Args:
        word: The noun to singularize; it is converted to ``str`` and
            lowercased before any rule is applied.
        irregular_dict: Optional mapping of irregular plural -> singular.
            When omitted, the table pickled at
            ``'../models/noun_exception.pkl'`` is used. It is read on first
            use and then kept in memory, so re-creating the pickle within the
            same session requires clearing ``_NOUN_EXCEPTION_CACHE``.

    Returns:
        The lowercased singular guess. Words shorter than two characters,
        words not ending in ``s`` and words of three letters or fewer are
        returned unchanged (after lowercasing).

    Raises:
        OSError: If ``irregular_dict`` is omitted and the pickle cannot be
            opened.

    Suffix rules, applied in order to words longer than three letters:
    ``-ves`` -> ``-f`` (``-fe`` when the stem has at most two letters),
    ``-ies`` -> ``-y``, vowel + ``-ses`` -> drop ``s``, ``-zzes`` -> ``-z``,
    other ``-es`` -> drop ``es``, anything else ending in ``s`` -> drop ``s``.
    These are heuristics and will mis-handle some words.
    """
    vowels = "aeiou"
    word = str(word).lower()
    if len(word) < 2:
        return word
    if irregular_dict is None:
        irregular_dict = _load_noun_exceptions()
    if word in irregular_dict:
        return irregular_dict[word]
    if not word.endswith('s') or len(word) <= 3:
        return word
    # Every rule below rewrites only the suffix. str.replace would also
    # rewrite earlier occurrences ("odysseys" -> "odysey").
    if word.endswith('ves'):
        # leaves -> leaf, wives -> wife
        stem = word[:-3]
        return stem + ('f' if len(stem) > 2 else 'fe')
    if word.endswith('ies'):
        # parties -> party
        return word[:-3] + 'y'
    if word.endswith('es'):
        if word.endswith('ses') and word[-4] in vowels:
            # houses -> house
            return word[:-1]
        if word.endswith('zzes'):
            # quizzes -> quiz
            return word[:-4] + 'z'
        # tomatoes -> tomato
        return word[:-2]
    # Plain plural, including "-ys" (keys -> key): drop the final "s".
    return word[:-1]

In [8]:
# Build the noun exception lookup from the CSV: the column at position 1
# becomes the index (the dict keys). read_csv's `squeeze=True` keyword was
# deprecated in pandas 1.4 and removed in 2.0; squeezing the columns axis of
# the resulting frame is the documented replacement and likewise yields a
# Series when only one data column remains.
dict_from_csv = (
    pd.read_csv('../corpus/noun_exceptions.csv', index_col=1)
    .squeeze("columns")
    .to_dict()
)

In [None]:
# Serialize the noun exception table with pickle; inflect_noun_singular
# loads its irregular-form dictionary from this same path.
with open('../models/noun_exception.pkl', 'wb') as f:
    pickle.dump(dict_from_csv, f)

# Lemmatizer function

In [12]:
def lemmatize(word, pos):
    """Return ``(word, lemma)`` for *word* tagged with part of speech *pos*.

    Args:
        word: The surface form to lemmatize. It is returned unchanged as the
            first element of the result.
        pos: The part-of-speech tag used as key in ``word_lemma_dict``
            (e.g. ``'NOUN'``, ``'VERB'``).

    Returns:
        A 2-tuple ``(word, lemma)``. The lemma comes from ``word_lemma_dict``
        when it has an entry for this word and tag. Otherwise, for
        ``pos == 'NOUN'``, it is the result of ``inflect_noun_singular``.
        Failing both, the lemma is the word itself.
    """
    # word_lemma_dict keys are stored lowercased, so the lookup must be
    # lowercased too or capitalised input never matches.
    by_pos = word_lemma_dict.get(str(word).lower())
    if by_pos is not None and pos in by_pos:
        return (word, by_pos[pos])
    if pos == 'NOUN':
        return (word, inflect_noun_singular(word))
    return (word, word)

In [13]:
# Sample (word, POS) pairs used to try out the lemmatizer.
words = [
    ('living', 'ADJ'),
    ('living', 'NOUN'),
    ('living', 'VERB'),
    ('zeroes', 'NOUN'),
]

In [14]:
# Print the (word, lemma) result for every sample pair.
for word_tuple in words:
    print(lemmatize(*word_tuple))

('living', 'living')
('living', 'living')
('living', 'live')
('zeroes', 'zero')
