In [1]:
import nltk
import dill
import tqdm
import pprint
from pathlib import Path
from nltk.corpus import treebank
from nltk.tag import CRFTagger
from nltk.tag import HiddenMarkovModelTrainer
from nltk.tag import PerceptronTagger
from nltk.tag import TnT
# Make sure the `treebank` corpus package is available locally; when it is
# already installed NLTK only reports that it is up to date.
nltk.download('treebank')

# Root directory (relative to the notebook's working directory) under which
# one sub-directory per tagger family is created and the trained models saved.
MODELS_PATH = Path("./models")

[nltk_data] Downloading package treebank to /Users/victor/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


In [2]:
def saveModel(model, path):
    """Serialize ``model`` with dill and write it to ``path``.

    The target is opened in binary write mode, so an existing file at
    ``path`` is overwritten.
    """
    with open(path, "wb") as sink:
        dill.dump(model, sink)

def evaluateModel(testData, model):
    """Score ``model`` on ``testData``, print the accuracy and return it.

    Args:
        testData: gold-standard data passed unchanged to ``model.evaluate``.
        model: any object exposing ``evaluate(gold)`` that returns a number.

    Returns:
        The value returned by ``model.evaluate`` rounded to three decimals.
    """
    # Use the tagger that was passed in. The previous body referenced a name
    # `HMM` that only exists as a local inside trainHMM, so it raised
    # NameError (or silently scored the wrong tagger if such a global existed).
    # NOTE(review): newer NLTK releases appear to deprecate `evaluate` in
    # favour of `accuracy` - confirm against the installed version.
    acc = round(model.evaluate(testData), 3)
    print('accuracy: ' + str(acc))
    return acc

def trainHMM(trainData, modelPath):
    """Fit a hidden Markov model tagger and persist it.

    A fresh ``HiddenMarkovModelTrainer`` is trained in supervised mode on
    ``trainData``; the resulting tagger is written to ``modelPath`` through
    ``saveModel`` and then returned.
    """
    tagger = HiddenMarkovModelTrainer().train_supervised(trainData)
    saveModel(tagger, modelPath)
    return tagger

def trainTnT(trainData, modelPath):
    """Fit a TnT tagger on ``trainData`` and persist it.

    The tagger is created with default settings, trained, written to
    ``modelPath`` through ``saveModel`` and then returned.
    """
    tagger = TnT()
    tagger.train(trainData)
    saveModel(tagger, modelPath)
    return tagger

def trainPerceptron(trainData, modelPath):
    """Fit a perceptron tagger on ``trainData`` and persist it.

    The tagger is built with ``load=False`` (presumably so no pretrained
    weights are loaded - confirm against the NLTK docs), trained, written to
    ``modelPath`` through ``saveModel`` and then returned.
    """
    tagger = PerceptronTagger(load=False)
    tagger.train(trainData)
    saveModel(tagger, modelPath)
    return tagger

def trainCRF(trainData, modelPath):
    """Fit a CRF tagger on ``trainData`` and return it.

    Unlike the other trainers there is no ``saveModel`` call: the target path
    is handed to ``CRFTagger.train`` as a plain string, which presumably
    writes the model file itself - confirm against the NLTK docs.
    """
    tagger = CRFTagger()
    modelFile = str(modelPath)
    tagger.train(trainData, modelFile)
    return tagger


In [3]:
# Registry of trained taggers: one (initially empty) dict per tagger family,
# later filled with {number of training sentences: trained tagger}.
models = {"HMM": {}, "TnT": {}, "PER": {}, "CRF": {}}

# Each family gets its own output directory under MODELS_PATH; re-running the
# cell is harmless because already-existing directories are accepted.
for model in models:
    (MODELS_PATH / model).mkdir(parents=True, exist_ok=True)

In [4]:
# Train every tagger family on growing prefixes of the treebank
# (500, 1000, ..., 3000 sentences) and record each trained tagger in `models`.
for trainingSentences in tqdm.tqdm(range(500, 3001, 500)):
    trainData = treebank.tagged_sents()[:trainingSentences]
    # (registry key == output sub-directory, trainer, output file name),
    # listed in the order the taggers are trained.
    jobs = (
        ("HMM", trainHMM, f'hmm_{trainingSentences}'),
        ("TnT", trainTnT, f'tnt_{trainingSentences}'),
        ("PER", trainPerceptron, f'per_{trainingSentences}'),
        ("CRF", trainCRF, f'crf_{trainingSentences}'),
    )
    for family, trainFn, fileName in jobs:
        models[family][trainingSentences] = trainFn(trainData, MODELS_PATH / family / fileName)
# Emit a blank line before dumping the registry.
print()
pprint.pprint(models)

100%|██████████| 6/6 [03:03<00:00, 30.66s/it]{'CRF': {500: <nltk.tag.crf.CRFTagger object at 0x120f59310>,
         1000: <nltk.tag.crf.CRFTagger object at 0x121327050>,
         1500: <nltk.tag.crf.CRFTagger object at 0x125dcc550>,
         2000: <nltk.tag.crf.CRFTagger object at 0x122dbd190>,
         2500: <nltk.tag.crf.CRFTagger object at 0x12ba7c410>,
         3000: <nltk.tag.crf.CRFTagger object at 0x12957e410>},
 'HMM': {500: <HiddenMarkovModelTagger 42 states and 3305 output symbols>,
         1000: <HiddenMarkovModelTagger 44 states and 5391 output symbols>,
         1500: <HiddenMarkovModelTagger 45 states and 6913 output symbols>,
         2000: <HiddenMarkovModelTagger 46 states and 8579 output symbols>,
         2500: <HiddenMarkovModelTagger 46 states and 9786 output symbols>,
         3000: <HiddenMarkovModelTagger 46 states and 10779 output symbols>},
 'PER': {500: <nltk.tag.perceptron.PerceptronTagger object at 0x120af3f50>,
         1000: <nltk.tag.perceptron.Perceptr

In [None]:
# Held-out evaluation data: everything from sentence 3000 onward, i.e. past
# the largest training prefix (3000 sentences) used in the training cell.
testData = treebank.tagged_sents()[3000:]

# Score every trained tagger on the held-out sentences.
for modelType in models:
    # Index the registry by the current family and walk its
    # (training size, tagger) pairs. Iterating the inner dict directly would
    # only yield the integer keys and fail to unpack, and indexing with the
    # stale global `model` would look at the wrong family.
    for trainingSentences, model in models[modelType].items():
        acc = round(model.evaluate(testData), 3)
        print(f'{modelType} ({trainingSentences} sentences) accuracy: {acc}')