In [34]:
import nltk
import sklearn_crfsuite
from sklearn_crfsuite import metrics
nltk.download('brown')
from nltk.corpus import brown
# nltk.data.load()
# nltk.data.path.append('/path/to/nltk_data')
# Step 1: Data Preparation
# Split sizes are named so the comments cannot drift from the code.
MAX_SENTENCES = 5000  # upper bound on tagged sentences taken from the 'news' category
TRAIN_SIZE = 4000     # number of leading sentences used for training

# Slicing keeps at most MAX_SENTENCES sentences (fewer if the category is smaller).
corpus = brown.tagged_sents(categories='news')[:MAX_SENTENCES]
train_sents = corpus[:TRAIN_SIZE]  # first TRAIN_SIZE sentences for training
test_sents = corpus[TRAIN_SIZE:]   # everything after the training slice for testing

# An empty hold-out set would make the accuracy reported later meaningless.
assert len(test_sents) > 0, "test split is empty; lower TRAIN_SIZE"


[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\Computer-_-\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [36]:
def word2features(sent, i):
    """Build the CRF feature dictionary for the token at position ``i``.

    Args:
        sent: Sequence of items whose element 0 is the token string (for
            example ``(word, tag)`` pairs). Only element 0 of each item is read.
        i: Index of the token to describe.

    Returns:
        dict: Features of the token itself (constant bias, lower-cased form,
        last three / last two characters, upper/title/digit flags), followed
        by lower/title/upper features of the previous and next tokens.
        ``'BOS'`` stands in for the previous-token features when there is no
        previous token, and ``'EOS'`` stands in for the next-token features
        when there is no next token.
    """
    token = sent[i][0]

    # Features describing the current token on its own.
    features = {'bias': 1.0}
    features['word.lower()'] = token.lower()
    features['word[-3:]'] = token[-3:]
    features['word[-2:]'] = token[-2:]
    features['word.isupper()'] = token.isupper()
    features['word.istitle()'] = token.istitle()
    features['word.isdigit()'] = token.isdigit()

    # Left context, or a beginning-of-sentence marker.
    if i <= 0:
        features['BOS'] = True
    else:
        left = sent[i - 1][0]
        features['prev_word.lower()'] = left.lower()
        features['prev_word.istitle()'] = left.istitle()
        features['prev_word.isupper()'] = left.isupper()

    # Right context, or an end-of-sentence marker.
    if i >= len(sent) - 1:
        features['EOS'] = True
    else:
        right = sent[i + 1][0]
        features['next_word.lower()'] = right.lower()
        features['next_word.istitle()'] = right.istitle()
        features['next_word.isupper()'] = right.isupper()

    return features


In [37]:
def sent2features(sent):
    """Return one ``word2features`` dictionary per position of ``sent``.

    Args:
        sent: Sequence of token items accepted by ``word2features``.

    Returns:
        list: Feature dictionaries in sentence order; empty for an empty sentence.
    """
    feature_dicts = []
    for position in range(len(sent)):
        feature_dicts.append(word2features(sent, position))
    return feature_dicts

def sent2labels(sent):
    """Extract the tag sequence from a sentence of ``(word, tag)`` pairs.

    Args:
        sent: Iterable of two-element ``(word, tag)`` items.

    Returns:
        list: The second element of every pair, in order.
    """
    labels = []
    for _token, label in sent:
        labels.append(label)
    return labels


In [38]:
# Convert every tagged sentence into two parallel sequences:
# per-token feature dicts (model input) and per-token tags (target).
train_features = list(map(sent2features, train_sents))
train_labels = list(map(sent2labels, train_sents))
test_features = list(map(sent2features, test_sents))
test_labels = list(map(sent2labels, test_sents))


In [42]:
# Configure a linear-chain CRF trained with L-BFGS.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,  # L1 regularization coefficient
    c2=0.1,  # L2 regularization coefficient
    max_iterations=100,
    # Also create transition features for tag pairs never seen in training.
    all_possible_transitions=True,
)
# Kept as the last expression so the cell still displays the fitted estimator.
crf.fit(train_features, train_labels)

In [None]:
# Tag the held-out sentences, then score token-level accuracy across all of them.
pred_labels = crf.predict(test_features)
accuracy = metrics.flat_accuracy_score(test_labels, pred_labels)
print(accuracy)

In [None]:
# Step 5: POS Tagging
# NOTE: nltk.word_tokenize needs NLTK's Punkt tokenizer data, which this notebook
# does not download. If a LookupError is raised, run nltk.download('punkt') once
# (newer NLTK releases ask for 'punkt_tab' instead).
sent = "This is a test sentence"

# Keep the original casing: the model was trained with istitle()/isupper()
# features, and word2features already lower-cases the token where it needs to.
tokens = nltk.word_tokenize(sent)

# Featurize the WHOLE sentence as one sequence so each token sees its real
# neighbours and BOS/EOS fire only at the sentence boundaries. Tags are unknown
# at inference time, so an empty placeholder fills the (word, tag) slot.
# A separate name keeps the held-out `test_features` intact for re-evaluation.
sent_features = sent2features([(token, '') for token in tokens])

# predict() takes a batch of sequences; wrap the single sentence in a list,
# so tags[0] is the tag sequence aligned with `tokens`.
tags = crf.predict([sent_features])
print(list(zip(tokens, tags[0])))