In [2]:
from itertools import chain
import nltk
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
import pycrfsuite

# Show the installed scikit-learn version.
print(sklearn.__version__)

0.18.1


In [5]:
import features #functions defining word features
import data_parser #function(s) to load train and test data from .txt files


In [None]:
# Each dataset is a list of sentences, and each sentence is a list of
# (word, category, label) tuples, one per word in the sentence.
#   label    - the entity of the word. For the current training datasets the
#              labels are Date (date), Num (number of tickets),
#              Dest (destination) and Src (source location).
#   category - retrieved from the Hindi WordNet database: N for noun,
#              V for verb, AV for adverb, AJ for adjective, and X when the
#              word is not found in the database.
train_sents = data_parser.load('train.txt')
test_sents = data_parser.load('test.txt')

# Dictionary of word frequencies in the training file.
freq = features.frequencies('train.txt')
        

In [None]:
def word2features(sent, i):
    """Build the CRF feature list for the word at position ``i`` of ``sent``.

    Parameters
    ----------
    sent : sequence of (word, category, label) tuples
        One sentence. Only the word (index 0) and category (index 1) of each
        token are read here.
    i : int
        Index of the word to describe.

    Returns
    -------
    list of str
        ``'bias'`` followed by the ``word=``, ``word.isdigit=``, ``category=``
        and ``freq=`` features of the current word; then the same four
        features prefixed with ``'-1:'`` for the previous word (or ``'BOS'``
        when ``i`` is the first position); then the same four prefixed with
        ``'+1:'`` for the next word (or ``'EOS'`` when ``i`` is the last
        position).

    Notes
    -----
    Reads the module-level ``features`` helper module (``features.isdigit``)
    and the module-level ``freq`` word-frequency mapping; ``freq`` is assumed
    to be dict-like (supports ``.get``).
    """
    def token_feats(token, prefix=''):
        # One shared template for the current, previous and next word, so the
        # neighbour features always describe the neighbour itself.
        word, category = token[0], token[1]
        return [
            prefix + 'word=' + word,
            prefix + 'word.isdigit=%s' % features.isdigit(word),
            prefix + 'category=' + str(category),
            # str(): the frequency is concatenated into a string feature.
            # .get(..., 0): words missing from ``freq`` count as frequency 0
            # instead of raising KeyError.
            prefix + 'freq=' + str(freq.get(word, 0)),
        ]

    # The list is deliberately NOT called ``features``: a local with that name
    # would shadow the imported ``features`` module inside this function and
    # make ``features.isdigit`` fail.
    feats = ['bias'] + token_feats(sent[i])

    if i > 0:
        feats.extend(token_feats(sent[i - 1], '-1:'))
    else:
        feats.append('BOS')  # beginning of sentence

    if i < len(sent) - 1:
        feats.extend(token_feats(sent[i + 1], '+1:'))
    else:
        feats.append('EOS')  # end of sentence

    return feats


def sent2features(sent):
    """Return one feature list per word of ``sent``, in sentence order."""
    feature_rows = []
    for position in range(len(sent)):
        feature_rows.append(word2features(sent, position))
    return feature_rows

def sent2labels(sent):
    """Return the label of every (word, category, label) token in ``sent``."""
    labels = []
    for _word, _category, label in sent:
        labels.append(label)
    return labels

def sent2tokens(sent):
    """Return the word of every (word, category, label) token in ``sent``."""
    tokens = []
    for word, _category, _label in sent:
        tokens.append(word)
    return tokens

In [None]:
# Sanity check: display the features computed for the first word of the first training sentence.
sent2features(train_sents[0])[0]

In [None]:
# Convert every sentence into its feature sequence (X) and label sequence (y).
X_train = list(map(sent2features, train_sents))
y_train = list(map(sent2labels, train_sents))

X_test = list(map(sent2features, test_sents))
y_test = list(map(sent2labels, test_sents))

In [None]:
# Create the CRF trainer with verbose output turned off.
trainer = pycrfsuite.Trainer(verbose=False)

# Add every training sentence as a (feature sequence, label sequence) pair.
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)


In [None]:
# Training hyper-parameters passed to the trainer.
trainer.set_params({
    'c1': 1.0,   # coefficient for the L1 penalty
    'c2': 1e-3,  # coefficient for the L2 penalty
    'max_iterations': 50,  # cap on training iterations, so training stops earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})



In [None]:
trainer.params()  # list the set of possible trainer parameters


In [None]:
%%time
trainer.train('hindiNER.crfsuite')  # train the model and save it to the file 'hindiNER.crfsuite'

In [None]:
# Create a tagger and open the model file 'hindiNER.crfsuite' with it.
tagger = pycrfsuite.Tagger()
tagger.open('hindiNER.crfsuite')

In [None]:
# Print the first test sentence, followed by a blank line.
example_sent = test_sents[0]
print(' '.join(sent2tokens(example_sent)), end='\n\n')

# Print the tagger's predicted labels next to the labels stored in the test data.
print("Predicted:", ' '.join(tagger.tag(sent2features(example_sent))))
print("Correct:  ", ' '.join(sent2labels(example_sent)))

In [None]:
# Tag every test sentence's feature sequence with the tagger.
y_pred = list(map(tagger.tag, X_test))

In [None]:
#Evaluate model performance