In [1]:
from nltk.corpus import conll2002
from sklearn.feature_extraction import DictVectorizer
from nltk.stem.snowball import SpanishStemmer
import re
import pycrfsuite

# Helper Functions for Word Shape

In [2]:
def word_shape(word):
    """Map every character of ``word`` to a coarse shape symbol.

    Uppercase letters become ``'X'``, lowercase letters become ``'x'`` and
    ASCII digits become ``'d'``. Any other character (punctuation, symbols,
    whitespace) is left unchanged, so the result has the same length as
    the input.

    Args:
        word: The token to convert.

    Returns:
        The shape string, e.g. ``'EFE-2000'`` -> ``'XXX-dddd'``.
    """
    # Accented letters are limited to the Latin-1 block. The ranges are split
    # around U+00D7 (multiplication sign) and U+00F7 (division sign), which
    # sit between the letters but are symbols, and they run to the end of the
    # letter block so that 'Ü'/'ü' (used in Spanish, e.g. "pingüino") count
    # as letters too.
    word = re.sub('[A-ZÀ-ÖØ-Þ]', 'X', word)
    word = re.sub('[a-zß-öø-ÿ]', 'x', word)
    word = re.sub('[0-9]', 'd', word)
    return word

In [3]:
def short_word_shape(word):
    """Return the word shape of ``word`` with repeated symbols collapsed.

    The full shape is computed with ``word_shape``. Consecutive runs of the
    same shape symbol (``'X'``, ``'x'`` or ``'d'``) are reduced to a single
    symbol, while every other character is always kept, even when it
    repeats.

    Args:
        word: The token to convert.

    Returns:
        The collapsed shape string.
    """
    collapsed = []
    previous = ''
    for symbol in word_shape(word):
        # Only the three shape symbols are collapsible; anything else is
        # emitted unconditionally.
        is_shape_symbol = symbol in ('X', 'x', 'd')
        if not (is_shape_symbol and symbol == previous):
            collapsed.append(symbol)
        previous = symbol
    return ''.join(collapsed)

# Get Features

In [4]:
# Shared stemmer instance, created on first use by _get_spanish_stemmer().
_SPANISH_STEMMER = None


def _get_spanish_stemmer():
    """Return the shared SpanishStemmer, building it the first time only."""
    global _SPANISH_STEMMER
    if _SPANISH_STEMMER is None:
        _SPANISH_STEMMER = SpanishStemmer()
    return _SPANISH_STEMMER


def getfeats(word, pos, o):
    """Build the features of one token at a given window offset.

    Args:
        word: The token text.
        pos: The part-of-speech tag of the token.
        o: Offset of this token relative to the instance token. It is
            converted with ``str`` and prepended to every feature name.

    Returns:
        A list of ``(feature_name, value)`` tuples.
    """
    # The stemmer is reused across calls instead of being rebuilt for every
    # token. NOTE(review): this relies on SpanishStemmer.stem() keeping no
    # state between calls - confirm against the NLTK version in use.
    stemmer = _get_spanish_stemmer()

    prefix = str(o)
    # The feature names 'has_hypen' and 'spanich_stem' are misspelled in the
    # original feature set; they are kept as-is so feature names stay stable.
    return [
        (prefix + 'word', word),
        (prefix + 'pos', pos),
        (prefix + 'prefix1', word[:1]),
        (prefix + 'prefix2', word[:2]),
        (prefix + 'prefix3', word[:3]),
        (prefix + 'prefix4', word[:4]),
        (prefix + 'suffix1', word[-1:]),
        (prefix + 'suffix2', word[-2:]),
        (prefix + 'suffix3', word[-3:]),
        (prefix + 'suffix4', word[-4:]),
        (prefix + 'is_upper', word.isupper()),
        (prefix + 'is_title', word.istitle()),
        (prefix + 'is_digit', word.isdigit()),
        # Membership flags are stored as the integers 0 / 1.
        (prefix + 'has_hypen', int("-" in word)),
        (prefix + 'has_apostrophe', int("'" in word)),
        (prefix + 'spanich_stem', stemmer.stem(word)),
        (prefix + 'word_shape', word_shape(word)),
        (prefix + 'short_word_shape', short_word_shape(word))
    ]

In [5]:
def word2features(sent, i):
    """Generate all features for the token at position ``i`` of ``sent``.

    Features are collected for the token itself and for up to two
    neighbours on each side; offsets that fall outside the sentence are
    skipped. The instance token additionally gets a ``"beginning"`` feature
    when it is the first token and an ``"ending"`` feature when it is the
    last one. A one-token sentence receives both.

    Args:
        sent: A sequence of tokens; element 0 of each token is the word and
            element 1 is its part-of-speech tag.
        i: Index of the instance token within ``sent``.

    Returns:
        A dict mapping feature names to values.
    """
    last_index = len(sent) - 1
    features = []
    # The window around the token.
    for o in (-2, -1, 0, 1, 2):
        j = i + o
        if not 0 <= j < len(sent):
            continue
        word = sent[j][0]
        pos = sent[j][1]
        featlist = getfeats(word, pos, o)
        if o == 0:
            # Two independent checks: with an if/elif chain the only token
            # of a one-token sentence would never be marked as "ending".
            if i == 0:
                featlist.append(("beginning", True))
            if i == last_index:
                featlist.append(("ending", True))
        features.extend(featlist)

    return dict(features)

# Get Data, Combine Training and Validation Data for Training

In [6]:
# Read the development and test splits of the CoNLL-2002 Spanish corpus.
dev_sents = list(conll2002.iob_sents('esp.testa'))
test_sents = list(conll2002.iob_sents('esp.testb'))

# The training set is the 'esp.train' split followed by the development split.
train_sents = list(conll2002.iob_sents('esp.train')) + dev_sents

In [7]:
# Empty containers for the per-sentence feature sequences and label
# sequences of the training set; they are populated by the following cell.
train_feats, train_labels = [], []

In [8]:
# Build one sequence of feature dicts and one sequence of labels per training
# sentence. Both lists are rebuilt from scratch (not appended to), so
# re-running this cell cannot duplicate the training data.
train_feats = [
    [word2features(sent, i) for i in range(len(sent))]
    for sent in train_sents
]
# The label is the last element of every token.
train_labels = [[token[-1] for token in sent] for sent in train_sents]

# Unconstrained CRF

In [9]:
# Create the CRF trainer used for the rest of this section, with the
# `verbose` option switched off.
trainer = pycrfsuite.Trainer(verbose=False)

In [10]:
# Register every training sentence with the trainer as a pair of
# (feature sequence, label sequence).
for sent_feats, sent_labels in zip(train_feats, train_labels):
    trainer.append(sent_feats, sent_labels)

In [11]:
# Set the training hyper-parameters on the trainer. The keys are CRFsuite
# parameter names; see the CRFsuite manual for their exact meaning.
# NOTE(review): a later cell calls trainer.select('l2sgd') after this one.
# Confirm that these values are still in effect once the algorithm has been
# selected, and that every key is valid for that algorithm.
trainer.set_params({
    'c1': 0.1,   
    'c2': 0.1,  
    'max_iterations': 100,  
    'feature.possible_transitions': True
})

In [13]:
# Trainer.select() (re)initialises the training algorithm, so it must run
# BEFORE the hyper-parameters are set: values configured earlier were applied
# to the previously selected algorithm and do not carry over.
trainer.select('l2sgd')

# Apply the hyper-parameters to the freshly selected l2sgd trainer. 'c1' is
# left out on purpose: l2sgd uses L2 regularisation only and has no such
# parameter.
trainer.set_params({
    'c2': 0.1,
    'max_iterations': 100,
    'feature.possible_transitions': True
})

In [14]:
# Train on the sequences appended above; the file name passed here is the
# same one the tagger opens in the next cell.
trainer.train('conll2002-esp.crfsuite')

In [15]:
# Create a tagger and open the model file named in the trainer.train() call
# above.
tagger = pycrfsuite.Tagger()
tagger.open('conll2002-esp.crfsuite')

<contextlib.closing at 0x1a1398e630>

In [20]:
# Empty containers for the per-sentence feature sequences and label
# sequences of the test set; they are populated by the following cell.
test_feats, test_labels = [], []

In [21]:
# Build one sequence of feature dicts and one sequence of labels per test
# sentence. Both lists are rebuilt from scratch (not appended to), so
# re-running this cell cannot duplicate the test data.
test_feats = [
    [word2features(sent, i) for i in range(len(sent))]
    for sent in test_sents
]
# The label is the last element of every token.
test_labels = [[token[-1] for token in sent] for sent in test_sents]

In [22]:
# Tag every test sentence: one tagger.tag() result per feature sequence, in
# the same order as test_feats.
y_pred = [tagger.tag(xseq) for xseq in test_feats]

In [28]:
# Write the predictions as one "word<TAB>gold<TAB>predicted" line per token,
# where the word is the first and the gold label the last element of a token.
with open("unconstrained_results.txt", "w") as out:
    for sent, pred_tags in zip(test_sents, y_pred):
        for token, pred in zip(sent, pred_tags):
            word = token[0]
            gold = token[-1]
            out.write("{}\t{}\t{}\n".format(word, gold, pred))
        # A blank line after every sentence marks the sentence boundary;
        # previously a single blank line was written only once, at the very
        # end of the file.
        out.write("\n")