In [1]:
from nltk.corpus import conll2002
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Perceptron
from sklearn.linear_model import LogisticRegression

In [2]:
from nltk.stem.snowball import SpanishStemmer
import re

# Helper Functions for Word Shape

In [3]:
def word_shape(word):
    """Map every character of ``word`` to a coarse shape class.

    Uppercase letters become ``'X'``, lowercase letters become ``'x'`` and
    ASCII digits become ``'d'``. Every other character (punctuation,
    symbols, whitespace) is kept unchanged, so the result has the same
    length as ``word``.

    Besides ASCII, the accented letters of the Latin-1 Supplement block are
    recognised. The ranges skip the multiplication sign (U+00D7) and the
    division sign (U+00F7), which sit between the accented letters, and they
    reach far enough to cover U with diaeresis (U+00DC / U+00FC), which
    occurs in Spanish words.

    Args:
        word: The token text.

    Returns:
        The shape string, e.g. ``'Xxxxxx'`` for ``'Madrid'`` and ``'X-ddx'``
        for ``'A-12b'``.
    """
    # Order matters: the placeholders are themselves letters. Uppercase goes
    # first and digits go last, because the digit placeholder 'd' would be
    # rewritten to 'x' if the lowercase pass ran after it.
    # U+00C0-U+00D6 and U+00D8-U+00DE: accented uppercase letters (no U+00D7).
    word = re.sub('[A-Z\u00c0-\u00d6\u00d8-\u00de]', 'X', word)
    # U+00DF-U+00F6 and U+00F8-U+00FF: accented lowercase letters (no U+00F7).
    word = re.sub('[a-z\u00df-\u00f6\u00f8-\u00ff]', 'x', word)
    word = re.sub('[0-9]', 'd', word)
    return word

In [4]:
def short_word_shape(word):
    """Return the word shape with runs of the same class collapsed.

    The full shape is computed with ``word_shape`` and then scanned left to
    right. Consecutive repeats of ``'X'``, ``'x'`` or ``'d'`` are reduced to
    a single symbol. Any other character is always kept, even when it
    repeats.

    Example: ``"McDonald's-99"`` has the shape ``"XxXxxxxx'x-dd"`` and the
    short shape ``"XxXx'x-d"``; ``"a..b"`` stays ``"x..x"``.

    Args:
        word: The token text.

    Returns:
        The collapsed shape string (empty for an empty ``word``).
    """
    collapsible = ('X', 'x', 'd')

    kept = []
    previous = ''
    for symbol in word_shape(word):
        # Characters outside the three shape classes are never merged;
        # class symbols are emitted only when the class changes.
        if symbol not in collapsible or symbol != previous:
            kept.append(symbol)
        previous = symbol

    return ''.join(kept)

# Get Features

In [35]:
def getfeats(word, pos, o):
    """Build the (name, value) feature pairs for one token of the window.

    Args:
        word: Surface form of the token.
        pos: Part-of-speech tag of the token.
        o: Offset of this token relative to the word being classified. Its
            string form is prepended to every feature name, so the same
            property observed at different offsets gives distinct features.

    Returns:
        A list of ``(feature_name, value)`` tuples covering the word, its
        POS tag, prefixes and suffixes of length 1-4, casing/digit flags,
        hyphen/apostrophe indicators (0 or 1), the Spanish Snowball stem and
        the long and short word shapes.
    """
    offset_tag = str(o)
    stemmer = SpanishStemmer()

    # Feature names are listed without the offset tag; it is prepended in the
    # final comprehension. The spellings 'has_hypen' and 'spanich_stem' are
    # the established feature keys and are kept as they are.
    base_features = (
        ('word', word),
        ('pos', pos),
        ('prefix1', word[:1]),
        ('prefix2', word[:2]),
        ('prefix3', word[:3]),
        ('prefix4', word[:4]),
        ('suffix1', word[-1:]),
        ('suffix2', word[-2:]),
        ('suffix3', word[-3:]),
        ('suffix4', word[-4:]),
        ('is_upper', word.isupper()),
        ('is_title', word.istitle()),
        ('is_digit', word.isdigit()),
        ('has_hypen', 1 if "-" in word else 0),
        ('has_apostrophe', 1 if "'" in word else 0),
        ('spanich_stem', stemmer.stem(word)),
        ('word_shape', word_shape(word)),
        ('short_word_shape', short_word_shape(word)),
    )

    return [(offset_tag + name, value) for name, value in base_features]

In [25]:
def word2features(sent, i, window=2):
    """Generate the feature dict for the token at position ``i`` of ``sent``.

    Features are collected with ``getfeats`` for the token itself and for up
    to ``window`` neighbours on each side. Offsets that fall outside the
    sentence are skipped. The centre token additionally gets the feature
    ``"beginning"`` when it is the first token and ``"ending"`` when it is
    the last token; the only token of a one-token sentence gets both.

    Args:
        sent: Sequence of token tuples whose element 0 is the word and
            element 1 is the POS tag.
        i: Index of the token to describe.
        window: Number of context tokens to include on each side. The
            default of 2 gives the offsets -2..2.

    Returns:
        A dict mapping feature names to values.
    """
    features = []
    last_index = len(sent) - 1

    for o in range(-window, window + 1):
        position = i + o
        if 0 <= position <= last_index:
            word = sent[position][0]
            pos = sent[position][1]
            featlist = getfeats(word, pos, o)
            if o == 0:
                # Two independent checks rather than if/elif, so a token that
                # is both first and last is marked as both.
                if i == 0:
                    featlist.append(("beginning", True))
                if i == last_index:
                    featlist.append(("ending", True))

            features.extend(featlist)

    return dict(features)

# Get Data

In [7]:
# Load the three Spanish CoNLL-2002 files as lists of sentences. Later cells
# read each token's word at index 0, its POS tag at index 1 and its label at
# index -1.
train_sents, dev_sents, test_sents = (
    list(conll2002.iob_sents(fileid))
    for fileid in ('esp.train', 'esp.testa', 'esp.testb')
)

# Add Features to Training Data

In [36]:
# Containers for the per-token feature dicts and gold labels of the training
# split; the two lists are kept index-aligned.
train_feats, train_labels = [], []

In [37]:
# Build the training features and labels from scratch inside this cell.
# Assigning fresh lists (instead of appending to lists created elsewhere)
# keeps the cell idempotent: re-running it no longer duplicates every
# training example.
train_feats = [
    word2features(sent, i)
    for sent in train_sents
    for i in range(len(sent))
]
# The gold label is the last element of each token tuple.
train_labels = [token[-1] for sent in train_sents for token in sent]

assert len(train_feats) == len(train_labels)

In [28]:
# Fit the vectorizer on the training feature dicts and encode them in one
# step. The same fitted vectorizer is reused for the validation features.
# NOTE(review): this cell must be re-run whenever train_feats is rebuilt;
# the execution counts suggest it was last run before the feature cells.
vectorizer = DictVectorizer()
X_train = vectorizer.fit_transform(train_feats)

# Training Model

In [29]:
# Multinomial logistic regression trained on the vectorized token features,
# with a fixed random_state.
# NOTE(review): max_iter is left at its default (100 in the repr printed
# below); confirm that lbfgs converges without a ConvergenceWarning.
model = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')
# Alternative classifier (disabled):
#model = Perceptron(verbose=1)
model.fit(X_train, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=0, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

# Add Features to Validation Data

In [30]:
# Containers for the per-token feature dicts and gold labels of the
# validation split. Despite the "test_" prefix they are filled from
# dev_sents, not test_sents.
test_feats, test_labels = [], []

In [31]:
# Build the validation features and labels from scratch inside this cell.
# Assigning fresh lists (instead of appending to lists created elsewhere)
# keeps the cell idempotent: re-running it no longer duplicates every
# example, which would also misalign y_pred with dev_sents later on.
test_feats = [
    word2features(sent, i)
    for sent in dev_sents
    for i in range(len(sent))
]
# The gold label is the last element of each token tuple.
test_labels = [token[-1] for sent in dev_sents for token in sent]

assert len(test_feats) == len(test_labels)

# Encode with the vectorizer fitted on the training data (transform only).
X_test = vectorizer.transform(test_feats)

In [32]:
# Predict a label for every row of X_test, i.e. for every validation token
# in the order in which test_feats was built.
y_pred = model.predict(X_test)

# Get Output for Validation 

In [33]:
# Running position in y_pred; advanced once per token while the results file
# is written.
j = 0

In [34]:
# Write one "word<TAB>gold<TAB>pred" line per validation token, followed by a
# single empty line after the last sentence.
#
# y_pred is flat while dev_sents is nested, so a running index pairs them up.
# The index is reset here, inside the cell that consumes it, so that
# re-running this cell starts from the first prediction again instead of
# indexing past the end of y_pred.
j = 0
with open("results.txt", "w") as out:
    for sent in dev_sents:
        for token in sent:
            word = token[0]
            gold = token[-1]
            pred = y_pred[j]
            j += 1
            out.write("{}\t{}\t{}\n".format(word, gold, pred))
    out.write("\n")