In [38]:
import codecs
import numpy as np
from nltk.tag import pos_tag
import pycrfsuite
from bs4 import BeautifulSoup
from bs4.element import Tag
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [56]:
# Parse the gold-standard corpus. The lenient "html.parser" backend is used,
# which is why tag names are looked up in lower case further down.
CORPUS_PATH = "500newsgoldstandard.xml"
with codecs.open(CORPUS_PATH, mode="r", encoding="utf-8") as corpus_file:
    soup = BeautifulSoup(markup=corpus_file, features="html.parser")

In [57]:
# Show only the beginning of the parsed corpus: printing the full pretty-printed
# document floods the notebook output and bloats the saved file.
PREVIEW_CHARS = 2000
print(soup.prettify()[:PREVIEW_CHARS])

<?xml version="1.0" encoding="UTF-8"?>
<corpus xmlns="http://semweb.unister.de/xml-corpus-schema-2013">
 <document id="0">
  <textwithnamedentities>
   <simpletextpart>
    The U.S. Patent Office allows genes to be patented as soon as someone isolates the DNA by removing it from the cell , says
   </simpletextpart>
   <namedentityintext uri="http://dbpedia.org/resource/American_Civil_Liberties_Union">
    ACLU
   </namedentityintext>
   <simpletextpart>
    attorney
   </simpletextpart>
   <namedentityintext uri="http://aksw.org/notInWiki/Sandra_Park">
    Sandra Park
   </namedentityintext>
   <simpletextpart>
    .
   </simpletextpart>
  </textwithnamedentities>
 </document>
 <document id="1">
  <textwithnamedentities>
   <simpletextpart>
    `` Supporters of these programs make completely unfounded generalizations about boys and girls , but offer no proof that these strategies pay off academically , ''
   </simpletextpart>
   <namedentityintext uri="http://dbpedia.org/resource/Ameri

In [58]:
# Turn every <document> into a list of (token, label) pairs, where the label is
# 'N' for tokens inside a <namedentityintext> element and 'C' for all others.
docs = []
for elem in soup.find_all("document"):
    texts = []
    for child in elem.find("textwithnamedentities").children:
        # Only element nodes carry text parts; skip any non-Tag children.
        if not isinstance(child, Tag):
            continue
        label = 'N' if child.name == "namedentityintext" else 'C'
        # split() without an argument breaks on any run of whitespace
        # (spaces, tabs, newlines) and never yields empty tokens, so stray
        # line breaks inside an element cannot end up glued to a token.
        texts.extend((w, label) for w in child.text.split())
    docs.append(texts)

In [59]:
# Inspect the (token, label) pairs extracted for the first document.
docs[0]

[('The', 'C'),
 ('U.S.', 'C'),
 ('Patent', 'C'),
 ('Office', 'C'),
 ('allows', 'C'),
 ('genes', 'C'),
 ('to', 'C'),
 ('be', 'C'),
 ('patented', 'C'),
 ('as', 'C'),
 ('soon', 'C'),
 ('as', 'C'),
 ('someone', 'C'),
 ('isolates', 'C'),
 ('the', 'C'),
 ('DNA', 'C'),
 ('by', 'C'),
 ('removing', 'C'),
 ('it', 'C'),
 ('from', 'C'),
 ('the', 'C'),
 ('cell', 'C'),
 (',', 'C'),
 ('says', 'C'),
 ('ACLU', 'N'),
 ('attorney', 'C'),
 ('Sandra', 'N'),
 ('Park', 'N'),
 ('.', 'C')]

In [60]:
# Attach a part-of-speech tag to every token, producing one list of
# (token, POS tag, label) triples per document.
data = []
for doc in docs:
    tokens = [token for token, _ in doc]
    tagged = pos_tag(tokens)
    # pos_tag returns (token, tag) pairs in input order; only the tag is needed
    # here because the token and its label are taken from the original pair.
    data.append([(token, pos, label)
                 for (token, label), (_, pos) in zip(doc, tagged)])

In [61]:
# Inspect the (token, POS tag, label) triples built for the first document.
data[0]

[('The', 'DT', 'C'),
 ('U.S.', 'NNP', 'C'),
 ('Patent', 'NNP', 'C'),
 ('Office', 'NNP', 'C'),
 ('allows', 'VBZ', 'C'),
 ('genes', 'NNS', 'C'),
 ('to', 'TO', 'C'),
 ('be', 'VB', 'C'),
 ('patented', 'VBN', 'C'),
 ('as', 'RB', 'C'),
 ('soon', 'RB', 'C'),
 ('as', 'IN', 'C'),
 ('someone', 'NN', 'C'),
 ('isolates', 'VBZ', 'C'),
 ('the', 'DT', 'C'),
 ('DNA', 'NN', 'C'),
 ('by', 'IN', 'C'),
 ('removing', 'VBG', 'C'),
 ('it', 'PRP', 'C'),
 ('from', 'IN', 'C'),
 ('the', 'DT', 'C'),
 ('cell', 'NN', 'C'),
 (',', ',', 'C'),
 ('says', 'VBZ', 'C'),
 ('ACLU', 'NNP', 'N'),
 ('attorney', 'NN', 'C'),
 ('Sandra', 'NNP', 'N'),
 ('Park', 'NNP', 'N'),
 ('.', '.', 'C')]

In [62]:
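# NOTE: superseded dict-based draft of word2features, kept for reference only;
# the active version in the next cell emits a list of feature strings instead.
# def word2features(sent, i):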
#     word = sent[i][0]
#     postag = sent[i][1]
    
#     features = {
#         'bias': 1.0, 
#         'word.lower()': word.lower(), 
#         'word[-3:]': word[-3:],
#         'word[-2:]': word[-2:],
#         'word.isupper()': word.isupper(),
#         'word.istitle()': word.istitle(),
#         'word.isdigit()': word.isdigit(),
#         'postag': postag,
#         'postag[:2]': postag[:2],
#     }
#     if i > 0:
#         word1 = sent[i-1][0]
#         postag1 = sent[i-1][1]
#         features.update({
#             '-1:word.lower()': word1.lower(),
#             '-1:word.istitle()': word1.istitle(),
#             '-1:word.isupper()': word1.isupper(),
#             '-1:postag': postag1,
#             '-1:postag[:2]': postag1[:2],
#         })
#     else:
#         features['BOS'] = True
#     if i < len(sent)-1:
#         word1 = sent[i+1][0]
#         postag1 = sent[i+1][1]
#         features.update({
#             '+1:word.lower()': word1.lower(),
#             '+1:word.istitle()': word1.istitle(),
#             '+1:word.isupper()': word1.isupper(),
#             '+1:postag': postag1,
#             '+1:postag[:2]': postag1[:2],
#         })
#     else:
#         features['EOS'] = True

#     return features

In [67]:
def word2features(doc, i):
    """Build the list of CRF feature strings for the token at index ``i``.

    ``doc`` is a sequence of entries whose first two fields are the word and
    its POS tag. The result describes the token itself (lower-cased form,
    2- and 3-character suffixes, case/digit flags, POS tag) followed by a
    shorter description of its left and right neighbours. A missing left
    neighbour is marked with 'BOS', a missing right neighbour with 'EOS'.
    """
    def describe_neighbour(prefix, entry):
        # Neighbours contribute no suffix features; every feature is tagged
        # with the relative-position prefix ('-1:' or '+1:').
        near_word, near_tag = entry[0], entry[1]
        return [
            prefix + 'word.lower=' + near_word.lower(),
            prefix + 'word.istitle={}'.format(near_word.istitle()),
            prefix + 'word.isupper={}'.format(near_word.isupper()),
            prefix + 'word.isdigit={}'.format(near_word.isdigit()),
            prefix + 'postag=' + near_tag,
        ]

    token, tag = doc[i][0], doc[i][1]

    # Features of the token itself.
    features = [
        'bias',
        'word.lower=' + token.lower(),
        'word[-3:]=' + token[-3:],
        'word[-2:]=' + token[-2:],
    ]
    features += [
        'word.{}={}'.format(check, getattr(token, check)())
        for check in ('isupper', 'istitle', 'isdigit')
    ]
    features.append('postag=' + tag)

    # Left context, or the beginning-of-document marker.
    has_previous = i > 0
    features += describe_neighbour('-1:', doc[i - 1]) if has_previous else ['BOS']

    # Right context, or the end-of-document marker.
    has_next = i < len(doc) - 1
    features += describe_neighbour('+1:', doc[i + 1]) if has_next else ['EOS']

    return features

In [68]:
def extract_features(doc):
    """Return one feature list per token of ``doc``, in token order."""
    per_token = []
    for position in range(len(doc)):
        per_token.append(word2features(doc, position))
    return per_token

def get_labels(doc):
    """Return the label of every (token, postag, label) triple in ``doc``."""
    collected = []
    for _token, _postag, label in doc:
        collected.append(label)
    return collected


# Per-document feature sequences and the matching label sequences.
X = [extract_features(doc) for doc in data]
y = [get_labels(doc) for doc in data]

# Hold out 20% of the documents for evaluation. Fixing random_state makes the
# split (and therefore the reported scores) reproducible across kernel restarts.
SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED)

trainer = pycrfsuite.Trainer(verbose=True)

In [69]:
# Start from a fresh trainer so that re-running this cell does not append the
# training sequences a second time to an already-populated trainer.
trainer = pycrfsuite.Trainer(verbose=True)
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

# Set the parameters of the model
trainer.set_params({
    # coefficient for L1 penalty
    'c1': 0.1,

    # coefficient for L2 penalty
    'c2': 0.01,

    # maximum number of iterations
    'max_iterations': 200,

    # whether to include transitions that
    # are possible, but not observed
    'feature.possible_transitions': True
})

# Fit the model and write it to disk; the evaluation cell loads it by this name.
trainer.train('crf.model')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 14752
Seconds required: 0.050

L-BFGS optimization
c1: 0.100000
c2: 0.010000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 6251.523999
Feature norm: 1.000000
Error norm: 6635.933753
Active features: 13918
Line search trials: 1
Line search step: 0.000045
Seconds required for this iteration: 0.010

***** Iteration #2 *****
Loss: 5121.044963
Feature norm: 0.818911
Error norm: 5874.272204
Active features: 14224
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.005

***** Iteration #3 *****
Loss: 4302.869382
Feature norm: 0.541265
Error norm: 9364.848774
Active features: 13999
Line search trials: 2
Line search step: 0.500000
Seconds required for this 

***** Iteration #58 *****
Loss: 434.365879
Feature norm: 49.030614
Error norm: 65.361189
Active features: 4984
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.006

***** Iteration #59 *****
Loss: 431.462820
Feature norm: 49.490962
Error norm: 49.470468
Active features: 4797
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.005

***** Iteration #60 *****
Loss: 428.991608
Feature norm: 49.702479
Error norm: 53.821615
Active features: 4693
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.006

***** Iteration #61 *****
Loss: 425.874200
Feature norm: 50.093602
Error norm: 31.750028
Active features: 4511
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.005

***** Iteration #62 *****
Loss: 423.305273
Feature norm: 50.439537
Error norm: 82.970747
Active features: 4431
Line search trials: 1
Line search step: 1.000000
Seconds required for thi

***** Iteration #133 *****
Loss: 397.198701
Feature norm: 55.400018
Error norm: 27.623341
Active features: 3161
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.006

***** Iteration #134 *****
Loss: 397.130832
Feature norm: 55.412574
Error norm: 18.495434
Active features: 3164
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.007

***** Iteration #135 *****
Loss: 397.095165
Feature norm: 55.442490
Error norm: 31.485211
Active features: 3171
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.006

***** Iteration #136 *****
Loss: 397.018247
Feature norm: 55.455137
Error norm: 15.198176
Active features: 3162
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.007

***** Iteration #137 *****
Loss: 396.988668
Feature norm: 55.484613
Error norm: 26.529618
Active features: 3158
Line search trials: 1
Line search step: 1.000000
Seconds required fo

In [72]:
# Load the trained model and tag every held-out document.
tagger = pycrfsuite.Tagger()
tagger.open('crf.model')
y_pred = [tagger.tag(xseq) for xseq in X_test]

# Let's take a look at a random sample in the testing set
# (loop names are chosen so that uncommenting this does not rebind the
# notebook-level `y`; feats[1] is the 'word.lower=...' feature of a token)
# i = 12
# for tag, feats in zip(y_pred[i], X_test[i]):
#     print("%s (%s)" % (feats[1].split("=")[1], tag))

# Create a mapping of labels to indices
labels = {"N": 1, "C": 0}

# Convert the sequences of tags into a 1-dimensional array
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in y_test for tag in row])

# Derive the report's class order from the mapping so names and indices cannot
# drift apart, and pass the indices explicitly: without `labels`, the report
# infers the classes from the data and no longer matches `target_names` when a
# class happens to be absent from the test split.
target_names = sorted(labels, key=labels.get)
label_ids = [labels[name] for name in target_names]

# Print out the classification report
print(classification_report(
    truths, predictions,
    labels=label_ids,
    target_names=target_names))

ryan (N)
and (N)
gop (N)
presidential (C)
candidate (C)
mitt (N)
romney (N)
have (C)
attacked (C)
obama (C)
recently (C)
, (C)
including (C)
earlier (C)
in (C)
the (C)
day (C)
at (C)
his (C)
north (C)
canton (C)
rally (C)
, (C)
for (C)
taking (C)
medicare (C)
funds (C)
to (C)
help (C)
pay (C)
for (C)
the (C)
health (C)
care (C)
overhaul (C)
, (C)
even (C)
though (C)
ryan (C)
's (C)
own (C)
budget (C)
proposal (C)
included (C)
the (C)
same (C)
cut (C)
. (C)
              precision    recall  f1-score   support

           C       0.97      0.97      0.97      2455
           N       0.78      0.78      0.78       366

   micro avg       0.94      0.94      0.94      2821
   macro avg       0.88      0.87      0.87      2821
weighted avg       0.94      0.94      0.94      2821

