# (Work in Progress)

The data examined here comes from: https://github.com/aritter/twitter_nlp/blob/master/data/annotated/ner.txt

It is an annotated dataset of tweets often used for named-entity recognition tasks.

I am going to examine a few methods used for named entity recognition, and compare their success.

In [49]:
import pandas as pd

In [50]:
pd.read_csv("data/ritter.txt", sep = "\t")

Unnamed: 0,@paulwalk,O
0,It,O
1,'s,O
2,the,O
3,view,O
4,from,O
...,...,...
43486,come,O
43487,get,O
43488,some,O
43489,grub,O


In [52]:
tweets = []
f = open("data/ritter.txt", "r")
print(f.readline()) 

@paulwalk	O



In [102]:
tweets = []
f = open("data/ritter.txt", "r")
print(f.readlines(100)) 

['@paulwalk\tO\n', 'It\tO\n', "'s\tO\n", 'the\tO\n', 'view\tO\n', 'from\tO\n', 'where\tO\n', 'I\tO\n', "'m\tO\n", 'living\tO\n', 'for\tO\n', 'two\tO\n', 'weeks\tO\n', '.\tO\n', 'Empire\tB-facility\n']


Instead, we'll read in the data as word/tag pairs

In [106]:
tweets = []
this_tweet = []
with open('data/ritter.txt', 'r') as f:
    for line in f:
        if line == "\t\n":
            tweets.append(this_tweet)
            this_tweet = []
        else:
            parts = line.split("\t")
            word = parts[0]
            tag = parts[1].replace("\n", "")
            this_tweet.append((word, tag))

In [107]:
tweets[0]

[('@paulwalk', 'O'),
 ('It', 'O'),
 ("'s", 'O'),
 ('the', 'O'),
 ('view', 'O'),
 ('from', 'O'),
 ('where', 'O'),
 ('I', 'O'),
 ("'m", 'O'),
 ('living', 'O'),
 ('for', 'O'),
 ('two', 'O'),
 ('weeks', 'O'),
 ('.', 'O'),
 ('Empire', 'B-facility'),
 ('State', 'I-facility'),
 ('Building', 'I-facility'),
 ('=', 'O'),
 ('ESB', 'B-facility'),
 ('.', 'O'),
 ('Pretty', 'O'),
 ('bad', 'O'),
 ('storm', 'O'),
 ('here', 'O'),
 ('last', 'O'),
 ('evening', 'O'),
 ('.', 'O')]

In [109]:
len(tweets)

2394

We need to divide the tweets into a train/dev/test split.  We'll go for a 60/20/20 split as the dataset isn't huge.

In [146]:
import random
train_indices = random.sample(range(0,len(tweets)), round((len(tweets)*0.6)))
training_set = [tweets[i] for i in train_indices]
len(training_set)

1436

In [147]:
remaining_indices = [i for i in range(0,len(tweets)) if i not in train_indices]

In [150]:
dev_indices = random.sample(remaining_indices, round((len(remaining_indices)*0.5)))
dev_set = [tweets[i] for i in dev_indices]
len(dev_set)

479

In [151]:
test_indices = [i for i in remaining_indices if i not in dev_indices]
test_set = [tweets[i] for i in test_indices]
len(test_set)

479

Next we need to define some features for our dataset.  For the sake of simplicity we'll use these functions from the CRFsuite documentation.  Due to the format of the input data, I've removed the features relating to the POS tag.

In [165]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit()
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper()
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper()
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, label in sent]

def sent2tokens(sent):
    return [token for token, label in sent]

In [166]:
X_train = [sent2features(s) for s in training_set]
y_train = [sent2labels(s) for s in training_set]

X_dev = [sent2features(s) for s in dev_set]
y_dev = [sent2labels(s) for s in dev_set]

The beauty of the CRF model is that it is able to use surrounding predictions to help guide the current prediction.

In [167]:
import sklearn_crfsuite
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [168]:
labels = list(crf.classes_)
labels.remove('O')
labels

['B-company',
 'B-tvshow',
 'I-tvshow',
 'B-geo-loc',
 'B-other',
 'I-other',
 'B-facility',
 'I-facility',
 'I-geo-loc',
 'B-person',
 'I-person',
 'B-product',
 'B-sportsteam',
 'I-sportsteam',
 'I-product',
 'B-musicartist',
 'I-musicartist',
 'I-company',
 'B-movie',
 'I-movie']

In [162]:
from sklearn.metrics import make_scorer
from sklearn_crfsuite import metrics
y_pred = crf.predict(X_dev)
metrics.flat_f1_score(y_dev, y_pred,
                      average='weighted', labels=labels)

0.3062546237485865

In [164]:
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_dev, y_pred, labels=sorted_labels, digits=3
))

               precision    recall  f1-score   support

    B-company      0.923     0.364     0.522        33
    I-company      0.000     0.000     0.000         9
   B-facility      0.533     0.276     0.364        29
   I-facility      0.588     0.323     0.417        31
    B-geo-loc      0.682     0.283     0.400        53
    I-geo-loc      0.000     0.000     0.000        10
      B-movie      1.000     0.167     0.286         6
      I-movie      1.000     0.083     0.154        12
B-musicartist      1.000     0.182     0.308        11
I-musicartist      1.000     0.167     0.286        12
      B-other      0.167     0.068     0.097        44
      I-other      0.074     0.082     0.078        49
     B-person      0.615     0.369     0.462        65
     I-person      0.520     0.481     0.500        27
    B-product      0.714     0.227     0.345        22
    I-product      0.667     0.222     0.333        18
 B-sportsteam      0.500     0.111     0.182         9
 I-sports

  _warn_prf(average, modifier, msg_start, len(result))


Model fit isn't amazing here, and it looks like leaving out the POS tag isn't going to help with model fit.  In Part 2, I will use spaCy to help craft better features and improve my predictions.

In [171]:
import pickle
pickle.dump(training_set, open( "training_set.p", "wb"))
pickle.dump(dev_set, open( "dev_set.p", "wb"))
pickle.dump(test_set, open( "test_set.p", "wb"))