In [47]:
from collections import Counter
import scipy.stats
import matplotlib.pyplot as plt
import nltk
nltk.download('averaged_perceptron_tagger')
from nltk.corpus.reader.conll import ConllCorpusReader
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
import sklearn_crfsuite
from sklearn_crfsuite import scorers, metrics
from operator import itemgetter
from random import seed
from nltk.stem import LancasterStemmer
import string, regex as re
import sklearn

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\rsury\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [48]:
conll_reader = ConllCorpusReader(root='./', fileids='.conll', columntypes=('words','pos','tree','chunk','ne','srl','ignore'))

In [49]:
def read_wnut(file):
    pos = [nltk.pos_tag(sentence) for sentence in conll_reader.sents(file)]
    label = list(conll_reader.tagged_sents(file))
    return [[(word,pos,label) for (word,pos),(word,label) in zip(pos, label)] for pos,label in zip(pos,label)]

Train_sents = read_wnut("./data/wnut17train.conll")

Dev_sents = read_wnut("./data/emerging.dev.conll")

Test_sents = read_wnut("./data/emerging.test.annotated")

# Baseline Run on the dataset

In [50]:
def isHashtag(token):
    is_hash = re.compile(r'^#\p{Alnum}+$')
    return True if (is_hash.findall(token)) else False 
def isUrl(token):
    is_url = re.compile(r'^https://\S+')
    return True if (is_url.findall(token)) else False
def isUserName(token):
    is_username = re.compile(r'^@\w+')
    return True if (is_username.findall(token)) else False
def isMention(token):
    is_Mention = re.compile(r'^(RT)?@[\p{Alnum}_]+$')
    return True if (is_Mention.findall(token)) else False
    

In [63]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    # print(word, postag, i)

    features = {
        "bias": 1.0,
        'word':word,
        'len(word)': len(word),
        'word[:4]': word[:4],
        'word[:3]': word[:3],
        'word[:2]': word[:2],
        "word[-3:]": word[-3:],
        "word[-2:]": word[-2:],
        "word.isURL()": isUrl(word),
        "word.isHashtag()": isHashtag(word),
        "word.isUserName()": isUserName(word),
        "word.isMention()": isMention(word),
        "word.isdigit()": word.isdigit(),
        "word.lower()": word.lower(),
        "word.istitle()": word.istitle(),
        "word.isupper()": word.isupper(),
        'word.ispunctuation()': (word in string.punctuation),
        'word.stemmed': LancasterStemmer().stem(word),
        "postag": postag,
        "postag[:2]": postag[:2],
    }

    if i > 0:
        word1 = sent[i - 1][0]
        postag1 = sent[i - 1][1]
        features.update(
            {
                # '-1:word': word1,
                # '-1:len(word)': len(word1),
                # '-1:word[:3]': word1[:3],
                # '-1:word[:2]': word1[:2],
                # "-1:word[-3:]": word1[-3:],
                # "-1:word[-2:]": word1[-2:],
                "-1:word.isURL()": isUrl(word1),
                # "-1:word.isHashtag()": isHashtag(word1),
                # "-1:word.isUserName()": isUserName(word1),
                "-1:word.isMention()": isMention(word1),
                # '-1:word.isdigit()': word1.isdigit(),
                "-1:word.lower()": word1.lower(),
                "-1:word.istitle()": word1.istitle(),
                "-1:word.isupper()": word1.isupper(),
                '-1:word.ispunctuation()': (word1 in string.punctuation),
                '-1:word.stemmed': LancasterStemmer().stem(word1),
                "-1:postag": postag1,
                "-1:postag[:2]": postag1[:2],
            }
        )
    else:
        features["BOS"] = True
        
    if i > 1 :
            word1 = sent[i - 2][0]
            postag1 = sent[i - 2][1]
            features.update(
                {
                    # '-2:word': word1,
                    # '-2:len(word)': len(word1),
                    # '-2:word[:3]': word1[:3],
                    # '-2:word[:2]': word1[:2],
                    # "-2:word[-3:]": word1[-3:],
                    # "-2:word[-2:]": word1[-2:],
                    "-2:word.isURL()": isUrl(word1),
                    # "-2:word.isHashtag()": isHashtag(word1),
                    # "-2:word.isUserName()": isUserName(word1),
                    "-2:word.isMention()": isMention(word1),
                    # '-2:word.isdigit()': word1.isdigit(),
                    "-2:word.lower()": word1.lower(),
                    "-2:word.istitle()": word1.istitle(),
                    "-2:word.isupper()": word1.isupper(),
                    '-2:word.ispunctuation()': (word1 in string.punctuation),
                    '-2:word.stemmed': LancasterStemmer().stem(word1),
                    "-2:postag": postag1,
                    "-2:postag[:2]": postag1[:2],
                }
            )
            
    if i < (len(sent) - 1):
        word1 = sent[i + 1][0]
        postag1 = sent[i + 1][1]
        features.update(
            {
                # '+1:word': word1,
                # '+1:len(word)': len(word1),
                # '+1:word[:3]': word1[:3],
                # '+1:word[:2]': word1[:2],
                # "+1:word[-3:]": word1[-3:],
                # "+1:word[-2:]": word1[-2:],
                "+1:word.isURL()": isUrl(word1),
                # "+1:word.isHashtag()": isHashtag(word1),
                # "+1:word.isUserName()": isUserName(word1),
                "+1:word.isMention()": isMention(word1),
                # '+1:word.isdigit()': word1.isdigit(),
                "+1:word.lower()": word1.lower(),
                "+1:word.istitle()": word1.istitle(),
                "+1:word.isupper()": word1.isupper(),
                '+1:word.ispunctuation()': (word1 in string.punctuation),
                '+1:word.stemmed': LancasterStemmer().stem(word1),
                "+1:postag": postag1,
                "+1:postag[:2]": postag1[:2],
            }
        )
    else:
        features["EOS"] = True
        
    if i < (len(sent)-2):
            word1 = sent[i + 2][0]
            postag1 = sent[i + 2][1]
            features.update(
                {
                    # '+2:word': word1,
                    # '+2:len(word)': len(word1),
                    # '+2:word[:3]': word1[:3],
                    # '+2:word[:2]': word1[:2],
                    # "+2:word[-3:]": word1[-3:],
                    # "+2:word[-2:]": word1[-2:],
                    "+2:word.isURL()": isUrl(word1),
                    # "+2:word.isHashtag()": isHashtag(word1),
                    # "+2:word.isUserName()": isUserName(word1),
                    "+2:word.isMention()": isMention(word1),
                    # '+2:word.isdigit()': word1.isdigit(),
                    "+2:word.lower()": word1.lower(),
                    "+2:word.istitle()": word1.istitle(),
                    "+2:word.isupper()": word1.isupper(),
                    '+2:word.ispunctuation()': (word1 in string.punctuation),
                    '+2:word.stemmed': LancasterStemmer().stem(word1),
                    "+2:postag": postag1,
                    "+2:postag[:2]": postag1[:2],
                }
            )

    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [64]:
X_train = [sent2features(s) for s in Train_sents]
y_train = [sent2labels(s) for s in Train_sents]

X_test = [sent2features(s) for s in Test_sents]
y_test = [sent2labels(s) for s in Test_sents]

Dev_X_train = [sent2features(s) for s in Dev_sents]
Dev_y_train = [sent2labels(s) for s in Dev_sents]

In [65]:
seed(42)

crf = sklearn_crfsuite.CRF(
    algorithm="lbfgs", max_iterations=100
)

params_space = {
    "c1": scipy.stats.expon(scale=0.5),
    "c2": scipy.stats.expon(scale=0.05),
    'all_possible_transitions': [True,False],
    # 'all_possible_states' : [False, True]
}

f1_scorer = make_scorer(metrics.flat_f1_score, average='weighted', labels=labels)

rs = RandomizedSearchCV(
    crf, params_space, cv=3, verbose=1, n_jobs=-1, n_iter=50, scoring=f1_scorer
)

# rs.fit(Dev_X_train, Dev_y_train)
rs.fit(X = X_train, y = y_train, X_dev = Dev_X_train, y_dev = Dev_y_train)
# rs.fit(X_train + Dev_X_train, y_train + Dev_y_train )
# y_pred = rs.predict(X_test)


Fitting 3 folds for each of 50 candidates, totalling 150 fits


In [66]:
crf = rs.best_estimator_
y_pred = crf.predict(X_test)  # type: ignore
metrics.flat_f1_score(y_test, y_pred, average="weighted", labels=labels)
print(metrics.flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3))

                 precision    recall  f1-score   support

  B-corporation      0.000     0.000     0.000        66
  I-corporation      0.000     0.000     0.000        22
B-creative-work      0.412     0.049     0.088       142
I-creative-work      0.333     0.060     0.101       218
        B-group      0.276     0.048     0.082       165
        I-group      0.333     0.071     0.118        70
     B-location      0.312     0.233     0.267       150
     I-location      0.256     0.117     0.161        94
       B-person      0.527     0.159     0.244       429
       I-person      0.453     0.260     0.330       131
      B-product      0.200     0.008     0.015       127
      I-product      0.250     0.032     0.056       126

      micro avg      0.385     0.107     0.167      1740
      macro avg      0.279     0.086     0.122      1740
   weighted avg      0.352     0.107     0.154      1740



# Finding after Feature addition

In [58]:
seed(42)

crf_baseline = sklearn_crfsuite.CRF(
    algorithm="lbfgs", max_iterations=100 , all_possible_transitions=True, 
    c1=0.178, c2=0.044
)

f1_scorer = make_scorer(metrics.flat_f1_score, average='weighted')
crf_baseline.fit(X = X_train, y = y_train, X_dev = Dev_X_train, y_dev = Dev_y_train)
# crf_baseline.fit(X_train, y_train)
# crf_baseline.fit(Dev_X_train, Dev_y_train )
labels = list(crf_baseline.classes_)
labels.remove('O')
sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))

y_pred_lbfgs = crf_baseline.predict(X_test)  # type: ignore

print(f"F1 score on the test set = {metrics.flat_f1_score(y_test, y_pred_lbfgs, average='weighted', labels=labels)}")
print(f"Accuracy on the test set = {metrics.flat_accuracy_score(y_test, y_pred_lbfgs)}")
print(f"Test set classification report: \n{metrics.flat_classification_report(y_test, y_pred_lbfgs, labels=sorted_labels, digits=3)}")

F1 score on the test set = 0.16340661989154961
Accuracy on the test set = 0.9284859365649312
Test set classification report: 
                 precision    recall  f1-score   support

  B-corporation      0.000     0.000     0.000        66
  I-corporation      0.000     0.000     0.000        22
B-creative-work      0.364     0.056     0.098       142
I-creative-work      0.238     0.046     0.077       218
        B-group      0.296     0.048     0.083       165
        I-group      0.467     0.100     0.165        70
     B-location      0.366     0.273     0.313       150
     I-location      0.333     0.128     0.185        94
       B-person      0.554     0.168     0.258       429
       I-person      0.493     0.260     0.340       131
      B-product      0.200     0.008     0.015       127
      I-product      0.267     0.032     0.057       126

      micro avg      0.408     0.113     0.177      1740
      macro avg      0.298     0.093     0.132      1740
   weighted avg  

In [56]:
seed(42)
l2 = sklearn_crfsuite.CRF(algorithm='l2sgd', max_iterations=100, 
                          all_possible_states=False, 
                          all_possible_transitions=False, c2=0.012683987275139579)
l2.fit(X_train, y_train)
# l2.fit(X_train + Dev_X_train, y_train + Dev_y_train)
# l2.fit(X = X_train, y = y_train, X_dev = Dev_X_train, y_dev = Dev_y_train)
# l2.fit(Dev_X_train, Dev_y_train)

labels = list(l2.classes_)
labels.remove('O')

sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
ypred = l2.predict(X_test)

print(f"F1 score on the test set = {metrics.flat_f1_score(y_test, ypred, average='weighted', labels=labels)}")
print(f"Accuracy on the test set = {metrics.flat_accuracy_score(y_test, ypred)}")
print(f"Test set classification report: \n{metrics.flat_classification_report(y_test, ypred, labels=sorted_labels, digits=3)}")

F1 score on the test set = 0.12021132644424928
Accuracy on the test set = 0.929212618620159


  _warn_prf(average, modifier, msg_start, len(result))


Test set classification report: 
                 precision    recall  f1-score   support

  B-corporation      0.000     0.000     0.000        66
  I-corporation      0.000     0.000     0.000        22
B-creative-work      0.500     0.028     0.053       142
I-creative-work      0.571     0.018     0.036       218
        B-group      0.286     0.012     0.023       165
        I-group      0.250     0.029     0.051        70
     B-location      0.373     0.207     0.266       150
     I-location      0.458     0.117     0.186        94
       B-person      0.628     0.114     0.193       429
       I-person      0.600     0.206     0.307       131
      B-product      0.000     0.000     0.000       127
      I-product      1.000     0.024     0.047       126

      micro avg      0.498     0.076     0.133      1740
      macro avg      0.389     0.063     0.097      1740
   weighted avg      0.479     0.076     0.120      1740



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
