In [1]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

from nltk.tag.stanford import StanfordPOSTagger
from stanford_postagger.stanford_wrapper import StanfordPOSTagger as StanfordPOSTaggerWrapper

from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score

import scipy
from sklearn.grid_search import RandomizedSearchCV



In [2]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before
# each cell runs, so edits to local helper modules are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

# Read Dataset

In [3]:
# Exploratory, inline read of the CoNLL-2003 training file into `dataset`,
# a list of groups, each group being a list of (token, tag) tuples.
# `with` guarantees the handle is closed even if reading fails.
with open('datasets/conll2003/train.txt', 'r') as f:
    lines = f.readlines()

# The first two lines are dropped unconditionally (presumably the leading
# -DOCSTART- marker and the blank line that follows it -- confirm).
del lines[0]
del lines[0]

dataset = []
sentence = []
for line in lines:
    splitter = line.strip().split(' ')
    if splitter[0] == '':
        # Blank separator line: carries no token, and does NOT close a group.
        continue
    elif (splitter[0] == '-DOCSTART-'):
        # A -DOCSTART- marker closes the group collected so far.
        dataset.append(sentence)
        sentence = []
    else:
        # Column 0 is the token, column 3 is the tag kept for training.
        token = splitter[0]
        tag = splitter[3]
        sentence.append((token, tag))

# Fix: the file does not end with a -DOCSTART- line, so the group that is
# still open after the loop must be flushed or the last one is lost.
if sentence:
    dataset.append(sentence)

In [4]:
def convert_conlltxt2dataset(filename):
    """Read a CoNLL-style text file into a list of (token, tag) sequences.

    The first two lines of the file are always skipped (presumably the leading
    ``-DOCSTART-`` marker and the blank line after it -- confirm for other
    files). Every later ``-DOCSTART-`` line closes the sequence collected so
    far. Blank lines are ignored and do not close a sequence, so tokens are
    grouped between ``-DOCSTART-`` markers.

    Args:
        filename: Path of a text file with space-separated columns. Column 0
            is taken as the token and column 3 as the tag.

    Returns:
        A list of sequences; each sequence is a list of ``(token, tag)``
        tuples in file order.

    Raises:
        IndexError: If a non-blank, non-marker line has fewer than four
            space-separated fields.
    """
    # Context manager so the handle is released even if reading fails.
    with open(filename, 'r') as f:
        lines = f.readlines()

    dataset = []
    sentence = []
    # Slicing (rather than deleting lines[0] twice) skips the same two header
    # lines but also tolerates files that are shorter than two lines.
    for line in lines[2:]:
        splitter = line.strip().split(' ')
        if splitter[0] == '':
            continue
        elif splitter[0] == '-DOCSTART-':
            dataset.append(sentence)
            sentence = []
        else:
            token = splitter[0]
            tag = splitter[3]
            sentence.append((token, tag))

    # Fix: flush the sequence that is still open at end of file; previously
    # the last sequence was silently dropped because no marker follows it.
    if sentence:
        dataset.append(sentence)
    return dataset

In [5]:
# Parse the three CoNLL-2003 splits into lists of (token, tag) sequences.
train_dataset = convert_conlltxt2dataset('datasets/conll2003/train.txt')
validation_dataset = convert_conlltxt2dataset('datasets/conll2003/valid.txt')
test_dataset = convert_conlltxt2dataset('datasets/conll2003/test.txt')

In [6]:
# Show the first five (token, tag) pairs of the first training sequence.
train_dataset[0][0:5]

[('EU', 'B-ORG'),
 ('rejects', 'O'),
 ('German', 'B-MISC'),
 ('call', 'O'),
 ('to', 'O')]

# Add POS Tags to the Dataset

## Example

In [7]:
# Instantiate the project's Stanford POS tagger wrapper and tag a raw string
# to see the shape of tag()'s result: a list of (token, POS tag) pairs.
postagger = StanfordPOSTaggerWrapper()
postag = postagger.tag('+44 171')
postag

[('+44', 'CD'), ('171', 'CD')]

In [8]:
def add_postag2dataset(dataset):
    """Insert a POS tag between the token and the label of every entry.

    Each token is sent to the tagger on its own (one ``tag`` call per token)
    and the tag of the first pair returned is kept.

    Args:
        dataset: List of sequences, each a list of ``(token, label)`` tuples.

    Returns:
        A new list with the same nesting whose entries are
        ``(token, pos_tag, label)`` tuples.
    """
    tagger = StanfordPOSTaggerWrapper()
    return [
        [(word, tagger.tag(word)[0][1], label) for word, label in sequence]
        for sequence in dataset
    ]

# POS-tag every split.
postagged_train_dataset = add_postag2dataset(train_dataset)
postagged_validation_dataset = add_postag2dataset(validation_dataset)
postagged_test_dataset = add_postag2dataset(test_dataset)

# The untagged splits are not needed any more; drop the names to free memory.
del train_dataset, validation_dataset, test_dataset

In [9]:
# Show the first five (token, POS tag, label) triples of the first training sequence.
postagged_train_dataset[0][0:5]

[('EU', 'NNP', 'B-ORG'),
 ('rejects', 'VBZ', 'O'),
 ('German', 'JJ', 'B-MISC'),
 ('call', 'NN', 'O'),
 ('to', 'TO', 'O')]

# Define Feature Extraction Functions

In [10]:
def word2features(sent, i):
    """Build the CRF feature dict for the token at position ``i`` of ``sent``.

    Args:
        sent: Sequence whose items expose the token at index 0 and its POS
            tag at index 1.
        i: Position of the token to describe.

    Returns:
        A dict holding, in this order: orthographic / word / POS features of
        the token itself, its position counted from the front and from the
        end, the same kind of features for the previous token (or ``'BOS'``
        when there is none) and for the next token (or ``'EOS'``).
    """
    token = sent[i][0]
    pos = sent[i][1]
    length = len(sent)

    # Orthographic features, the word itself, affixes, POS tag and position.
    feats = {
        'word': token,
        'word.lower()': token.lower(),
        'word[-3:]': token[-3:],
        'word[-2:]': token[-2:],
        'word[:2]': token[:2],
        'word[:3]': token[:3],
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
        'word.isupper()': token.isupper(),
        'postag': pos,
        'postag[:2]': pos[:2],
        'pos_front': i,
        'pos_end': length - i,
    }

    # Left context: previous token, or a beginning-of-sequence marker.
    if i <= 0:
        feats['BOS'] = True
    else:
        prev_token = sent[i - 1][0]
        prev_pos = sent[i - 1][1]
        feats['-1:word.lower()'] = prev_token.lower()
        feats['-1:word.istitle()'] = prev_token.istitle()
        feats['-1:word.isupper()'] = prev_token.isupper()
        feats['-1:postag'] = prev_pos
        feats['-1:postag[:2]'] = prev_pos[:2]

    # Right context: next token, or an end-of-sequence marker.
    if i >= length - 1:
        feats['EOS'] = True
    else:
        next_token = sent[i + 1][0]
        next_pos = sent[i + 1][1]
        feats['+1:word.lower()'] = next_token.lower()
        feats['+1:word.istitle()'] = next_token.istitle()
        feats['+1:word.isupper()'] = next_token.isupper()
        feats['+1:postag'] = next_pos
        feats['+1:postag[:2]'] = next_pos[:2]

    return feats

def sent2features(sent):
    """Return one ``word2features`` dict per position of ``sent``, in order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2postag(sent):
    """Return the POS tags (middle element) of the ``(token, postag, label)`` triples."""
    tags = []
    for _token, tag, _label in sent:
        tags.append(tag)
    return tags

def sent2labels(sent):
    """Return the labels (last element) of the ``(token, postag, label)`` triples."""
    labels = []
    for _token, _tag, gold in sent:
        labels.append(gold)
    return labels

def sent2tokens(sent):
    """Return the tokens (first element) of the ``(token, postag, label)`` triples."""
    words = []
    for word, _tag, _label in sent:
        words.append(word)
    return words

In [11]:
# Inspect the feature dict produced for the first token of the first training sequence.
sent2features(postagged_train_dataset[0])[0]

{'word': 'EU',
 'word.lower()': 'eu',
 'word[-3:]': 'EU',
 'word[-2:]': 'EU',
 'word[:2]': 'EU',
 'word[:3]': 'EU',
 'word.istitle()': False,
 'word.isdigit()': False,
 'word.isupper()': True,
 'postag': 'NNP',
 'postag[:2]': 'NN',
 'pos_front': 0,
 'pos_end': 469,
 'BOS': True,
 '+1:word.lower()': 'rejects',
 '+1:word.istitle()': False,
 '+1:word.isupper()': False,
 '+1:postag': 'VBZ',
 '+1:postag[:2]': 'VB'}

# Feature Extraction

In [12]:
# Turn each POS-tagged split into per-token feature dicts (X_*) and the
# matching label sequences (y_*).
X_train = list(map(sent2features, postagged_train_dataset))
y_train = list(map(sent2labels, postagged_train_dataset))

X_val = list(map(sent2features, postagged_validation_dataset))
y_val = list(map(sent2labels, postagged_validation_dataset))

X_test = list(map(sent2features, postagged_test_dataset))
y_test = list(map(sent2labels, postagged_test_dataset))

# Features and labels are extracted; release the tagged splits.
del postagged_train_dataset, postagged_validation_dataset, postagged_test_dataset

In [13]:
# Feature dict of the first token of the first training sequence.
X_train[0][0]

{'word': 'EU',
 'word.lower()': 'eu',
 'word[-3:]': 'EU',
 'word[-2:]': 'EU',
 'word[:2]': 'EU',
 'word[:3]': 'EU',
 'word.istitle()': False,
 'word.isdigit()': False,
 'word.isupper()': True,
 'postag': 'NNP',
 'postag[:2]': 'NN',
 'pos_front': 0,
 'pos_end': 469,
 'BOS': True,
 '+1:word.lower()': 'rejects',
 '+1:word.istitle()': False,
 '+1:word.isupper()': False,
 '+1:postag': 'VBZ',
 '+1:postag[:2]': 'VB'}

# Load Word Embeddings and Add Them to the Features

In [14]:
def load_word_embedding_model(filename):
    """Load word vectors from a UTF-8 text file into a dict.

    The first line of the file is skipped (presumably a word2vec-style
    "<vocabulary size> <dimension>" header -- confirm for other files).
    Every other non-blank line is split on single spaces: the first field is
    the word and the remaining fields are parsed as floats.

    Args:
        filename: Path of the embedding text file.

    Returns:
        A dict mapping each word to its vector as a list of floats. An empty
        or header-only file yields an empty dict.

    Raises:
        ValueError: If a vector component cannot be parsed as a float.
    """
    embedding_dict = {}
    # Stream the file inside a context manager: the handle is always closed
    # and the whole file is never held in memory as a list of lines.
    with open(filename, 'r', encoding='utf-8') as f:
        next(f, None)  # skip the header line; tolerate an empty file
        for line in f:
            line = line.strip()
            if not line:
                # A blank line would otherwise register '' with an empty vector.
                continue
            arr = line.split(' ')
            embedding_dict[arr[0]] = [float(x) for x in arr[1:]]
    return embedding_dict

def add_word_embedding2feature(embedding_dict, feature, vector_size=None):
    """Add word-embedding components to every token feature dict, in place.

    For each token dict, the vector stored under ``token['word']`` in
    ``embedding_dict`` is written into the dict as ``'w1'``, ``'w2'``, ...
    Words missing from ``embedding_dict`` receive an all-zero vector.

    Args:
        embedding_dict: Mapping from word to a list of numbers.
        feature: List of sequences, each a list of token feature dicts that
            contain a ``'word'`` key. The dicts are modified in place.
        vector_size: Length of the zero vector used for unknown words. When
            omitted it is taken from the first vector in ``embedding_dict``
            so unknown and known words get the same number of components;
            if ``embedding_dict`` is empty it falls back to 50.

    Returns:
        The same ``feature`` object that was passed in.
    """
    DEFAULT_VECTOR_SPACE_SIZE = 50
    if vector_size is None:
        first_vector = next(iter(embedding_dict.values()), None)
        if first_vector is None:
            vector_size = DEFAULT_VECTOR_SPACE_SIZE
        else:
            vector_size = len(first_vector)

    # Built once and only read afterwards, so sharing it across tokens is safe.
    zero_vector = [0] * vector_size

    for sentence in feature:
        for token in sentence:
            vector = embedding_dict.get(token['word'], zero_vector)
            # Component keys are 1-based: w1, w2, ...
            for k, value in enumerate(vector, start=1):
                token['w{}'.format(k)] = value
    return feature

from copy import deepcopy

In [15]:
# CBOW variant: add the CBOW vectors to a private copy of the training
# features, then release the embedding table.
cbow_embedding_dict = load_word_embedding_model('models/word_embedding/cbow_model_2.txt')
X_cbow = add_word_embedding2feature(cbow_embedding_dict, deepcopy(X_train))
del cbow_embedding_dict

In [16]:
# SG variant: add the vectors from sg_model_2.txt to a private copy of the
# training features, then release the embedding table.
sg_embedding_dict = load_word_embedding_model('models/word_embedding/sg_model_2.txt')
X_sg = add_word_embedding2feature(sg_embedding_dict, deepcopy(X_train))
del sg_embedding_dict

In [17]:
# fastText variant: add the fastText vectors to a private copy of the
# training features, then release the embedding table.
fasttext_embedding_dict = load_word_embedding_model('models/word_embedding/fasttext_model_2.txt')
X_fasttext = add_word_embedding2feature(fasttext_embedding_dict, deepcopy(X_train))
del fasttext_embedding_dict

In [18]:
# The embedding-augmented copies (X_cbow, X_sg, X_fasttext) replace the plain
# training features from here on, so the original list can be released.
del X_train

In [19]:
# First token's features after adding the CBOW vector components (w1, w2, ...).
X_cbow[0][0]

{'word': 'EU',
 'word.lower()': 'eu',
 'word[-3:]': 'EU',
 'word[-2:]': 'EU',
 'word[:2]': 'EU',
 'word[:3]': 'EU',
 'word.istitle()': False,
 'word.isdigit()': False,
 'word.isupper()': True,
 'postag': 'NNP',
 'postag[:2]': 'NN',
 'pos_front': 0,
 'pos_end': 469,
 'BOS': True,
 '+1:word.lower()': 'rejects',
 '+1:word.istitle()': False,
 '+1:word.isupper()': False,
 '+1:postag': 'VBZ',
 '+1:postag[:2]': 'VB',
 'w1': -0.10382087,
 'w2': 0.06820139,
 'w3': 0.09168298,
 'w4': -0.18774457,
 'w5': -1.17892,
 'w6': -0.19639602,
 'w7': 0.11287989,
 'w8': 0.4286515,
 'w9': 0.13554206,
 'w10': -0.16998494,
 'w11': -0.6624491,
 'w12': -0.2932335,
 'w13': -0.19169496,
 'w14': -0.217495,
 'w15': -0.44583368,
 'w16': 0.97934705,
 'w17': 0.33488846,
 'w18': -0.5221745,
 'w19': -0.90460926,
 'w20': -0.9685539,
 'w21': 1.2006509,
 'w22': -0.001496821,
 'w23': -0.37980583,
 'w24': -1.235236,
 'w25': -0.42509484,
 'w26': 0.2744287,
 'w27': 1.0390763,
 'w28': 0.46746588,
 'w29': 0.1513747,
 'w30': 1.079

In [20]:
# First token's features after adding the vector components from sg_model_2.txt (w1, w2, ...).
X_sg[0][0]

{'word': 'EU',
 'word.lower()': 'eu',
 'word[-3:]': 'EU',
 'word[-2:]': 'EU',
 'word[:2]': 'EU',
 'word[:3]': 'EU',
 'word.istitle()': False,
 'word.isdigit()': False,
 'word.isupper()': True,
 'postag': 'NNP',
 'postag[:2]': 'NN',
 'pos_front': 0,
 'pos_end': 469,
 'BOS': True,
 '+1:word.lower()': 'rejects',
 '+1:word.istitle()': False,
 '+1:word.isupper()': False,
 '+1:postag': 'VBZ',
 '+1:postag[:2]': 'VB',
 'w1': -0.12734045,
 'w2': -0.17149775,
 'w3': -0.14360633,
 'w4': -0.019517161,
 'w5': -0.28408036,
 'w6': 0.061121225,
 'w7': 0.17245711,
 'w8': -0.43180206,
 'w9': 0.015727954,
 'w10': 0.22075531,
 'w11': -0.25180316,
 'w12': 0.08385932,
 'w13': 0.09272004,
 'w14': 0.0015021019,
 'w15': 0.012492577,
 'w16': 1.1178739,
 'w17': -0.3847049,
 'w18': -0.1323174,
 'w19': -0.22730935,
 'w20': -0.5352181,
 'w21': 1.4322983,
 'w22': 0.43445793,
 'w23': -0.20723875,
 'w24': -0.6101142,
 'w25': -0.3060098,
 'w26': -0.4612437,
 'w27': 1.0669663,
 'w28': 0.09718897,
 'w29': 0.1397241,
 'w3

In [21]:
# First token's features after adding the fastText vector components (w1, w2, ...).
X_fasttext[0][0]

{'word': 'EU',
 'word.lower()': 'eu',
 'word[-3:]': 'EU',
 'word[-2:]': 'EU',
 'word[:2]': 'EU',
 'word[:3]': 'EU',
 'word.istitle()': False,
 'word.isdigit()': False,
 'word.isupper()': True,
 'postag': 'NNP',
 'postag[:2]': 'NN',
 'pos_front': 0,
 'pos_end': 469,
 'BOS': True,
 '+1:word.lower()': 'rejects',
 '+1:word.istitle()': False,
 '+1:word.isupper()': False,
 '+1:postag': 'VBZ',
 '+1:postag[:2]': 'VB',
 'w1': 0.10213207,
 'w2': 0.027069703,
 'w3': 0.07607197,
 'w4': 0.019441742,
 'w5': 0.05452568,
 'w6': -0.054628372,
 'w7': -0.00013970374,
 'w8': -0.19305748,
 'w9': 0.044209644,
 'w10': 0.045331083,
 'w11': -0.004818771,
 'w12': 0.09296371,
 'w13': 0.05386617,
 'w14': 0.018197853,
 'w15': -0.0029894128,
 'w16': 0.03327747,
 'w17': -0.064056106,
 'w18': -0.0983447,
 'w19': -0.031191973,
 'w20': 0.00041868974,
 'w21': -0.115362,
 'w22': 0.0030208372,
 'w23': -0.039958715,
 'w24': -0.0072136535,
 'w25': -0.03579684,
 'w26': 0.00024644737,
 'w27': -0.023796747,
 'w28': -0.13226575

# Train Classifiers Using the Best Hyperparameters

In [22]:
# Regularisation coefficients handed to sklearn_crfsuite.CRF in the training
# cells below.
# NOTE(review): presumably the best values found by an earlier hyper-parameter
# search that is not part of this notebook -- confirm their origin.
c1_ = 0.001262621084804322
# NOTE(review): presumably the L2 counterpart of c1_; verify that the training
# cells actually pass it as the `c2` argument.
c2_ = 0.07748342053200617

In [23]:
%%time
# Train the CRF on the CBOW-augmented features.
# Fix: `c2` previously received c1_ (copy-paste slip), so the L2 coefficient
# c2_ was never used; each penalty now gets its own coefficient.
crf_cbow = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=c1_,  # L1 regularisation coefficient
    c2=c2_,  # L2 regularisation coefficient
    max_iterations=100,
    all_possible_transitions=True
)
crf_cbow.fit(X_cbow, y_train)

CPU times: user 59.3 s, sys: 1.28 s, total: 1min
Wall time: 1min


In [24]:
%%time
# Train the CRF on the SG-augmented features.
# Fix: `c2` previously received c1_ (copy-paste slip), so the L2 coefficient
# c2_ was never used; each penalty now gets its own coefficient.
crf_sg = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=c1_,  # L1 regularisation coefficient
    c2=c2_,  # L2 regularisation coefficient
    max_iterations=100,
    all_possible_transitions=True
)
crf_sg.fit(X_sg, y_train)

CPU times: user 1min 2s, sys: 586 ms, total: 1min 2s
Wall time: 1min 2s


In [25]:
%%time
# Train the CRF on the fastText-augmented features.
# Fix: `c2` previously received c1_ (copy-paste slip), so the L2 coefficient
# c2_ was never used; each penalty now gets its own coefficient.
crf_fasttext = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=c1_,  # L1 regularisation coefficient
    c2=c2_,  # L2 regularisation coefficient
    max_iterations=100,
    all_possible_transitions=True
)
crf_fasttext.fit(X_fasttext, y_train)

CPU times: user 59.7 s, sys: 336 ms, total: 60 s
Wall time: 60 s


# Save Model

In [26]:
import pickle

In [27]:
# Persist each trained CRF with pickle. The files are opened in context
# managers so every handle is flushed and closed as soon as its dump finishes
# (previously the handles were left open).
# NOTE: only unpickle these files from trusted locations; pickle.load can
# execute arbitrary code.
filename_cbow = 'temp_crf_cbow_2.sav'
with open(filename_cbow, 'wb') as model_file:
    pickle.dump(crf_cbow, model_file)

filename_sg = 'temp_crf_sg_2.sav'
with open(filename_sg, 'wb') as model_file:
    pickle.dump(crf_sg, model_file)

filename_fasttext = 'temp_crf_fasttext_2.sav'
with open(filename_fasttext, 'wb') as model_file:
    pickle.dump(crf_fasttext, model_file)