In [1]:
import numpy as np
import sklearn_crfsuite
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [113]:
import warnings
warnings.filterwarnings('ignore')

## Data

Please see *generate-dataset.ipynb* to get information how dataset is generated. 

### Load dataset

In [2]:
import json

In [3]:
def load_json(file):
    with open(file) as json_file:
        data = json.load(json_file)
    
    return data

In [4]:
train_dataset_file = "data/brown-dataset-train.json"
val_dataset_file = "data/brown-dataset-val.json"
test_dataset_file = "data/brown-dataset-test.json"

In [5]:
train_dataset = load_json(train_dataset_file)
test_dataset = load_json(test_dataset_file)
val_dataset = load_json(val_dataset_file)

In [6]:
len(train_dataset)

15805

In [7]:
len(test_dataset)

6863

In [8]:
def split_dataset_into_X_and_y(dataset):
    X = [np.array(sent)[:, 0].tolist() for sent in dataset]
    y = [np.array(sent)[:, 1].tolist() for sent in dataset]

    return X, y

In [9]:
X_train, y_train = split_dataset_into_X_and_y(train_dataset)
X_val, y_val = split_dataset_into_X_and_y(val_dataset)
X_test, y_test = split_dataset_into_X_and_y(test_dataset)

### N-Gram

In [10]:
def read_file(file):
    result = []
    with open(file) as f:
        lines = f.readlines()
        for line in lines:
            l = line.strip()
            if len(l) > 0:
                result.append(l)
    return result

In [11]:
def read_n_gram_file(ngram_file):
    result = dict()
    lines = read_file(ngram_file)
    for line in lines:
        line_split = line.split('\t')
        key = ' '.join(line_split[1:])
        result[key] = line_split[0]
        
    return result

In [12]:
def read_pos_ngram_file(pos_ngram_file):
    ngram_dict = dict()
    
    ngram_str_dict = load_json(pos_ngram_file)
    for ngram_str in ngram_str_dict:
        ngram = tuple(ngram_str.split('_'))
        ngram_dict[ngram] = ngram_str_dict[ngram_str]
    
    return ngram_dict

Here, we use ngrams downloaded from here: https://www.ngrams.info/download_coca.asp

In [13]:
bi_gram_dict = read_n_gram_file("data/w2.txt")
three_gram_dict = read_n_gram_file("data/w3.txt")
four_gram_dict = read_n_gram_file("data/w4.txt")
five_gram_dict = read_n_gram_file("data/w5.txt")

In [14]:
len(bi_gram_dict)

1055386

In [15]:
ngrams_dict_list = [bi_gram_dict, three_gram_dict, four_gram_dict, five_gram_dict]

In [16]:
import itertools
dict(itertools.islice(three_gram_dict.items(), 10))

{'a B.A. degree': '48',
 'a B.A. in': '56',
 'a B.S. in': '41',
 'a BA in': '33',
 'a babble of': '28',
 'a babe in': '31',
 'a baby and': '308',
 'a baby at': '72',
 'a baby before': '29',
 'a baby bird': '50'}

#### pos ngram

In [17]:
pos_bigram_dict = read_pos_ngram_file('data/pos-bigrams.json')

In [18]:
dict(itertools.islice(pos_bigram_dict.items(), 10))

{('RB', 'NN'): 0.3909020817270625,
 ('NN', 'VBD'): 4.067077872012336,
 ('VBD', 'RB'): 2.953739398612182,
 ('RB', 'JJ'): 3.1912104857363146,
 ('DT', 'NNP'): 5.394757131842714,
 ('NNP', 'NNP'): 8.609868928296068,
 ('NNP', ','): 4.376252891287587,
 (',', 'WDT'): 0.4225134926754048,
 ('WDT', 'VBZ'): 0.6260601387818041,
 ('VBZ', 'NN'): 0.3454124903623747}

In [19]:
pos_thregram_dict = read_pos_ngram_file('data/pos-threegrams.json')

In [20]:
dict(itertools.islice(pos_thregram_dict.items(), 10))

{('RB', 'NN', 'VBD'): 0.001166216547999018,
 ('NN', 'VBD', 'RB'): 0.04615762337343481,
 ('VBD', 'RB', 'JJ'): 0.033881659710287254,
 ('DT', 'NNP', 'NNP'): 0.13914804812177756,
 ('NNP', 'NNP', ','): 0.13037073410262706,
 ('NNP', ',', 'WDT'): 0.006506260741468205,
 (',', 'WDT', 'VBZ'): 0.010434569113675423,
 ('WDT', 'VBZ', 'NN'): 0.0015958752762091823,
 ('VBZ', 'NN', ','): 0.002332433095998036,
 ('NN', ',', 'VBZ'): 0.01853670513135281}

## Classification

### Metric

The main metric for this task is F1-score.

In [21]:
class Metric:
    
    @staticmethod
    def calc_classification_report(y_true, y_pred):
        y_true_flatten, y_pred_flatten = Metric.__make_y_flatten(y_true, y_pred)
        return classification_report(y_true_flatten, y_pred_flatten)
    
    @staticmethod
    def calc_f1_score_macro_for_class(y_true, y_pred, class_name):
        y_true_flatten, y_pred_flatten = Metric.__make_y_flatten(y_true, y_pred)

        f1_score = metrics.f1_score(y_true_flatten, y_pred_flatten, labels=[class_name], average='macro')
        return f1_score
        
        
    
    @staticmethod
    def __make_y_flatten(y_true, y_pred):
        assert len(y_true) == len(y_pred)

        y_pred_flatten, y_true_flatten = [], []
        for i in range(0, len(y_true)):
            y_true_flatten += y_true[i]
            y_pred_flatten += y_pred[i]
        
        assert len(y_true_flatten) == len(y_pred_flatten)
        
        return y_true_flatten, y_pred_flatten

## Baseline classifier

In [139]:
class CRFClassifier:
    def __init__(self):
        self._crf = sklearn_crfsuite.CRF(
            algorithm='lbfgs',
            c1=0.41083349574889616,
            c2=0.06273667997796518,
            max_iterations=1000,
            all_possible_transitions=False)        
        
        
    def fit(self, X, y):
        x_train = self.sentences2feature_matrix(X)
        self._crf.fit(x_train, y)
        
    
    def predict(self, X):
        x_test = self.sentences2feature_matrix(X)
        return self._crf.predict(x_test)

        
        
    def sentences2feature_matrix(self, X_sentences):
        return [self.sent2features(s) for s in X_sentences]
        
    def sent2features(self, sent):
        return [self.word2features(sent, i) for i in range(len(sent))]
    
    def word2features(self, sent, pos):
        word = sent[pos]

        features = {
            'bias': 1.0,
            'word.lower()': word.lower(),
            'word.isupper()': word.isupper(),
            'word.istitle()': word.istitle(),
            'word.isdigit()': word.isdigit(),
        }
        return features
    
    @property
    def crf(self):
        return self._crf

In [140]:
crf_classifier = CRFClassifier()

In [141]:
crf_classifier.fit(X_train, y_train)

In [142]:
y_pred = crf_classifier.predict(X_test)

In [143]:
report = Metric.calc_classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

       False       0.97      1.00      0.98    302252
        True       0.28      0.00      0.00      9678

    accuracy                           0.97    311930
   macro avg       0.63      0.50      0.49    311930
weighted avg       0.95      0.97      0.95    311930



## Improve baseline classifier

In [144]:
class RunOnSentenceCRFClassifier(CRFClassifier):
    def __init__(self):
        super(RunOnSentenceCRFClassifier, self).__init__()
        
    def word2features(self, sent, pos):
        word = sent[pos]

        features = {
            'bias': 1.0,
            'word.lower()': word.lower(),
            'word.isupper()': word.isupper(),
            'word.istitle()': word.istitle(),
            'word.isdigit()': word.isdigit(),
        }
        if pos > 0:
            pass
            word1 = sent[pos-1]
            features.update({
                '-1:word.lower()': word1.lower(),
                '-1:word.istitle()': word1.istitle(),
                '-1:word.isupper()': word1.isupper(),
            })
        else:
            features['BOS'] = True

        if pos < len(sent)-1:
            word1 = sent[pos+1]
            features.update({
                '+1:word.lower()': word1.lower(),
                '+1:word.istitle()': word1.istitle(),
                '+1:word.isupper()': word1.isupper(),
            })
        else:
            features['EOS'] = True
            
        return features
 

In [145]:
run_on_sent_crf_classifier = RunOnSentenceCRFClassifier()

In [146]:
run_on_sent_crf_classifier.fit(X_train, y_train)

In [147]:
y_pred = run_on_sent_crf_classifier.predict(X_test)

In [148]:
report = Metric.calc_classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

       False       0.98      1.00      0.99    302252
        True       0.78      0.48      0.60      9678

    accuracy                           0.98    311930
   macro avg       0.88      0.74      0.79    311930
weighted avg       0.98      0.98      0.98    311930



As we can see, adding features for next and previous words improved F1-score for True class significantly.es for next and previous words improved F1-socre for True class significantlly. 

#### Add n-gram occurrences as features

In [149]:
class NGramRunOnSentenceCRFClassifier(RunOnSentenceCRFClassifier):
    def __init__(self, ngrams_dict_list):
        super(NGramRunOnSentenceCRFClassifier, self).__init__()
        self.ngrams_dict_list = ngrams_dict_list
        
        
    def sent2features(self, sent):
        result = []
        for i in range(0, len(sent)):
            features = self.word2features(sent, i)
            self.__update_features_with_nagram(features, sent, i)
            if i > 0:
                self.__update_features_with_nagram(features, sent, i - 1, suffix="-1")
            
            if i < len(sent) - 1:
                self.__update_features_with_nagram(features, sent, i + 1, suffix="+1")
                
            result.append(features)
            
        return result
                
    def __update_features_with_nagram(self, features, sent, pos, suffix=""):
        i = pos
        while i < len(sent) - 1 and i - pos < len(self.ngrams_dict_list):
            ngram = ' '.join(sent[pos: i - pos + 1])


            n = i - pos + 2
            if ngram in self.ngrams_dict_list[i - pos]:
#                 print(ngram, self.ngrams_dict_list[i - pos][ngram])
                features[f"{n}-gram-{suffix}"] = int(self.ngrams_dict_list[i - pos][ngram])
            else:
                features[f"{n}-gram"] = 0

            i += 1

In [150]:
ngram_crf_classifier = NGramRunOnSentenceCRFClassifier(ngrams_dict_list)

In [151]:
ngram_crf_classifier.fit(X_train, y_train)

In [152]:
y_pred = ngram_crf_classifier.predict(X_test)

In [153]:
report = Metric.calc_classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

       False       0.98      1.00      0.99    302252
        True       0.78      0.48      0.60      9678

    accuracy                           0.98    311930
   macro avg       0.88      0.74      0.79    311930
weighted avg       0.98      0.98      0.98    311930



Addinfg ngram occurences as a features did not  improved F1-score compared with previous example.

In [154]:
import eli5

In [157]:
eli5.show_weights(ngram_crf_classifier.crf)

From \ To,False,True
False,1.205,-0.15
True,-0.034,-3.274

Weight?,Feature
Weight?,Feature
+9.320,"word.lower():,"
+7.524,word.lower()::
+7.008,word.lower():as
+6.844,word.lower():(
+6.093,word.lower():when
+6.011,+1:word.lower():.
+5.425,word.lower():or
+5.405,word.lower():if
+5.330,word.lower():than
+4.925,word.lower():because

Weight?,Feature
+9.320,"word.lower():,"
+7.524,word.lower()::
+7.008,word.lower():as
+6.844,word.lower():(
+6.093,word.lower():when
+6.011,+1:word.lower():.
+5.425,word.lower():or
+5.405,word.lower():if
+5.330,word.lower():than
+4.925,word.lower():because

Weight?,Feature
+5.782,+1:word.lower():proprietorships
+5.223,+1:word.lower():venn
+5.080,word.lower():bong
+4.763,word.lower():wash.
+4.695,+1:word.lower():shotgun-type
+4.642,-1:word.lower():whisked
+4.639,+1:word.lower():robby's
+4.604,+1:word.lower():leninism-marxism
+4.576,+1:word.lower():libertines
+4.546,word.lower():'?'


From the features importance table represented above, we can notice that ngrams features has no siFrom the features importance table represented above, we can notice that ngrams features has no significance importance. Probably, the corpus of ngrams is not matched so good with training corpus.gnificancy importancy. Probably, the corpus of ngrams is not matched so good with training corpus.

It does not make sense to consider such kind of n-gram features.

#### Add part of speech as features

In [77]:
import nltk

In [158]:
class POSRunOnSentenceCRFClassifier(RunOnSentenceCRFClassifier):
    def __init__(self):
        super(POSRunOnSentenceCRFClassifier, self).__init__()
        
    def sent2features(self, sent):
        result = []
        pos_tags = nltk.pos_tag(sent)
        assert len(pos_tags) == len(sent)
        for i in range(0, len(sent)):
            features = self.word2features(sent, i)
            features['pos-tag'] = pos_tags[i][1]
            result.append(features)
            
        return result

In [160]:
pos_tag_crf_classifier = POSRunOnSentenceCRFClassifier()

In [161]:
pos_tag_crf_classifier.fit(X_train, y_train)

In [162]:
y_pred = pos_tag_crf_classifier.predict(X_test)

In [163]:
report = Metric.calc_classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

       False       0.98      1.00      0.99    302252
        True       0.77      0.50      0.60      9678

    accuracy                           0.98    311930
   macro avg       0.88      0.75      0.80    311930
weighted avg       0.98      0.98      0.98    311930



Adding pos tag as features improved recall a little bit. 

#### Add part of speech ngram probabilities as features

In [175]:
class POSNGramRunOnSentenceCRFClassifier(RunOnSentenceCRFClassifier):
    def __init__(self, pos_bigram_dict, pos_threegram_dict):
        super(POSNGramRunOnSentenceCRFClassifier, self).__init__()
        self.pos_bigram_dict = pos_bigram_dict
        self.pos_threegram_dict = pos_threegram_dict
        
    def sent2features(self, sent):
        result = []
        pos_tags = nltk.pos_tag(sent)
        assert len(pos_tags) == len(sent)
        sent_size = len(sent)
        result = []
        for i in range(0, sent_size):
            features = self.word2features(sent, i)
            features['pos-tag'] = pos_tags[i][1]
            
            if i < sent_size - 2:
                pos_threegram = (pos_tags[i][1], pos_tags[i+1][1], pos_tags[i+2][1])
                
                if pos_threegram in self.pos_threegram_dict:
                    features['pos-threegram-prob'] = self.pos_threegram_dict[pos_threegram]
                else:
                    features['pos-threegram-prob'] = 0
                    
            if i < sent_size - 1:
                pos_bigram = (pos_tags[i][1], pos_tags[i+1][1])
                
                if pos_bigram in self.pos_bigram_dict:
                    features['pos-bigram-prob'] = self.pos_bigram_dict[pos_bigram]
                else:
                    features['pos-bigram-prob'] = 0
            
            
            result.append(features)
            
            
            
            
        return result

In [176]:
pos_ngram_crf_classifier = POSNGramRunOnSentenceCRFClassifier(pos_bigram_dict, pos_thregram_dict)

In [177]:
pos_ngram_crf_classifier.fit(X_train, y_train)

In [178]:
y_pred = pos_ngram_crf_classifier.predict(X_test)

In [179]:
report = Metric.calc_classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

       False       0.98      1.00      0.99    302252
        True       0.76      0.50      0.60      9678

    accuracy                           0.98    311930
   macro avg       0.87      0.75      0.80    311930
weighted avg       0.98      0.98      0.98    311930



In [185]:
eli5.show_weights(pos_ngram_crf_classifier.crf)

From \ To,False,True
False,1.232,-0.5
True,-0.296,-3.934

Weight?,Feature
Weight?,Feature
+6.037,+1:word.lower():.
+5.699,word.lower():as
+4.787,EOS
+4.701,word.lower():mr.
+4.680,word.lower():an
+4.647,"pos-tag:,"
+4.647,"word.lower():,"
+4.329,word.lower():or
+4.229,word.lower():if
+4.055,-1:word.lower():sepulchred

Weight?,Feature
+6.037,+1:word.lower():.
+5.699,word.lower():as
+4.787,EOS
+4.701,word.lower():mr.
+4.680,word.lower():an
+4.647,"pos-tag:,"
+4.647,"word.lower():,"
+4.329,word.lower():or
+4.229,word.lower():if
+4.055,-1:word.lower():sepulchred

Weight?,Feature
+6.165,word.lower():bong
+5.746,+1:word.lower():proprietorships
+4.805,+1:word.lower():venn
+4.695,+1:word.lower():shotgun-type
+4.631,-1:word.lower():whisked
+4.626,+1:word.lower():robby's
+4.615,+1:word.lower():leninism-marxism
+4.531,word.lower():sepulchred
+4.522,+1:word.lower():libertines
+4.501,word.lower():wash.


From the table above, we notice that recently added features (ngram probabilities) do not have significance importancy.

####  Hyperparameter Optimization


In [88]:
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV
import scipy

In [89]:
def score_func(y, y_pred):
    return Metric.calc_f1_score_macro_for_class(y, y_pred, class_name='True')

In [90]:
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = metrics.make_scorer(score_func)



In [91]:
X_features_val = pos_ngram_crf_classifier.sentences2feature_matrix(X_val)

In [92]:
%%time
# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=5,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_features_val, y_val)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 188 tasks      | elapsed: 27.1min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 35.9min finished


CPU times: user 35min 25s, sys: 24.8 s, total: 35min 50s
Wall time: 36min 4s


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=CRF(algorithm='lbfgs', all_possible_states=None,
                                 all_possible_transitions=True, averaging=None,
                                 c=None, c1=None, c2=None,
                                 calibration_candidates=None,
                                 calibration_eta=None,
                                 calibration_max_trials=None,
                                 calibration_rate=None,
                                 calibration_samples=None, delta=None,
                                 epsilon=None, error_sensitive=None, gamma=None,
                                 keep_...
                                 trainer_cls=None, variance=None,
                                 verbose=False),
                   iid='deprecated', n_iter=50, n_jobs=-1,
                   param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fdb8e0b7210>,
                    

In [93]:
# crf = rs.best_estimator_
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

best params: {'c1': 0.41083349574889616, 'c2': 0.06273667997796518}
best CV score: 0.5256407617462806
model size: 0.51M


### Test the best result on available test corpus

In [180]:
test_file = "../../../tasks/06-language-as-sequence/run-on-test.json"

In [181]:
test_data = load_json(test_file)

In [182]:
X_test_etalon, y_test_etalon = split_dataset_into_X_and_y(test_data)

In [183]:
y_pred_etalon = pos_ngram_crf_classifier.predict(X_test_etalon)

In [184]:
report_etalon = Metric.calc_classification_report(y_test_etalon, y_pred_etalon)
print(report_etalon)

              precision    recall  f1-score   support

       False       0.98      0.99      0.99      4542
        True       0.78      0.52      0.63       155

    accuracy                           0.98      4697
   macro avg       0.88      0.76      0.81      4697
weighted avg       0.98      0.98      0.98      4697



The F1-score for test corpus is 0.63