In [1]:
import numpy as np
import sklearn_crfsuite
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


## Data

Please see *generate-dataset.ipynb* to get information how dataset is generated. 

### Load dataset

In [2]:
import json

In [3]:
def load_json(file):
    with open(file) as json_file:
        data = json.load(json_file)
    
    return data

In [4]:
dataset_file = "data/dataset.json"

In [5]:
dataset = load_json(dataset_file)

In [6]:
len(dataset)

11683

In [7]:
dataset[0]

[['Proposal', False],
 ['for', False],
 ['a', False],
 ['Red', False],
 ['Harvester', False],
 ['Ant', False],
 ['Robot', False],
 [':', False],
 ['Colony', False],
 ['Infiltration', False],
 ['and', False],
 ['Task', False],
 ['Switching', False]]

In [8]:
def split_dataset_into_X_and_y(dataset):
    X = [np.array(sent)[:, 0].tolist() for sent in dataset]
    y = [np.array(sent)[:, 1].tolist() for sent in dataset]

    return X, y

In [158]:
X, y = split_dataset_into_X_and_y(dataset)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### N-Gram

In [11]:
def read_file(file):
    result = []
    with open(file) as f:
        lines = f.readlines()
        for line in lines:
            l = line.strip()
            if len(l) > 0:
                result.append(l)
    return result

In [12]:
def read_n_gram_file(ngram_file):
    result = dict()
    lines = read_file(ngram_file)
    for line in lines:
        line_split = line.split('\t')
        key = ' '.join(line_split[1:])
        result[key] = line_split[0]
        
    return result

Here, we use ngrams downloaded from here: https://www.ngrams.info/download_coca.asp

In [13]:
bi_gram_dict = read_n_gram_file("data/w2.txt")
three_gram_dict = read_n_gram_file("data/w3.txt")
four_gram_dict = read_n_gram_file("data/w4.txt")
five_gram_dict = read_n_gram_file("data/w5.txt")

In [14]:
len(bi_gram_dict)

1055386

In [15]:
ngrams_dict_list = [bi_gram_dict, three_gram_dict, four_gram_dict, five_gram_dict]

In [16]:
import itertools
dict(itertools.islice(three_gram_dict.items(), 10))

{'a B.A. degree': '48',
 'a B.A. in': '56',
 'a B.S. in': '41',
 'a BA in': '33',
 'a babble of': '28',
 'a babe in': '31',
 'a baby and': '308',
 'a baby at': '72',
 'a baby before': '29',
 'a baby bird': '50'}

## Classification

### Metric

The main metric for this task is F1-score.

In [38]:
class Metric:
    
    @staticmethod
    def calc_classification_report(y_true, y_pred):
        assert len(y_true) == len(y_pred)

        y_pred_flatten, y_true_flatten = [], []
        for i in range(0, len(y_true)):
            y_true_flatten += y_true[i]
            y_pred_flatten += y_pred[i]
        
        assert len(y_true_flatten) == len(y_pred_flatten)
        
        return classification_report(y_true_flatten, y_pred_flatten)
    

## Baseline classifier

In [171]:
class CRFClassifier:
    def __init__(self):
        self._crf = sklearn_crfsuite.CRF(
            algorithm='lbfgs',
            c1=0.1,
            c2=0.1,
            max_iterations=1000,
            all_possible_transitions=False)        
        
        
    def fit(self, X, y):
        x_train = self.sentences2feature_matrix(X)
        self._crf.fit(x_train, y)
        
    
    def predict(self, X):
        x_test = self.sentences2feature_matrix(X)
        return self._crf.predict(x_test)

        
        
    def sentences2feature_matrix(self, X_sentences):
        return [self.sent2features(s) for s in X_sentences]
        
    def sent2features(self, sent):
        return [self.word2features(sent, i) for i in range(len(sent))]
    
    def word2features(self, sent, pos):
        word = sent[pos]

        features = {
            'bias': 1.0,
            'word.lower()': word.lower(),
            'word.isupper()': word.isupper(),
            'word.istitle()': word.istitle(),
            'word.isdigit()': word.isdigit(),
        }
        return features

In [172]:
crf_classifier = CRFClassifier()

In [173]:
crf_classifier.fit(X_train, y_train)

In [174]:
y_pred = crf_classifier.predict(X_test)

In [175]:
report = calc_classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

       False       0.97      1.00      0.99     32994
        True       0.42      0.04      0.07       919

    accuracy                           0.97     33913
   macro avg       0.70      0.52      0.53     33913
weighted avg       0.96      0.97      0.96     33913



## Improve baseline classifier

In [176]:
class RunOnSentenceCRFClassifier(CRFClassifier):
    def __init__(self):
        super(RunOnSentenceCRFClassifier, self).__init__()
        
    def word2features(self, sent, pos):
        word = sent[pos]

        features = {
            'bias': 1.0,
            'word.lower()': word.lower(),
            'word.isupper()': word.isupper(),
            'word.istitle()': word.istitle(),
            'word.isdigit()': word.isdigit(),
        }
        if pos > 0:
            pass
            word1 = sent[pos-1]
            features.update({
                '-1:word.lower()': word1.lower(),
                '-1:word.istitle()': word1.istitle(),
                '-1:word.isupper()': word1.isupper(),
            })
        else:
            features['BOS'] = True

        if pos < len(sent)-1:
            word1 = sent[pos+1]
            features.update({
                '+1:word.lower()': word1.lower(),
                '+1:word.istitle()': word1.istitle(),
                '+1:word.isupper()': word1.isupper(),
            })
        else:
            features['EOS'] = True
            
        return features
 

In [177]:
run_on_sent_crf_classifier = RunOnSentenceCRFClassifier()

In [178]:
run_on_sent_crf_classifier.fit(X_train, y_train)

In [179]:
y_pred = run_on_sent_crf_classifier.predict(X_test)

In [180]:
report = calc_classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

       False       0.99      1.00      1.00     32994
        True       0.93      0.79      0.85       919

    accuracy                           0.99     33913
   macro avg       0.96      0.89      0.92     33913
weighted avg       0.99      0.99      0.99     33913



As we can see, adding featurAs we can see, adding features for next and previous words improved F1-score for True class significantly.es for next and previous words improved F1-socre for True class significantlly. 

#### Add n-gram occurrences as features

In [205]:
class NGramRunOnSentenceCRFClassifier(RunOnSentenceCRFClassifier):
    def __init__(self, ngrams_dict_list):
        super(NGramRunOnSentenceCRFClassifier, self).__init__()
        self.ngrams_dict_list = ngrams_dict_list
        
        
    def sent2features(self, sent):
        result = []
        for i in range(0, len(sent)):
            features = self.word2features(sent, i)
            self.__update_features_with_nagram(features, sent, i)
            if i > 0:
                self.__update_features_with_nagram(features, sent, i - 1, suffix="-1")
            
            if i < len(sent) - 1:
                self.__update_features_with_nagram(features, sent, i + 1, suffix="+1")
                
            result.append(features)
            
        return result
                
    def __update_features_with_nagram(self, features, sent, pos, suffix=""):
        i = pos
        while i < len(sent) - 1 and i - pos < len(self.ngrams_dict_list):
            ngram = ' '.join(sent[pos: pos + i + 1])


            n = i - pos + 2
            if ngram in self.ngrams_dict_list[i - pos]:
#                 print(ngram, self.ngrams_dict_list[i - pos][ngram])
                features[f"{n}-gram-{suffix}"] = int(self.ngrams_dict_list[i - pos][ngram])
#             else:
#                 features[f"{n}-gram"] = 0

            i += 1

In [206]:
ngram_crf_classifier = NGramRunOnSentenceCRFClassifier(ngrams_dict_list)

In [207]:
ngram_crf_classifier.fit(X_train, y_train)

In [208]:
y_pred = ngram_crf_classifier.predict(X_test)

In [209]:
report = Metric.calc_classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

       False       0.97      1.00      0.99     32994
        True       0.00      0.00      0.00       919

    accuracy                           0.97     33913
   macro avg       0.49      0.50      0.49     33913
weighted avg       0.95      0.97      0.96     33913



As we can see addinfg ngram occurences as a features did not improved accuracy, which was not expected. Probably there is sentences which has end of word where n-gram exist in this dictionary.

#### Add part of speech as features

In [213]:
import nltk

In [240]:
class POSRunOnSentenceCRFClassifier(RunOnSentenceCRFClassifier):
    def __init__(self):
        super(POSRunOnSentenceCRFClassifier, self).__init__()
        
    def sent2features(self, sent):
        result = []
        pos_tags = nltk.pos_tag(sent)
        assert len(pos_tags) == len(sent)
        for i in range(0, len(sent)):
            features = self.word2features(sent, i)
            features['pos-tag'] = pos_tags[i][1]
            result.append(features)
            
        return result


In [241]:
pos_tag_crf_classifier = POSRunOnSentenceCRFClassifier()

In [242]:
pos_tag_crf_classifier.fit(X_train, y_train)

In [243]:
y_pred = pos_tag_crf_classifier.predict(X_test)

In [244]:
report = Metric.calc_classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

       False       1.00      1.00      1.00     32994
        True       0.92      0.83      0.87       919

    accuracy                           0.99     33913
   macro avg       0.96      0.92      0.94     33913
weighted avg       0.99      0.99      0.99     33913



#### Fine-tune crf 

In [254]:
#TOOD: provide 

### Test the best result on available test corpus

In [245]:
test_file = "../../../tasks/06-language-as-sequence/run-on-test.json"

In [246]:
test_data = load_json(test_file)

In [247]:
X_test_etalon, y_test_etalon = split_dataset_into_X_and_y(test_data)

In [252]:
y_pred_etalon = run_on_sent_crf_classifier.predict(X_test_etalon)

In [253]:
report_etalon = calc_classification_report(y_test_etalon, y_pred_etalon)
print(report_etalon)

              precision    recall  f1-score   support

       False       0.98      1.00      0.99      4542
        True       0.80      0.41      0.54       155

    accuracy                           0.98      4697
   macro avg       0.89      0.70      0.76      4697
weighted avg       0.97      0.98      0.97      4697

