In [None]:
import random
import os
import re
import json
import numpy as np
import spacy
import pandas
from functools import reduce
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

In [None]:
nlp = spacy.load("en_core_web_md")

In [1943]:
def get_all_data(root, results_file_path):
    def sp(text):
        return reduce(lambda acc, el: acc[:-1] + [(acc[-1].strip() if acc else '') + el]
                      if el and re.findall('[\\.\\?\\!]+', el)
                      else acc + [el.strip()], re.split('([\\.\\?\\!]+)(?=\\s)', text),
                      [])

    for root, subdirs, files in os.walk(root):
        res = []
        src_files = [x for x in files if x.endswith('.txt')]
        for file in src_files:
            file_path = os.path.join(root, file)
            with open(file_path) as f:
                content = f.read()

                content = re.sub('\\s{2,}', ' ', content.replace(
                    '\n ', ' ').replace('\n', ' '))
                content = sp(content)
                for sent in content:
                    res.append(sent)
        with open(results_file_path, 'w') as f:
            f.write(json.dumps(res))
    return res


def generate_glued_sents(text):
    tokens = []
    labels = []
    num_sents = random.randint(2, 4)
    text_split = [text[i:i+num_sents] for i in range(0, len(text), num_sents)]

    for spl in text_split:
        sents_spl = spl
        out_of_seq = num_sents
        num_lower = random.randint(0, out_of_seq)
        if num_lower < len(sents_spl):
            sent_to_lower = sents_spl[num_lower]
            s_lower = sent_to_lower[:1].lower() + sent_to_lower[1:]
            sents_spl[num_lower] = s_lower

        for i, sent in enumerate(sents_spl):
            doc = nlp(sent)
                
            eos_syms = ['.', '?', '!', '...']
            for token in doc:
                ln = len(doc)
                if token.i == ln - 2 and token.i + 1 < ln and doc[token.i + 1].text in eos_syms:
                    labels.append(True)
                    tokens.append(token)
                else:
                    if token.text not in eos_syms:
                        labels.append(False)
                        tokens.append(token)

    return tokens, labels


def get_glued_data(train_data, results_file_path):
    tokens, labels = generate_glued_sents(train_data)
    with open(results_file_path, 'w') as f:
        f.write(json.dumps([x.text for x in tokens]))
    return tokens, labels


def get_train_data(extracted_data_file, train_data_file):
    with open(extracted_data_file) as f:
        extracted_data = json.load(f)
    return get_glued_data(extracted_data, train_data_file)


def get_test_data(text):
    sents = [x for sub in text for x in sub]
    test_tokens = [nlp(x[0])[0] for x in sents]
    test_labels = [x[1] for x in sents]

    return test_tokens, test_labels

In [None]:
# DEBUG
def split_unique_words_into_chunks(filename):
    SIZE = 500
    with open(f'{filename}.txt') as f:
        lines = f.readlines()
        ln = len(lines)
        for i in range(1, int(ln/SIZE) + 2):
            end = i*SIZE
            if end > ln:
                start = (i - 1)*SIZE + 1
                chunk = lines[start:]

            else:
                start = (i - 1)*SIZE + 1 if i > 1 else 0
                chunk = lines[start:end]
            with open(f'./{filename}_{i}.txt', 'w') as ch:
                ch.writelines(chunk)


def get_words_for_ngrams(train_tokens):
    with open(test_data_file) as f:
        test_data = json.load(f)

    syms = ['.', '?', '!', '...', ',', ':', ';',
            '-', '>', '<', '&', '(', '=', '/', '\\', '[', '{']

    words = [x[0] for sub in test_data for x in sub]
    with open(unique_words_file, 'a') as f:
        for i, token in enumerate(train_tokens):
            if token not in words and token not in syms:
                words.append(token)
                f.write(token + '\n')
            print('>>>>>', i)


def merge_gnrams_from_chuks(num_chunks):
    res = []
    for i in range(1, num_chunks):
        with open(f'./trigrams_{i}.txt') as f:
            content = f.readlines()
            res += content
    with open(f'./trigrams.txt', 'w') as f:
        f.writelines(res)


def tsv_to_json(content):
    res = []
    for line in content:
        if line.startswith('{"error"'):
            continue
        obj = {'tks': []}
        parts = line.split()
        for x in [parts[0], parts[1], parts[2]]:
            r = re.split('_(\\d)', x)
            obj['tks'].append({'tt': r[0], 'tg': r[1]})
        obj['mc'] = parts[3]
        obj['vc'] = parts[4]
        obj['sc'] = parts[8]
        res.append(obj)
    return res


In [1879]:
def get_classifier():
    pipe = Pipeline([
        ('dict_vect', DictVectorizer()),
        ('lrc', LogisticRegression(random_state=42, multi_class='multinomial',
                                   max_iter=100, solver='sag', n_jobs=50))])

    return pipe


def check_freq_in_ngrams(tokens, i, ngrams):
    token = tokens[i]
    next_token = tokens[i + 1] if i + 1 < len(tokens) else None
    
    word_freq = 0.00
    next_pos = 'NONE'

    for i, ngram in enumerate(ngrams):
        tks = ngram['tks']
        for i, t in enumerate(tks):
            if t['tg'] == 0 and t['tt'] == token:
                if tks[i + 1]['tt'] == next_token:
                    word_freq = ngram['sc']
                    if next_token:
                        next_pos = next_token.pos_
        return word_freq, next_pos


""" start feature extractors """


def word_feature_extractor(tokens, i):
    token = tokens[i]
    features = {}
    features['word'] = token.text
    return features


def adj_words_feature_extractor(tokens, i):
    token = tokens[i]
    features = {}

    tk_len = len(tokens)

    features['word-1'] = tokens[i - 1].text if i > 1 else '<S>'
    features['word-2'] = tokens[i - 2].text if i > 2 else '<S>'
    features['word+1'] = tokens[i + 1].text if i + 1 < tk_len else '</S>'
    features['word+2'] = tokens[i + 2].text if i + 2 < tk_len else '</S>'
    return features


def pos_feature_extractor(tokens, i):
    token = tokens[i]
    features = {}

    features['POS'] = token.pos_
    return features


def adj_pos_feature_extractor(tokens, i):
    token = tokens[i]
    features = {}

    tk_len = len(tokens)

    features['POS-1'] = tokens[i - 1].pos_ if i > 1 else 'NONE'
    features['POS-2'] = tokens[i - 2].pos_ if i > 2 else 'NONE'
    features['POS+1'] = tokens[i + 1].pos_ if i + 1 < tk_len else 'NONE'
    features['POS+2'] = tokens[i + 2].pos_ if i + 2 < tk_len else 'NONE'
    return features


def shape_feature_extractor(tokens, i):
    token = tokens[i]

    def get_shape(w):
        if w.isupper():
            return 'is_upper'
        elif w.istitle():
            return 'is_title'
        elif w.islower():
            return 'is_lower'
        elif w.isdigit():
            return 'is_digit'
        elif w.isalpha():
            return 'is_alpha'
        else:
            return 'other'

    features = {}
    
    tk_len = len(tokens)

    features['shape'] = get_shape(token.text)
    features['shape-1'] = get_shape(tokens[i - 1].text) if i > 1 else 'other'
    features['shape-2'] = get_shape(tokens[i - 2].text) if i > 2 else 'other'
    features['shape+1'] = get_shape(tokens[i + 1].text) if i + 1 < tk_len else 'other'
    features['shape+2'] = get_shape(tokens[i + 2].text) if i + 2 < tk_len else 'other'
    return features


def ngrams_feature_extractor(ngrams):
    def inner(tokens, i):
        token = tokens[i]
        features = {}
        word_freq, next_pos = check_freq_in_ngrams(tokens, i, ngrams)
        features['freq_in_ngrams'] = word_freq
        features['next_pos_in_ngrams'] = next_pos
        return features
    return inner


""" end feature extractors """


def get_features(tokens, extractors):
    features = []
    seen_features = {}

    for i in range(len(tokens)):
        curr_token = tokens[i]
        if curr_token not in seen_features.keys():
            feat = {}
            for extractor in extractors:
                feat.update(extractor(tokens, i))
            seen_features[curr_token] = feat
            features.append(feat)
        else:
            features.append(seen_features[curr_token])
    return features


def get_cross_validation_report(clf, X_train, y_train):
    scoring = {'accuracy': make_scorer(accuracy_score),
               'precision_true': make_scorer(precision_score, average=None, labels=[True]),
               'precision_false': make_scorer(precision_score, average=None, labels=[False]),
               'precision_macro': make_scorer(precision_score, average='macro'),
               'precision_weighted': make_scorer(precision_score, average='weighted'),
               'recall_true': make_scorer(recall_score, average=None, labels=[True]),
               'recall_false': make_scorer(recall_score, average=None, labels=[False]),
               'recall_macro': make_scorer(recall_score, average='macro'),
               'recall_weighted': make_scorer(recall_score, average='weighted'),
               'f1_true': make_scorer(f1_score, average=None, labels=[True]),
               'f1_false': make_scorer(f1_score, average=None, labels=[False]),
               'f1_macro': make_scorer(f1_score, average='macro'),
               'f1_weighted': make_scorer(f1_score, average='weighted'),
               }
    res = cross_validate(clf, X_train, y_train, scoring=scoring)

    def get_score(field):
        return round(res[field].mean(), 2)

    accuracy = get_score('test_accuracy')
    precision_false = get_score('test_precision_false')
    precision_true = get_score('test_precision_true')
    recall_false = get_score('test_recall_false')
    recall_true = get_score('test_recall_true')
    f1_false = get_score('test_f1_false')
    f1_true = get_score('test_f1_true')
    precision_macro = get_score('test_precision_macro')
    precision_weighted = get_score('test_precision_weighted')
    recall_macro = get_score('test_recall_macro')
    recall_weighted = get_score('test_recall_weighted')
    f1_macro = get_score('test_f1_macro')
    f1_weighted = get_score('test_f1_weighted')

    scores = ['precision', 'recall', 'f1-score']
    labels = ['False', 'True', '', 'accuracy', 'macro avg', 'weighted avg']

    data = np.array([[precision_false, recall_false, f1_false],
                     [precision_true, recall_true, f1_true],
                     ['', '', ''],
                     ['', '', accuracy],
                     [precision_macro, recall_macro, f1_macro],
                     [precision_weighted, recall_weighted, f1_weighted],
                     ])
    print(pandas.DataFrame(data, labels, scores))
    
    
def get_cross_validation_result(clf, train_tokens, feature_extractors):
    train_features = get_features(train_tokens, feature_extractors)
    return get_cross_validation_report(clf, train_features, train_labels)

In [1944]:
extracted_data_file= './extracted_data.json'
train_data_file = './train_data_glued.json'
train_tokens, train_labels = get_train_data(extracted_data_file, train_data_file)

In [1867]:
with open('../../../tasks/06-language-as-sequence/run-on-test.json') as f:
    test_content = json.load(f)
    test_tokens, test_labels = get_test_data(test_content)

In [None]:
with open('./trigrams.json') as f:
    ngrams = json.load(f)

In [1946]:
clf = get_classifier()

### 1. Проста фіча - текст слова

In [1948]:
feature_extractors = [word_feature_extractor]
get_cross_validation_result(clf, train_tokens, feature_extractors)

             precision recall f1-score
False             0.96    1.0     0.98
True              0.75   0.05      0.1
                                      
accuracy                          0.96
macro avg         0.85   0.53     0.54
weighted avg      0.95   0.96     0.94


### 2. Сусідні (по два вправо і вліво) слова

In [1949]:
feature_extractors.append(adj_words_feature_extractor)
get_cross_validation_result(clf, train_tokens, feature_extractors)

             precision recall f1-score
False             0.98    1.0     0.99
True              0.88   0.62     0.73
                                      
accuracy                          0.98
macro avg         0.93   0.81     0.86
weighted avg      0.98   0.98     0.98


### 3. Частина мови

In [1950]:
feature_extractors.append(pos_feature_extractor)
get_cross_validation_result(clf, train_tokens, feature_extractors)

             precision recall f1-score
False             0.98    1.0     0.99
True              0.88   0.62     0.73
                                      
accuracy                          0.98
macro avg         0.93   0.81     0.86
weighted avg      0.98   0.98     0.98


### 4. Частини мови сусідніх (по два вправо і вліво) слів

In [1951]:
feature_extractors.append(adj_pos_feature_extractor)
get_cross_validation_result(clf, train_tokens, feature_extractors)

             precision recall f1-score
False             0.98    1.0     0.99
True              0.88   0.62     0.73
                                      
accuracy                          0.98
macro avg         0.93   0.81     0.86
weighted avg      0.98   0.98     0.98


### 5. Форма слова (+ форми слова сусідніх слів)

In [1953]:
feature_extractors.append(shape_feature_extractor)
get_cross_validation_result(clf, train_tokens, feature_extractors)

             precision recall f1-score
False             0.99    1.0     0.99
True              0.87   0.71     0.78
                                      
accuracy                          0.98
macro avg         0.93   0.85     0.89
weighted avg      0.98   0.98     0.98


### 6. н-грами (частота вживання слів поруч + наступна частина мови)

In [1954]:
feature_extractors.append(ngrams_feature_extractor(ngrams))
get_cross_validation_result(clf, train_tokens, feature_extractors)

             precision recall f1-score
False             0.99    1.0     0.99
True              0.87   0.71     0.78
                                      
accuracy                          0.98
macro avg         0.93   0.85     0.89
weighted avg      0.98   0.98     0.98


### 7. Тестова вибірка

In [1945]:
vect = DictVectorizer()
train_features = get_features(train_tokens, feature_extractors)
train_feat_vectorized = vect.fit_transform(train_features)
clf = LogisticRegression(random_state=42, multi_class='multinomial',
                         max_iter=100, solver='sag', n_jobs=50)
clf.fit(train_feat_vectorized, train_labels)
test_features = get_features(test_tokens, feature_extractors)
print(classification_report(test_labels, clf.predict(vect.transform(test_features))))

              precision    recall  f1-score   support

       False       0.97      0.99      0.98      4542
        True       0.37      0.26      0.30       155

    accuracy                           0.96      4697
   macro avg       0.67      0.62      0.64      4697
weighted avg       0.95      0.96      0.96      4697

