In [1]:
import sklearn_crfsuite
import json
import pickle
from sklearn_crfsuite import metrics
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
import spacy
nlp = spacy.load("en_core_web_md", disable = 'ner')

In [2]:
with open('test_data/run-on-test.json') as f:
    test_data = json.load(f)

with (open("train_labels.pickle", "rb")) as openfile:
    train_labels = pickle.load(openfile)
    
with (open("train_tokens.pickle", "rb")) as openfile:
    train_tokens = pickle.load(openfile)

In [3]:
test_tokens = [[pair[0] for pair in sentence] for sentence in test_data]
test_labels = [[str(pair[1]) for pair in sentence] for sentence in test_data]

test_labels_flat = [item for sublist in test_labels for item in sublist]

In [4]:
def classify_data (train_features_flat, train_labels_flat, test_features_flat):

    vec = DictVectorizer()
    X = vec.fit_transform(train_features_flat)
    clf = LogisticRegression(solver = 'sag',random_state=0).fit(X, train_labels_flat)
    y_pred = clf.predict(vec.transform(test_features_flat))
    return classification_report(test_labels_flat, y_pred, digits =4)


## Baseline

##### За бейзлайн я взяла ознаку написання наступного слова з великої букви. Тобто якшо після поточного токена йде слово з великої букви, то поточний токен - кінець речення.

##### На жаль, використовую логрегресію. Планувала робити класифікацію на CRF, але він повертав мені 0 правильних передбачених випадків для класу True. Очевидно, що я десь помилилася, але не змогла знайти помилку(

In [9]:
def word2features (sent, i):
    features = dict()
    if i == 0:
        features['next_w_is_capitalized'] = 0
    elif i < len(sent) - 1:
        features['next_w_is_capitalized'] = 1 if sent[i + 1][0].isupper() else 0
    else:
        features['next_w_is_capitalized'] = 1
    
    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

In [10]:
train_features = [sent2features(sent) for sent in train_tokens]
test_features = [sent2features(sent) for sent in test_tokens]

test_features_flat = [item for sublist in test_features for item in sublist]
train_features_flat = [item for sublist in train_features for item in sublist]
train_labels_flat  = [item for sublist in train_labels for item in sublist]

In [11]:
print(classify_data(train_features_flat, train_labels_flat, test_features_flat))


              precision    recall  f1-score   support

       False     0.9810    0.9071    0.9426      4542
        True     0.1509    0.4839    0.2301       155

    accuracy                         0.8931      4697
   macro avg     0.5659    0.6955    0.5863      4697
weighted avg     0.9536    0.8931    0.9191      4697



## Improved

In [12]:
with (open("bigrams-1/bigrams.pickle", "rb")) as openfile:
    word_bigrams = pickle.load(openfile)

with (open("bigrams-1/lemma_bigrams.pickle", "rb")) as openfile:
    lemma_bigrams = pickle.load(openfile)

with (open("bigrams-1/pos_bigrams.pickle", "rb")) as openfile:
    pos_bigrams = pickle.load(openfile)

with (open("bigrams-1/tag_bigrams.pickle", "rb")) as openfile:
    tag_bigrams = pickle.load(openfile)
    
with (open("bigrams-1/dep_bigrams.pickle", "rb")) as openfile:
    dep_bigrams = pickle.load(openfile)

In [18]:
def extract_ling_features (sentence):
    
    pos_tags,lemmas, deps, tags  = [], [], [], []
    sentence = ' '.join(sentence)
    sentence = nlp(sentence)
    
    for token in sentence:
        pos_tags.append(token.pos_)
        lemmas.append(token.lemma_)
        deps.append(token.dep_)
        tags.append(token.tag_)
        
    return (pos_tags, deps, lemmas, tags)

def compare_ngrams (token, next_token, bigram_container):
    
    freq_1 = bigram_container.get((token, next_token), 0)
    freq_2 = bigram_container.get((token, '.'), 0)
    freq_3 = bigram_container.get(('<S>', next_token), 0)
    if (freq_1 > freq_2) and (freq_1 > freq_3):
        return True
    else:
        return False

def word2features (sent, i):
    features = dict()
    
    ling_features = extract_ling_features(sent)
    
    features['pos'] = ling_features[0][i]
    features['dep'] = ling_features[1][i]
    features['lemma'] = ling_features[2][i].lower()
    features['tag'] = ling_features[3][i]
    
    if i == 0:
        
        features['next_w_is_capitalized'] = 0
#         features['word_bigrams'] = compare_ngrams(sent[i].lower(), sent[i+1].lower(), word_bigrams)
#         features['pos_bigrams'] = compare_ngrams(features['pos'], ling_features[0][i+1], pos_bigrams)
#         features['lemma_bigrams'] = compare_ngrams(features['lemma'], ling_features[2][i+1].lower(), lemma_bigrams)
#         features['tag_bigrams'] = compare_ngrams(features['tag'], ling_features[3][i+1], tag_bigrams)
#         features['dep_bigrams'] = compare_ngrams(features['dep'], ling_features[2][i+1], dep_bigrams)

    elif i < len(sent) - 1:
        
        features['next_w_is_capitalized'] = 1 if sent[i + 1][0].isupper() else 0
#         features['word_bigrams'] = compare_ngrams(sent[i].lower(), sent[i+1].lower(), word_bigrams)
#         features['pos_bigrams'] = compare_ngrams(features['pos'], ling_features[0][i+1], pos_bigrams)
#         features['lemma_bigrams'] = compare_ngrams(features['lemma'], ling_features[2][i+1].lower(), lemma_bigrams)
#         features['tag_bigrams'] = compare_ngrams(features['tag'], ling_features[3][i+1], tag_bigrams)
#         features['dep_bigrams'] = compare_ngrams(features['dep'], ling_features[2][i+1], dep_bigrams)
        
    else:
        features['next_w_is_capitalized'] = 1
    
    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

train_features_1 = [sent2features(sent) for sent in train_tokens]
test_features_1 = [sent2features(sent) for sent in test_tokens]

test_features_flat_1 = [item for sublist in test_features_1 for item in sublist]
train_features_flat_1 = [item for sublist in train_features_1 for item in sublist]
train_labels_flat_1  = [item for sublist in train_labels for item in sublist]

#### Мій найкращий результат - комбінація лінгвістичних фіч без нграм

In [117]:
print(classify_data(train_features_flat_1, train_labels_flat_1, test_features_flat_1))


              precision    recall  f1-score   support

       False     0.9815    0.9828    0.9822      4542
        True     0.4765    0.4581    0.4671       155

    accuracy                         0.9655      4697
   macro avg     0.7290    0.7204    0.7246      4697
weighted avg     0.9649    0.9655    0.9652      4697





#### Результат з нграмами різних типів - якість падає на 10%

In [19]:
print(classify_data(train_features_flat_1, train_labels_flat_1, test_features_flat_1))


              precision    recall  f1-score   support

       False     0.9809    0.9478    0.9641      4542
        True     0.2305    0.4581    0.3067       155

    accuracy                         0.9317      4697
   macro avg     0.6057    0.7029    0.6354      4697
weighted avg     0.9561    0.9317    0.9424      4697



