# Dataset from Google Research
One million English sentences, each split into two sentences that together preserve the original meaning, extracted from Wikipedia edits.
https://github.com/google-research-datasets/wiki-split.git

In [2]:
import json
import pandas as pd
import random
import spacy

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

data_dir = '../../../../wiki-split/'
train_filename = data_dir + 'train.tsv.zip'
valid_filename = data_dir + 'validation.tsv'
test_filename = data_dir + 'test.tsv'

In [3]:
def read_data(filename):
    out_data = []
    data = pd.read_csv(filename, sep='\t')
    for index, row in data.iterrows():
        out_data.append([s.strip() for s in row[1].split('<::::>')])

    return pd.DataFrame(out_data)

In [4]:
train_data_df = read_data(train_filename)
train_data_df

Unnamed: 0,0,1
0,' '' BDSM is solely based on consensual activi...,The concepts presented by de Sade are not in a...
1,' '' Critics criticized the use of the dispute...,"For example , The Traditional Values Coalition..."
2,"' '' Do Re Mi '' ' is a song by Kurt Cobain , ...",It is believed to be one of the final songs he...
3,"' '' For Robert Price '' docetism '' , togethe...","In one version , as in Marcionism , Christ was..."
4,' '' He was the fourth of the nine children of...,His father was a Presbyterian minister who rai...
...,...,...
989938,` Arta is a village in Djibouti .,It is located in the Arta Region .
989939,` Assa Gaila is a town in Djibouti .,It is located in the Tadjoura region .
989940,` Jackson was linked with actor Gary Pendergas...,They formed Shoot The Moon Productions togethe...
989941,` Umar appointed him to be the judge of Kufah .,He was very young at the time .


In [5]:
valid_data_df = read_data(valid_filename)
valid_data_df

Unnamed: 0,0,1
0,'' A Living Library '' was Sherk 's work that ...,She transformed these spaces for to build educ...
1,"'' All Singing , All Dancing '' is the elevent...",It originally aired on the Fox network in the ...
2,'' Already Gone '' is a mid-tempo ballad set i...,The female narrator describes her life and alw...
3,'' Bafana Bafana '' is a nickname given to the...,It is Zulu and translates literally as '' the ...
4,'' Blah Blah Blah '' is a song by American pop...,"It is the second single from her debut album ,..."
...,...,...
4994,"Zahm Hall , a male dormitory at Notre Dame , i...",The dorm 's chapel is dedicated to St. Albert ...
4995,Zahn was first diagnosed in the late 1990s .,"Thereafter , he became a vocal supporter of th..."
4996,Zeinab Elobeid Yousif ( 1952 -- 19 March 2016 ...,She was the first Sudanese female to be licens...
4997,"Zen Peacemakers have a 34 - acre campus , the ...",In the Untied States affiliates include includ...


In [6]:
test_data_df = read_data(test_filename)
test_data_df

Unnamed: 0,0,1
0,' Eden Black ' was grown from seed in the late...,Under his conditions it produces pitchers that...
1,' Wilson should extend his stint on The Voice ...,Given that they 're pulling out all the stops ...
2,'' '' New York Mining Disaster 1941 '' '' was ...,"It was their second EP and , like their first ..."
3,"'' ADAPTOGENS : Herbs for Strength , Stamina ,...",Contains a detailed monograph on Schisandra ch...
4,'' Aerodynamic '' is an song by Daft Punk .,It is a instrumental particularly well - known...
...,...,...
4994,"Zhang 's grandfather , convinced that Renjie s...",The family arranged a marriage for him with Ya...
4995,Zhu De became the commander of the Eighth Rout...,Agents working under Zhou Enlai set up a headq...
4996,Zile Huma was born into a filmi and musical fa...,She was the youngest of the three children of ...
4997,"Zion 's Hill , also known by its former name H...",It is the first town one reaches after leaving...


# Baseline

In [7]:
test_filename = '../../../tasks/06-language-as-sequence/run-on-test.json'
with open(test_filename) as f:
    test_data = json.load(f)

test_tokens = []
test_classes = []
for sentence in test_data:
    for word in sentence:
        test_tokens.append(word[0])
        test_classes.append(word[1])

In [89]:
' '.join(test_tokens[:50])

'I think the magnitude of a benefit and error rates that were chosen were reasonable They were standard from our learning . Economists on both the left and right broadly agree that the need for stimulative government spending is necessary to prevent a further collapse of the global economic system'

In [90]:
' '.join([str(c) for c in test_classes[:50]])

'False False False False False False False False False False False False False False True False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False'

### Just predict everything False

In [10]:
print(classification_report(test_classes, [False] * len(test_tokens)))

              precision    recall  f1-score   support

       False       0.97      1.00      0.98      4542
        True       0.00      0.00      0.00       155

    accuracy                           0.97      4697
   macro avg       0.48      0.50      0.49      4697
weighted avg       0.94      0.97      0.95      4697



  'precision', 'predicted', average, warn_for)


# Prepare validation data from sentences

In [11]:
def make_val_list(sentence):
    result = []
    for word in sentence:
        result.append([word, False])

    if word not in ('.', '?', '!'):
        result[-1][1] = True

    return result

val_data_list = []
for i, row in test_data_df.iterrows():
    sentence = row[0].split()
    del sentence[-1]
    sentence1 = row[1].split()
    sentence1[0].lower()

    result = make_val_list(sentence)
    result.extend(make_val_list(sentence1))
    val_data_list.append(result)

In [12]:
val_data_list[0]

[["'", False],
 ['Eden', False],
 ['Black', False],
 ["'", False],
 ['was', False],
 ['grown', False],
 ['from', False],
 ['seed', False],
 ['in', False],
 ['the', False],
 ['late', False],
 ['1980s', False],
 ['by', False],
 ['Stephen', False],
 ['Morley', True],
 ['Under', False],
 ['his', False],
 ['conditions', False],
 ['it', False],
 ['produces', False],
 ['pitchers', False],
 ['that', False],
 ['are', False],
 ['almost', False],
 ['completley', False],
 ['black', False],
 ['.', False]]

In [13]:
for i, row in valid_data_df.iterrows():
    sentence = row[0].split()
    del sentence[-1]
    sentence1 = row[1].split()

    result = make_val_list(sentence)
    result.extend(make_val_list(sentence1))
    val_data_list.append(result)

In [14]:
val_data_list[-1]

[['Zenica', False],
 ['(', False],
 ['Cyrillic', False],
 [':', False],
 ["''", False],
 ['Зеница', False],
 ["''", False],
 [')', False],
 ['is', False],
 ['an', False],
 ['industrial', False],
 ['city', False],
 ['(', False],
 ['the', False],
 ['third', False],
 ['largest', False],
 [',', False],
 ['after', False],
 ['Sarajevo', False],
 ['and', False],
 ['Banja', False],
 ['Luka', False],
 [')', False],
 ['and', False],
 ['municipality', False],
 ['in', False],
 ['Bosnia', False],
 ['and', False],
 ['Herzegovina', True],
 ['It', False],
 ['is', False],
 ['the', False],
 ['capital', False],
 ['of', False],
 ['the', False],
 ['Zenica', False],
 ['-', False],
 ['Doboj', False],
 ['Canton', False],
 ['of', False],
 ['the', False],
 ['Federation', False],
 ['of', False],
 ['Bosnia', False],
 ['and', False],
 ['Herzegovina', False],
 ['entity', False],
 ['.', False]]

In [15]:
random.seed(1)
random.shuffle(val_data_list)

In [16]:
val_data_list[-1]

[['It', False],
 ['is', False],
 ['a', False],
 ['popular', False],
 ['misconception', False],
 ['that', False],
 ['these', False],
 ['lakes', False],
 ['are', False],
 ['filled', False],
 ['via', False],
 ['the', False],
 ['Nepean', False],
 ['River', True],
 ['They', False],
 ['are', False],
 ['not', False],
 [',', False],
 ['they', False],
 ['are', False],
 ['filled', False],
 ['via', False],
 ['rain', False],
 ['water', False],
 ['and', False],
 ['ground', False],
 ['water', False],
 ['.', False]]

In [17]:
val_tokens = []
val_classes = []
for sentence in val_data_list:
    for word in sentence:
        val_tokens.append(word[0])
        val_classes.append(word[1])

In [18]:
print(' '.join(val_tokens[:20]))
print(' '.join([str(c) for c in val_classes[:20]]))

The Institute has planned to increase the PRM batch size by another 60 , from the academic year 2013 -
False False False False False False False False False False False False False False False False False False False False


In [19]:
print(classification_report(val_classes, [False] * len(val_tokens)))

              precision    recall  f1-score   support

       False       0.97      1.00      0.99    350435
        True       0.00      0.00      0.00      9994

    accuracy                           0.97    360429
   macro avg       0.49      0.50      0.49    360429
weighted avg       0.95      0.97      0.96    360429



So our baseline works the same on validation :D

# Prepare train data

In [20]:
train_data_list = []
for i, row in train_data_df.iterrows():
    sentence = row[0].split()
    sentence1 = row[1].split()
    del sentence[-1]

    result = make_val_list(sentence)
    result.extend(make_val_list(sentence1))
    train_data_list.append(result)

In [21]:
train_tokens = []
train_classes = []
for sentence in train_data_list:
    for word in sentence:
        train_tokens.append(word[0])
        train_classes.append(word[1])

In [22]:
count_vect = CountVectorizer()
train_counts = count_vect.fit_transform(train_tokens)
print('Train counts shape:', train_counts.shape)

Train counts shape: (35653401, 533517)


In [23]:
lr = LogisticRegression(random_state=1, solver='sag', multi_class="multinomial", max_iter=100, n_jobs=-1)
scores = cross_val_score(lr, train_counts, train_classes, cv=3, scoring='f1_macro', n_jobs=-1)
print('[cross_val] F1:', sum(scores)/3)

[cross_val] F1: 0.4943167310504299


----------------------------
oh, this is not funny at all

In [24]:
lr.fit(train_counts, train_classes)

val_counts = count_vect.transform(val_tokens)
print('Val counts shape:', val_counts.shape)
y_pred = lr.predict(val_counts)

print(classification_report(val_classes, y_pred))

Val counts shape: (360429, 533517)
              precision    recall  f1-score   support

       False       0.97      1.00      0.99    350435
        True       0.52      0.00      0.00      9994

    accuracy                           0.97    360429
   macro avg       0.75      0.50      0.49    360429
weighted avg       0.96      0.97      0.96    360429



----------------
Мені здається я потроху доходжу до висновку, що мій датасет **надто** відрізняється від того, на котрому відбувається тест. Тому він не вгадує нічого

# Feature Extraction

In [29]:
nlp = spacy.load("en_core_web_md")

In [30]:
def word2features(sent, i):
    word = sent[i][0].text
    postag = sent[i][0].pos_
    lemma = sent[i][0].lemma_

    features = {
        'bias': 1.0,
        'lemma': lemma,
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper() and len(word) > 1,
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
    }
    if i > 0:
        word1 = sent[i-1][0].text
        postag1 = sent[i-1][0].pos_
        lemma1 = sent[i-1][0].lemma_
        features.update({
            '-1:word.lower()': lemma1,
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper() and len(word1) > 1,
            '-1:postag': postag1,
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0].text
        postag1 = sent[i+1][0].pos_
        lemma1 = sent[i+1][0].lemma_
        features.update({
            '+1:word.lower()': lemma1,
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper() and len(word1) > 1,
            '+1:postag': postag1,
        })
    else:
        features['EOS'] = True

    return features

In [31]:
def make_val_list(sentence):
    result = []
    is_end_found = False
    for token in sentence:
        if not is_end_found and token.text in ('.', '?', '!'):
            result[-1][1] = True
            is_end_found = True
        else:
            result.append([token, False])

    return result

In [32]:
val_temp_data = []
for i, row in valid_data_df.iterrows():
    doc = row[0] + ' ' + row[1]
    val_temp_data.append(doc)

for i, row in test_data_df.iterrows():
    sentence = row[0]
    sentence2_split = row[1].split()
    doc = sentence + ' ' + sentence2_split[0].lower() + ' ' + ' '.join(sentence2_split[1:])
    val_temp_data.append(doc)

val_data = []
for doc in nlp.pipe(val_temp_data):
    val_data.append(make_val_list(doc))

random.seed(1)
random.shuffle(val_data)

val_features = []
val_tokens = []
val_classes = []
for sentence in val_data:
    for i, word in enumerate(sentence):
        val_features.append(word2features(sentence, i))
        val_tokens.append(word[0])
        val_classes.append(word[1])
len(val_features), len(val_tokens), len(val_classes)

(361579, 361579, 361579)

In [33]:
train_temp_data = []
for i, row in train_data_df.iterrows():
    doc = row[0] + ' ' + row[1]
    train_temp_data.append(doc)

for i, row in test_data_df.iterrows():
    sentence = row[0]
    sentence2_split = row[1].split()
    doc = sentence + ' ' + sentence2_split[0].lower() + ' ' + ' '.join(sentence2_split[1:])
    train_temp_data.append(doc)

In [35]:
train_data = []
for doc in nlp.pipe(train_temp_data):
    train_data.append(make_val_list(doc))

train_features = []
train_tokens = []
train_classes = []
for sentence in train_data:
    for i, word in enumerate(sentence):
        train_features.append(word2features(sentence, i))
        train_tokens.append(word[0])
        train_classes.append(word[1])
len(train_features), len(train_tokens), len(train_classes)

(35951066, 35951066, 35951066)

In [36]:
vectorizer = DictVectorizer()
vec = vectorizer.fit(train_features)
print("Total number of features: ", len(vec.get_feature_names()))

Total number of features:  1851555


In [37]:
val_features_vectorized = vec.transform(val_features)

In [38]:
train_features_vectorized = vec.transform(train_features)

In [46]:
lrf = LogisticRegression(random_state=1, solver='sag', multi_class="multinomial", max_iter=100, n_jobs=-1)
scores = cross_val_score(lrf, train_features_vectorized[:1000000], train_classes[:1000000], cv=3, scoring='f1_macro', n_jobs=-1)
print('[cross_val] F1:', sum(scores)/3)

[cross_val] F1: 0.9423691593851049


In [52]:
lrf.fit(train_features_vectorized, train_classes)

y_pred = lrf.predict(val_features_vectorized)

print(classification_report(val_classes, y_pred))



              precision    recall  f1-score   support

       False       0.98      1.00      0.99    351581
        True       0.93      0.45      0.61      9998

    accuracy                           0.98    361579
   macro avg       0.96      0.73      0.80    361579
weighted avg       0.98      0.98      0.98    361579



--------------------------
Йой, на валідації це дуже непогано працює

# А тепер на тестових даних

In [112]:
def word2test_features(sent, i):
    word = sent[i].text
    postag = sent[i].pos_
    lemma = sent[i].lemma_

    features = {
        'bias': 1.0,
        'lemma': lemma,
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper() and len(word) > 1,
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
    }
    if i > 0:
        word1 = sent[i-1].text
        postag1 = sent[i-1].pos_
        lemma1 = sent[i-1].lemma_
        features.update({
            '-1:word.lower()': lemma1,
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper() and len(word1) > 1,
            '-1:postag': postag1,
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1].text
        postag1 = sent[i+1].pos_
        lemma1 = sent[i+1].lemma_
        features.update({
            '+1:word.lower()': lemma1,
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper() and len(word1) > 1,
            '+1:postag': postag1,
        })
    else:
        features['EOS'] = True

    return features

sentences = []
sentences_classes = []

prev_i = 0
for i, token in enumerate(test_tokens):
    if (test_classes[i]) or (token in ('.', '!', '?')):
        sentences.append(' '.join(test_tokens[prev_i:i+1]))
        prev_i = i + 1

def make_test_list(sentence):
    result = []
    for i, token in enumerate(sentence):
        class_ = False
        if (i == len(sentence) - 1) and token.text not in ('.', '!', '?'):
            class_ = True
        result.append([token, class_])

    return result

new_test_features = []
new_test_tokens = []
new_test_classes = []
for sentence in nlp.pipe(sentences):
    for i, word in enumerate(make_test_list(sentence)):
        new_test_features.append(word2test_features(sentence, i))
        new_test_tokens.append(word[0])
        new_test_classes.append(word[1])

len(new_test_features), len(new_test_tokens), len(test_tokens), len(new_test_classes), len(test_classes)

(4735, 4735, 4697, 4735, 4697)

In [113]:
sum(new_test_classes), sum(test_classes)

(155, 155)

In [114]:
test_features_vectorized = vec.transform(new_test_features)

In [115]:
y_pred = lrf.predict(test_features_vectorized)

print(classification_report(new_test_classes, y_pred))

              precision    recall  f1-score   support

       False       0.97      0.99      0.98      4580
        True       0.00      0.00      0.00       155

    accuracy                           0.96      4735
   macro avg       0.48      0.50      0.49      4735
weighted avg       0.94      0.96      0.95      4735



# Висновки

Щоб щось вийшло, треба щоб дані було максимально схожі на ті, котрі треба передбачати :(