In [2]:
!curl https://raw.githubusercontent.com/vseloved/prj-nlp-2019/master/tasks/07-language-as-sequence/run-on-test.json --output run-on-test.json

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  273k  100  273k    0     0   492k      0 --:--:-- --:--:-- --:--:--  492k


In [5]:
import json

with open('run-on-test.json') as f:
    run_on_js = json.load(f)

In [4]:
import random

def rand_run_on_num():
    x = random.random()
    if x < 0.25:
        return 0
    elif 0.25 <= x < 0.97:
        return 1
    
    return 2

In [257]:
from nltk.corpus import brown
from nltk.corpus import reuters
from nltk.corpus import gutenberg
from nltk.corpus import abc
from nltk.corpus import treebank
from nltk.corpus import conll2000

source = []

for fileid in reuters.fileids():
    source.append(reuters.sents(fileid))
    
for fileid in gutenberg.fileids():
    source.append(gutenberg.sents(fileid))    
    
for fileid in brown.fileids():
    source.append(brown.sents(fileid))    
    
for fileid in abc.fileids():
    source.append(abc.sents(fileid))        
    
for fileid in treebank.fileids():
    source.append(treebank.sents(fileid))
    
for fileid in conll2000.fileids():
    corpus.append(conll2000.sents(fileid))     

In [3]:
from collections import defaultdict
import nltk

In [6]:
import spacy

nlp = spacy.load('en_core_web_lg', disable = ['ner', 'parser'])

In [258]:
from collections import namedtuple

Token = namedtuple('Token', 'text point_after pos tag lemma')

random.seed(273757)

corpus = []

def tag_tokens(tokens):   
    
    doc = nlp(' '.join(tokens))
    
    return [Token(token.text, False, token.pos_, token.tag_, token.lemma_, token.dep_) for token in doc]

with open('run-on-corpus.txt', 'w') as f_corpus:
    for sents in source:        
        run_on_num = rand_run_on_num()
        buff = []
        prev_offset = 0   
        for sent in sents[1:]:
            if len(sent) < 5:
                continue
                
            exclude = False
            for word in sent:
                if word == '?' or word == ';' or word == '...':
                    exclude = True
                    break
            if exclude:
                continue
                
            tokens = tag_tokens(sent)
            buff.extend(tokens)          
                
            if prev_offset > 0:                
                if buff[prev_offset - 1].text == '.':
                    del buff[prev_offset - 1]
                    buff[prev_offset - 2] = buff[prev_offset - 2]._replace(point_after = True)                
                   
                    if not buff[prev_offset - 1].text.isupper():                
                        lwr = random.random()                
                        if lwr < 0.93:
                            buff[prev_offset - 1] = buff[prev_offset -1]._replace(text = buff[prev_offset - 1].text.lower())                                            
            
            prev_offset = len(buff)            
            
            if run_on_num == 0:
                print(buff, file = f_corpus, end = "\n")
                corpus.append(buff)
                run_on_num = rand_run_on_num()
                prev_offset = 0
                buff = []                               
                       
            run_on_num -= 1

In [259]:
len(corpus)

18701

In [260]:
for i in range(10):
    random.shuffle(corpus)

train_index = int(0.7 * len(corpus))

train_data = corpus[: train_index]
test_data = corpus[train_index: ]

In [261]:
bigrams = defaultdict(float)
trigrams = defaultdict(float)

all_tokens = [token for sent in train_data for token in map(lambda x: x.text.lower(),sent)]

for g in nltk.ngrams(all_tokens, 2):        
    bigrams[g] += 1.0
                
for g in nltk.ngrams(all_tokens, 3):    
    trigrams[g] += 1.0       

In [262]:
def update_freq(ngrams):
    n = sum(ngrams.values())
    for k, v in ngrams.items():
        ngrams[k] = v/n

In [263]:
update_freq(bigrams)
update_freq(trigrams)

In [292]:
sorted(bigrams.items(), key=lambda kv: kv[1], reverse = True)

[(('.', 'the'), 0.00598490423501983),
 (('of', 'the'), 0.0055116691522362525),
 (("'", 's'), 0.00521861072941156),
 (('in', 'the'), 0.004758400465420191),
 (('said', '.'), 0.003377769673446085),
 ((',', 'the'), 0.003186738997827026),
 (('said', 'the'), 0.0029913667159438976),
 ((',', '000'), 0.0029544630626993067),
 (('mln', 'dlrs'), 0.0028524353154936732),
 (('u', '.'), 0.0026852834743269966),
 (('the', 'company'), 0.002631013396026128),
 (('.', '"'), 0.002572401711461189),
 (('.', 's'), 0.002500765208104042),
 (('s', '.'), 0.002405249870294513),
 ((',', '"'), 0.002403079067162478),
 (('for', 'the'), 0.0021121914474698206),
 (('to', 'the'), 0.001901623543662449),
 ((',', 'and'), 0.0018191330246451279),
 (('said', 'it'), 0.0017995957964568152),
 (('1', '.'), 0.0017995957964568152),
 (('it', 'said'), 0.001602052711441652),
 ((',', 'which'), 0.0015955403020455478),
 (('he', 'said'), 0.0014804877360477056),
 (('on', 'the'), 0.0014435840828031148),
 (('will', 'be'), 0.0013806307919741066),

In [265]:
import math

def baseline(data):
    result_data = []
    for sent in data:
        result_sent = []
        last_point = 0
        for i, word in enumerate(sent):            
            if (i - last_point) > 3 and i < (len(sent) - 1):
                pbigram = (word[0].lower(), '.')                
                bigram = (word[0].lower(), sent[i+1][0].lower())                                    
                if (sent[i + 1][0][0:1].isupper()):
                    result_sent.append([word[0], True])
                elif math.log(bigrams[pbigram] + 0.000000000001) > math.log(bigrams[bigram] + 0.001):    
                    #if not word[1]:
                         #print('false positive', pbigram, bigram, bigrams[pbigram], bigrams[bigram], word, sent[i + 1])
                    result_sent.append([word[0], True])
                    last_point = i
                else:   
                    #if word[1]:
                    #     print('false negative', pbigram, bigram, bigrams[pbigram], bigrams[bigram], word, sent[i + 1])
                    result_sent.append([word[0], False])                                    
            else:
                #if word[1]:
                #    print('false negative', word, sent[i + 1])
                result_sent.append([word[0], False])
            
        result_data.append(result_sent)   
    
    return result_data

In [266]:
result = baseline(test_data)

In [267]:
def labels_vec(data):
    return [word[1] for sent in data for word in sent]

In [268]:
from sklearn.metrics import classification_report


print(classification_report(labels_vec(test_data), labels_vec(result)))

              precision    recall  f1-score   support

       False       0.99      0.91      0.95    196267
        True       0.02      0.18      0.03      1655

   micro avg       0.91      0.91      0.91    197922
   macro avg       0.51      0.55      0.49    197922
weighted avg       0.98      0.91      0.94    197922



In [269]:
print(classification_report(labels_vec(run_on_js), labels_vec(baseline(run_on_js))))

              precision    recall  f1-score   support

       False       0.98      0.96      0.97      4542
        True       0.26      0.45      0.33       155

   micro avg       0.94      0.94      0.94      4697
   macro avg       0.62      0.70      0.65      4697
weighted avg       0.96      0.94      0.95      4697



In [436]:
import sys

def extract_sent_features(sent):
    features = []
    for i, token in enumerate(sent):
        fdic = {}        
        fdic['next_case'] = sent[i+1].text[:1].isupper() if i < (len(sent) - 1) else False        
        fdic['case'] = token.text[:1].isupper()
        
        fdic['tag'] = token.tag
        fdic['tag_prev'] = 'NONE_TAG' if i == 0 else str(sent[i-1].tag)
        fdic['tag_next'] = 'NONE_TAG' if i == (len(sent) - 1) else str(sent[i+1].tag)        
        
        fdic['index1'] = len(sent)/(i+1)
        fdic['index2'] = len(sent)/(len(sent) - i+1)
        
        
        fdic['word'] = token.text
        fdic['prev_word'] = '' if i == 0 else sent[i-1].text
        fdic['next_word'] = '' if i == (len(sent) - 1) else sent[i+1].text        
        
        fdic['bigram'] = math.log((0.0 if i == (len(sent) - 1) else bigrams[(token.text.lower(), sent[i+1].text.lower())]) +  0.000001)
        fdic['pbigram'] = math.log((0.0 if i == (len(sent) - 1) else bigrams[(token.text.lower(), '.')]) +  0.000001)
        fdic['pbigram2'] = math.log((0.0 if i == (len(sent) - 1) else bigrams[('.', sent[i+1].text.lower())]) +  0.000001)      
        fdic['ptrigram'] = math.log((0.0 if i == (len(sent) - 1) else trigrams[(token.text.lower(), '.', sent[i+1].text.lower())]) +  0.000001)
        fdic['ptrigram2'] = math.log((0.0 if i > (len(sent) - 3) else trigrams[('.', sent[i+1].text.lower(), sent[i+2].text.lower())]) +  0.000001)
        fdic['ptrigram3'] = math.log((trigrams[(sent[i-1].text.lower(), token.text.lower(), '.')] if i > 0 else 0.0) +  0.000001)   
        
        features.append(fdic)
    
    return features

def extract_features(sents):
    features = []
    for sent in sents:
        features.extend(extract_sent_features(sent))
    return features 
    

In [437]:
features = extract_features(train_data)

In [438]:
from sklearn.feature_extraction import DictVectorizer

vectorizer = DictVectorizer()

vectorizer.fit(features)

feature_vecs = vectorizer.transform(features)

In [439]:
from sklearn.linear_model import LogisticRegression 

#logreg = LogisticRegression(random_state=26,  solver='lbfgs', multi_class="multinomial", max_iter=2000)
logreg = LogisticRegression(random_state=26, max_iter=3000)

logreg.fit(feature_vecs, labels_vec(train_data))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=3000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=26, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [440]:
def predict(data):
    vec = vectorizer.transform(extract_features(data))
    return logreg.predict(vec)

In [441]:
test_predicted = predict(test_data)

In [442]:
print(classification_report(labels_vec(test_data), test_predicted))

              precision    recall  f1-score   support

       False       1.00      1.00      1.00    196267
        True       0.68      0.62      0.65      1655

   micro avg       0.99      0.99      0.99    197922
   macro avg       0.84      0.81      0.82    197922
weighted avg       0.99      0.99      0.99    197922



In [443]:
def convert_to_tokens(data):
    tagged_sents = []
    
    for sent in data:
        tagged_sent = [] 
        tokens = tag_tokens(list(map(lambda w: w[0], sent)))
        compound = None
        merged = 0
        i = 0
        j = 0
        while i < len(tokens) or j < len(sent): 
            orig_token = sent[i] if i < len(sent) else sent[len(sent) - 1]
            token = tokens[j] if j < len(tokens) else tokens[len(tokens) - 1]           
            if orig_token[0] == token.text:                
                tagged_sent.append(token._replace(point_after = orig_token[1]))
                j += 1
                i += 1
            elif orig_token[0] in token.text:                
                if not compound:
                    compound = token
                else:
                    compound = compound._replace(point_after = orig_token[1])
                merged += len(orig_token[0])    
                if merged == len(compound.text):                     
                    j += 1
                    tagged_sent.append(compound)
                    compound = None                
                    merged = 0
                i += 1    
            elif token.text in orig_token[0]:                
                if not compound:
                    compound = orig_token[0]
                    
                merged += len(token.text)
                if merged == len(compound): 
                    i += 1
                    tagged_sent.append(token._replace(point_after = orig_token[1]))                    
                    compound = None
                    merged = 0
                else:
                    tagged_sent.append(token)
                j += 1
            else:                
                print(orig_token, token.text)
                print(list(map(lambda w: w[0], tagged_sent)))
                print(list(map(lambda w: w[0], sent)))
                assert False                                    
        
        tagged_sents.append(tagged_sent)
    
    return tagged_sents

run_on_js_tokenized = convert_to_tokens(run_on_js)

In [444]:
print(sum([w[1] for sent in run_on_js for w in sent]))
print(sum([t.point_after for sent in run_on_js_tokenized for t in sent]))

155
155


In [445]:
print(classification_report(labels_vec(run_on_js_tokenized), predict(run_on_js_tokenized)))

              precision    recall  f1-score   support

       False       0.98      1.00      0.99      4618
        True       0.74      0.37      0.50       155

   micro avg       0.98      0.98      0.98      4773
   macro avg       0.86      0.68      0.74      4773
weighted avg       0.97      0.98      0.97      4773



In [446]:
sum( predict(run_on_js_tokenized))

78