In [1]:
#!curl https://raw.githubusercontent.com/vseloved/prj-nlp-2019/master/tasks/07-language-as-sequence/run-on-test.json --output run-on-test.json

In [2]:
import json

# Load the run-on test set. JSON text is UTF-8, so decode it explicitly
# instead of relying on the platform's default encoding.
with open('run-on-test.json', encoding='utf-8') as f:
    run_on_js = json.load(f)

In [3]:
import random

def rand_run_on_num():
    """Randomly pick 0, 1 or 2 from a single ``random.random()`` draw.

    The draw is bucketed as [0, 0.25) -> 0, [0.25, 0.95) -> 1 and
    [0.95, 1) -> 2. The corpus-building cell uses the result as its
    run-on counter.
    """
    draw = random.random()
    if draw >= 0.95:
        return 2
    return 0 if draw < 0.25 else 1

In [4]:
from nltk.corpus import brown
from nltk.corpus import reuters
from nltk.corpus import gutenberg
#from nltk.corpus import webtext
from nltk.corpus import abc
from nltk.corpus import treebank
#from nltk.corpus import dependency_treebank
#from nltk.corpus import conll2000


# One entry per Reuters file id: the sentences of that file as returned by
# `reuters.sents`.
source = [reuters.sents(fileid) for fileid in reuters.fileids()]
    
#for fileid in webtext.fileids():
#    source.append(webtext.sents(fileid))
    
#for fileid in gutenberg.fileids():
#    source.append(gutenberg.sents(fileid))    
    
#for fileid in abc.fileids():
#    source.append(abc.sents(fileid))       
    
#for fileid in brown.fileids():
#    source.append(brown.sents(fileid)) 
    
#for fileid in dependency_treebank.fileids():
#    source.append(dependency_treebank.sents(fileid)) 

#for fileid in treebank.fileids():
#    source.append(treebank.sents(fileid)) 

#for fileid in conll2000.fileids():
#    source.append(conll2000.sents(fileid))     


In [5]:
len(source)

10788

In [6]:
from collections import defaultdict
import nltk

In [7]:
import spacy

# Load the `en_core_web_lg` spaCy pipeline with the "ner" and "parser"
# components disabled; `parse` below reads only text, POS, tag and lemma.
nlp = spacy.load('en_core_web_lg', disable=["ner", 'parser'])

In [8]:
from collections import namedtuple

# Lightweight token record. `point_after` carries the per-token boolean label
# that `convert_to_tokens` copies over from the input (word, label) pairs.
Token = namedtuple('Token', ['text', 'point_after', 'pos', 'tag', 'lemma'])

def parse(sent):
    """Run the ``nlp`` pipeline over a sentence given as ``(word, label)`` pairs.

    The words (element 0 of each pair) are joined with single spaces and the
    resulting string is passed to ``nlp``. Each token that comes back is
    wrapped in a ``Token`` whose ``point_after`` is always False; the labels
    in ``sent`` are not consulted here.
    """
    text = ' '.join(pair[0] for pair in sent)
    parsed = []
    for tok in nlp(text):
        parsed.append(Token(tok.text, False, tok.pos_, tok.tag_, tok.lemma_))
    return parsed

def convert_to_tokens(sent):
    """Align ``(word, label)`` pairs with the tokens ``parse`` produces for them.

    ``parse`` re-tokenizes the sentence, so one input word may come back as
    several tokens (or several words as one token). This walks both sequences
    in parallel and returns a list of ``Token`` records whose ``point_after``
    is taken from the label of the input word they belong to. When a word is
    split into several tokens, only the last piece receives the word's label;
    earlier pieces keep ``point_after=False``.

    Args:
        sent: sequence of ``[word, label]`` pairs.

    Returns:
        list of ``Token`` records, one per token returned by ``parse``
        (or one per merged group when a token spans several words).

    Raises:
        AssertionError: if the two sequences cannot be aligned.
    """
    tagged_sent = []
    tokens = parse(sent)
    compound = None  # token (or word) currently being assembled from pieces
    merged = 0       # number of characters of `compound` matched so far
    i = 0            # index into `sent`
    j = 0            # index into `tokens`

    def alignment_error(orig_token, token):
        # Same diagnostics the original printed before `assert False`, but
        # raised explicitly so the check survives `python -O`.
        print(orig_token, token.text)
        print(list(map(lambda w: w[0], tagged_sent)))
        print(list(map(lambda w: w[0], sent)))
        raise AssertionError(
            'cannot align word %r with token %r' % (orig_token[0], token.text))

    # `i` walks `sent` and `j` walks `tokens`. The original loop test compared
    # each index with the *other* sequence's length, so whenever the two
    # lengths differed the loop kept running past the end and appended
    # duplicates of the last token.
    while i < len(sent) and j < len(tokens):
        orig_token = sent[i]
        token = tokens[j]
        if orig_token[0] == token.text:
            # One-to-one match.
            tagged_sent.append(token._replace(point_after=orig_token[1]))
            j += 1
            i += 1
        elif orig_token[0] in token.text:
            # Several input words were merged into a single token.
            if not compound:
                compound = token
            else:
                compound = compound._replace(point_after=orig_token[1])
            merged += len(orig_token[0])
            if merged > len(compound.text):
                alignment_error(orig_token, token)
            if merged == len(compound.text):
                j += 1
                tagged_sent.append(compound)
                compound = None
                merged = 0
            i += 1
        elif token.text in orig_token[0]:
            # One input word was split into several tokens.
            if not compound:
                compound = orig_token[0]

            merged += len(token.text)
            if merged > len(compound):
                alignment_error(orig_token, token)
            if merged == len(compound):
                # Last piece: it inherits the word's label.
                i += 1
                tagged_sent.append(token._replace(point_after=orig_token[1]))
                compound = None
                merged = 0
            else:
                tagged_sent.append(token)
            j += 1
        else:
            alignment_error(orig_token, token)

    # Every branch above finishes a word and its token(s) in the same step, so
    # a correct alignment exhausts both sequences together.
    if i < len(sent) or j < len(tokens):
        alignment_error(sent[min(i, len(sent) - 1)],
                        tokens[min(j, len(tokens) - 1)])

    return tagged_sent

In [None]:
def tag_tokens(tokens):
    """Wrap every token in a fresh, mutable ``[token, False]`` pair."""
    pairs = []
    for token in tokens:
        pairs.append([token, False])
    return pairs


# Seed the RNG with a fixed value before the corpus is generated, and start
# with an empty corpus.
random.seed(273757)
corpus = []

# Build the synthetic run-on corpus. For every document, sentences are glued
# together in `buff`; at each glue point the closing punctuation of the
# previous sentence is removed, the token before it is labelled True, and the
# first word of the next sentence is usually lower-cased. `run_on_num` is the
# number of joins still to make before `buff` is emitted as one sample.
with open('run-on-corpus.txt', 'w', encoding='utf-8') as f_corpus:
    for sents in source:
        run_on_num = rand_run_on_num()
        buff = []
        prev_offset = 0  # length of `buff` before the current sentence was added
        # The first sentence of every document is skipped, as before.
        for sent in sents[1:]:
            if len(sent) < 5:
                continue

            buff.extend(tag_tokens(sent))

            if prev_offset > 0:
                # Exact membership: the original substring test
                # (`in '.?;!'`) also matched '' and tokens such as '.?'.
                if buff[prev_offset - 1][0] in ('.', '?', ';', '!'):
                    del buff[prev_offset - 1]
                    buff[prev_offset - 2][1] = True

                    # After the deletion, index prev_offset - 1 is the first
                    # word of the sentence just appended. All-uppercase words
                    # are left alone; others are lower-cased 93% of the time.
                    if not buff[prev_offset - 1][0].isupper():
                        if random.random() < 0.93:
                            buff[prev_offset - 1][0] = buff[prev_offset - 1][0].lower()

            prev_offset = len(buff)

            if run_on_num == 0:
                print(buff, file=f_corpus, end="\n")
                corpus.append(convert_to_tokens(buff))
                run_on_num = rand_run_on_num()
                prev_offset = 0
                buff = []
            else:
                # Only count down while a sample is still being assembled.
                # Decrementing right after a flush (as the original did)
                # turned a fresh draw of 0 into -1, which never reaches 0
                # again, so the rest of the document was silently dropped.
                run_on_num -= 1

In [None]:
len(corpus)

In [None]:
bigrams = defaultdict(float)
trigrams = defaultdict(float)

# Count lower-cased bigrams and trigrams over all of `source`, treated as one
# flat word stream (n-grams therefore span sentence and document boundaries).
word_stream = [w.lower() for sents in source for sent in sents for w in sent]
for order, table in ((2, bigrams), (3, trigrams)):
    for gram in nltk.ngrams(word_stream, order):
        table[gram] += 1.0

#for g in nltk.ngrams([w.text.lower() for sent in convert_to_tokens(run_on_js) for w in sent], 2):        
   # bigrams[g] += 1.0
    

#for g in nltk.ngrams([w.text.lower() for sent in convert_to_tokens(run_on_js) for w in sent], 3):
#    trigrams[g] += 1.0      

In [None]:
def update_freq(ngrams):
    """Normalize ``ngrams`` in place: each value becomes its share of the total.

    Returns None; the mapping passed in is modified.
    """
    total = sum(ngrams.values())
    for key in ngrams:
        ngrams[key] = ngrams[key] / total

In [None]:
# Turn the raw bigram/trigram counts into relative frequencies (in place).
update_freq(bigrams)
update_freq(trigrams)

In [None]:
# Display all bigrams ordered by descending value.
# NOTE(review): this renders the whole table; consider slicing before sharing.
sorted(bigrams.items(), key=lambda kv: kv[1], reverse = True)

In [None]:
# Shuffle in place three times (kept as three passes so the RNG consumption
# and the final order are unchanged), then split 70% / 30%.
for _shuffle_pass in range(3):
    random.shuffle(corpus)

train_index = int(0.7 * len(corpus))

train_data, test_data = corpus[:train_index], corpus[train_index:]

In [None]:
import math

def baseline(data):
    """Rule-based sentence-boundary baseline.

    For each word, a point is predicted after it when all of these hold:
      * more than 3 positions have passed since the last predicted point
        (or since the sentence start), and the word is not the last one;
      * either the next word starts with an uppercase letter, or the bigram
        ``(word, '.')`` is sufficiently more frequent in ``bigrams`` than the
        bigram ``(word, next_word)``.

    Args:
        data: iterable of sentences; each word is indexable with the text at
            position 0 (both ``[text, label]`` lists and ``Token`` records
            satisfy this).

    Returns:
        list of sentences, each a list of ``[text, predicted_point]`` pairs.
    """
    result_data = []
    for sent in data:
        result_sent = []
        last_point = 0
        for i, word in enumerate(sent):
            text = word[0]
            predicted = False
            if (i - last_point) > 3 and i < (len(sent) - 1):
                next_text = sent[i + 1][0]
                if next_text[0:1].isupper():
                    predicted = True
                else:
                    pbigram = (text.lower(), '.')
                    bigram = (text.lower(), next_text.lower())
                    # `.get` avoids inserting a zero entry into the
                    # defaultdict for every bigram that is merely probed.
                    predicted = (
                        math.log(bigrams.get(pbigram, 0.0) + 0.000000000001)
                        > math.log(bigrams.get(bigram, 0.0) + 0.001)
                    )
            if predicted:
                # Record the point for *both* rules. The original skipped this
                # in the uppercase branch, so a run of capitalised words got a
                # point after each of them despite the minimum-gap check.
                last_point = i
            result_sent.append([text, predicted])

        result_data.append(result_sent)

    return result_data

In [None]:
# Baseline predictions for the held-out split.
result = baseline(test_data)

In [None]:
def labels_vec(data):
    """Flatten the labels (element 1 of every word) of all sentences into one list."""
    labels = []
    for sent in data:
        for word in sent:
            labels.append(word[1])
    return labels

In [None]:
from sklearn.metrics import classification_report


# Per-token report: gold labels of the held-out split vs. baseline predictions.
print(classification_report(labels_vec(test_data), labels_vec(result)))

In [None]:
# Same report for the baseline on the loaded `run_on_js` data, passed in
# un-tokenized (the baseline only indexes word[0] / word[1]).
print(classification_report(labels_vec(run_on_js), labels_vec(baseline(run_on_js))))

In [None]:
import sys

def token_features(title, token, fdic):
    """Add the per-token features of ``token`` to ``fdic`` under a key prefix.

    Writes three keys: ``title + 'is_title'``, ``title + 'tag'`` and
    ``title + 'word'``. When ``token`` is None (no token at that position),
    the values are False, '' and '' respectively.

    Args:
        title: prefix for the feature names (e.g. 'curr', 'prev1').
        token: a ``Token`` record, an object exposing ``text`` and ``tag_``,
            or None.
        fdic: feature dict to update in place.
    """
    if token is None:
        fdic[title + 'is_title'] = False
        fdic[title + 'tag'] = ''
        fdic[title + 'word'] = ''
        return

    # The notebook's `Token` namedtuple names its field `tag`; the original
    # code read `token.tag_`, which raises AttributeError on those records.
    # Prefer `tag_` when it exists so objects that expose the string tag
    # under that name keep working.
    tag = getattr(token, 'tag_', None)
    if tag is None:
        tag = token.tag

    fdic[title + 'is_title'] = token.text.istitle()
    fdic[title + 'tag'] = tag
    fdic[title + 'word'] = token.text.lower()


def extract_sent_features(sent):
    """Build one feature dict per token of ``sent``.

    Each dict contains:
      * ``token_features`` of the current token, the three tokens before it
        and the three tokens after it (None is passed for positions outside
        the sentence);
      * ``passed`` = len(sent) / (i + 1) and ``left`` = len(sent) / (len(sent) - i + 1);
      * smoothed log frequencies, looked up in ``bigrams`` / ``trigrams``, of
        n-grams around the token, with and without a '.' inserted. An n-gram
        that would reach past the sentence end counts as frequency 0.

    Fixes relative to the previous version:
      * ``next2`` / ``next3`` were computed from the ``next1`` token;
      * the look-ahead window was refilled with ``sent[i + 1]`` instead of
        ``sent[i + 4]``, so even ``next1`` pointed at the wrong token from the
        fourth position on;
      * sentences shorter than four tokens raised IndexError;
      * lookups no longer insert zero entries into the n-gram tables.

    Args:
        sent: sequence of tokens exposing ``.text``.

    Returns:
        list of dicts, one per token, in sentence order.
    """
    smoothing = sys.float_info.epsilon * 1000000
    size = len(sent)
    words = [token.text.lower() for token in sent]

    def token_at(k):
        # Token at position k, or None when k falls outside the sentence.
        return sent[k] if 0 <= k < size else None

    def log_freq(table, key):
        # Smoothed log frequency; `key is None` means "n-gram not available".
        freq = 0.0 if key is None else table.get(key, 0.0)
        return math.log(freq + smoothing)

    features = []
    for i, token in enumerate(sent):
        fdic = {}
        token_features('curr', token, fdic)
        token_features('prev1', token_at(i - 1), fdic)
        token_features('prev2', token_at(i - 2), fdic)
        token_features('prev3', token_at(i - 3), fdic)
        token_features('next1', token_at(i + 1), fdic)
        token_features('next2', token_at(i + 2), fdic)
        token_features('next3', token_at(i + 3), fdic)

        fdic['passed'] = size / (i + 1)
        fdic['left'] = size / (size - i + 1)

        word = words[i]
        has_next = i + 1 < size
        has_next2 = i + 2 < size
        nxt = words[i + 1] if has_next else None
        nxt2 = words[i + 2] if has_next2 else None

        fdic['bigram'] = log_freq(bigrams, (word, nxt) if has_next else None)
        fdic['pbigram'] = log_freq(bigrams, (word, '.') if has_next else None)
        fdic['pbigram2'] = log_freq(bigrams, ('.', nxt) if has_next else None)
        fdic['trigram'] = log_freq(trigrams, (word, nxt, nxt2) if has_next2 else None)
        fdic['ptrigram'] = log_freq(trigrams, (word, '.', nxt) if has_next else None)
        fdic['ptrigram2'] = log_freq(trigrams, ('.', nxt, nxt2) if has_next2 else None)
        fdic['ptrigram3'] = log_freq(trigrams, (words[i - 1], word, '.') if i > 0 else None)

        features.append(fdic)

    return features

def extract_features(sents):
    """Concatenate the per-token feature dicts of every sentence into one flat list."""
    return [fdic for sent in sents for fdic in extract_sent_features(sent)]
    

In [None]:
# Feature dicts for every token of the training split (flat list).
features = extract_features(train_data)

In [None]:
from sklearn.feature_extraction import DictVectorizer

# Learn the feature-name -> column mapping from the training feature dicts
# and vectorize them in the same step.
vectorizer = DictVectorizer()
feature_vecs = vectorizer.fit_transform(features)

In [None]:
from sklearn.linear_model import LogisticRegression 

#logreg = LogisticRegression(random_state=26,  solver='lbfgs', multi_class="multinomial", max_iter=2000)
# Logistic-regression classifier with a fixed random_state, the lbfgs solver
# and up to 3000 iterations.
logreg = LogisticRegression(random_state=23466, solver='lbfgs', max_iter=3000)

                               

# Fit on the vectorized training features; the targets are the flattened
# per-token labels of the same split.
logreg.fit(feature_vecs, labels_vec(train_data))

In [None]:
def predict(data):
    """Featurize ``data`` and return ``logreg``'s prediction for every token.

    The sentences are turned into feature dicts with ``extract_features``,
    vectorized with the already-fitted ``vectorizer`` and passed to
    ``logreg.predict``.
    """
    feature_dicts = extract_features(data)
    matrix = vectorizer.transform(feature_dicts)
    return logreg.predict(matrix)

In [None]:
# Classifier predictions for the held-out split.
test_predicted = predict(test_data)

In [None]:
# Per-token report: gold labels of the held-out split vs. classifier output.
print(classification_report(labels_vec(test_data), test_predicted))

In [None]:
# Convert every sentence of the loaded test set into `Token` records so it
# can go through the same feature extraction as the generated corpus.
run_on_js_tokenized = [convert_to_tokens(sent) for sent in run_on_js]

In [None]:
# Sanity check: the number of positive labels before and after tokenization
# (the two printed sums can be compared by eye).
print(sum([w[1] for sent in run_on_js for w in sent]))
print(sum([t.point_after for sent in run_on_js_tokenized for t in sent]))

In [None]:
# Spot check: predictions for the first tokenized sentence, and that
# sentence's eighth token from the end.
print(predict([run_on_js_tokenized[0]]))
print(run_on_js_tokenized[0][-8])

In [None]:
run_on_js[0]

In [None]:
# Per-token report for the classifier on the tokenized test set.
print(classification_report(labels_vec(run_on_js_tokenized), predict(run_on_js_tokenized)))

In [None]:
# Pair every gold label with its prediction, token by token.
# NOTE(review): this displays one pair per token of the whole test set.
list(zip([t.point_after for sent in run_on_js_tokenized for t in sent], predict(run_on_js_tokenized)))