In [1]:
# -*- coding: utf-8 -*- 

import pandas as pd
import numpy as np
import json
import io
import random

In [2]:
train_data = []
with io.open('train_data.json', 'r', encoding='utf8') as f:
    for line in f.readlines():
        d = json.loads(line)
        train_data.append(d)

len(train_data)

22676

In [3]:
random.shuffle(train_data)

In [4]:
end_of_sentences = set()
for paragraph in train_data:
    for sentence in paragraph['Sentences']:
        end_of_sentences.add(sentence[-1])

In [5]:
for symbol in end_of_sentences:
    print(symbol)

!
"
…
.
»
?


In [6]:
ends = list(end_of_sentences)

In [7]:
def get_all_stoppers(paragraph):
    return [{'Pos': i, 'Mark': c} for i, c in enumerate(paragraph) if c in end_of_sentences]

In [8]:
def get_true_stoppers(paragraph, sentences):
    pos = 0
    stoppers = []
    
    for s in sentences:
        pos += len(s) - 1
        stoppers.append({'Pos' : pos, 'Mark' : s[-1]})
        pos += 1
        while pos < len(paragraph) and paragraph[pos].isspace():
            pos += 1
        
    return stoppers

In [9]:
def sentence_length_backward(paragraph, i):
    if i < 0:
        return 0
    
    n_words = 0
    flag = paragraph[i].isspace()
    
    while i >= 0 and (paragraph[i].isalnum() or paragraph[i].isspace()):
        if paragraph[i].isspace() and not flag:
            n_words += 1
        flag = paragraph[i].isspace()
        i -= 1
    
    if not flag:
        n_words += 1
        
    return n_words

In [10]:
def sentence_length_forward(paragraph, i):
    if i >= len(paragraph):
        return 0
    
    n_words = 0
    flag = paragraph[i].isspace()
    
    while i < len(paragraph) and (paragraph[i].isalnum() or paragraph[i].isspace()):
        if paragraph[i].isspace() and not flag:
            n_words += 1
        flag = paragraph[i].isspace()
        i += 1
    
    if not flag:
        n_words += 1
    
    return n_words

In [11]:
def word_length_backward(paragraph, i):
    if i < 0:
        return 0
    
    wordlen = 0
    while i >= 0 and paragraph[i].isalpha():
        wordlen += 1
        i -= 1

In [12]:
def features_endpoint(paragraph, mark, pos):
    features = [int(mark == e) for e in ends]
    features.append(pos)
    
    # prev symbols info
    idx_prev = pos - 1
    
    f = idx_prev >= 0 and paragraph[idx_prev].isspace()
    features.append(int(f))
    
    while idx_prev >= 0 and paragraph[idx_prev].isspace():
        idx_prev -= 1
    
    if idx_prev >= 0:
        f = []
        
        f.append(0)
        f.append(u'a' <= paragraph[idx_prev] <= u'z' or u'A' <= paragraph[idx_prev] <= u'Z')
        f.append(paragraph[idx_prev].isupper())
        f.append(paragraph[idx_prev].islower())
        f.append(paragraph[idx_prev].isdigit())
        f.append(u':' == paragraph[idx_prev])
        f.append(u'»' == paragraph[idx_prev])
        f.append(u'"' == paragraph[idx_prev])
        f.append(paragraph[idx_prev] in [u'—', u'-'])
        f.append(paragraph[idx_prev] in [u')', u'}', u']'])
        
        wordlen, i = 0, idx_prev
        while i >= 0 and paragraph[i].isalpha():
            wordlen += 1
            i -= 1
        f.append(wordlen)
        
        wordlen, i = 0, idx_prev
        while i >= 0 and \
            (paragraph[i].isalpha() or paragraph[i].isnumeric() or paragraph[i] in [u'—', u'-']):
            wordlen += 1
            i -= 1
        f.append(wordlen)
        
        if i >= 0:
            f.append(paragraph[i] in [u'(', u'{', u'['])
            f.append(u'«' == paragraph[i])
        else:
            f.extend([0, 0])
            
        f.append(paragraph[idx_prev] in end_of_sentences)
        f.append(not reduce(lambda x, y: x or y, f))
        
        features.extend(f)
    else:
        features.extend([1] + [0] * 15)
    
    # next symbols info
    idx_next = pos + 1
    
    f = idx_next < len(paragraph) and paragraph[idx_next].isspace()
    features.append(int(f))
    
    while idx_next < len(paragraph) and paragraph[idx_next].isspace():
        idx_next += 1
        
    if idx_next < len(paragraph):
        f = []
        
        f.append(0)
        f.append(u'a' <= paragraph[idx_next] <= u'z' or u'A' <= paragraph[idx_next] <= u'Z')
        f.append(paragraph[idx_next].isupper())
        f.append(paragraph[idx_next].islower())
        f.append(paragraph[idx_next].isdigit())
        f.append(u':' == paragraph[idx_next])
        f.append(u'«' == paragraph[idx_next])
        f.append(u'"' == paragraph[idx_next])
        f.append(paragraph[idx_next] in [u'—', u'-'])
        f.append(paragraph[idx_next] in [u'(', u'{', u'['])
        
        wordlen, i = 0, idx_next
        while i < len(paragraph) and paragraph[i].isalpha():
            wordlen += 1
            i += 1
        f.append(wordlen)
        
        wordlen, i = 0, idx_next
        while i < len(paragraph) and \
            (paragraph[i].isalpha() or paragraph[i].isnumeric() or paragraph[i] in [u'—', u'-']):
            wordlen += 1
            i += 1
        f.append(wordlen)
        
        if i < len(paragraph):
            f.append(paragraph[i] in [u')', u'}', u']'])
            f.append(u'»' == paragraph[i])
        else:
            f.extend([0, 0])
            
        f.append(paragraph[idx_next] in end_of_sentences)
        f.append(not reduce(lambda x, y: x or y, f))
        
        features.extend(f)
    else:
        features.extend([1] + [0] * 15)
        
    if pos - 5 >= 0:
        f = []
        f.append(paragraph[pos-4:pos+1] in [u'т.д.', u'т.п.', u'т.к.', u'н.э.'] and paragraph[pos-5].isspace())
        f.append(paragraph[pos-3:pos+1] in [u'гг.', u'вв.', u'ул.'] and paragraph[pos-5].isspace())
        f.append(paragraph[pos-2:pos+1] in [u'г.', u'в.'] and paragraph[pos-5].isspace())
        features.append(reduce(lambda x, y: x or y, f))
    else:
        features.append(0)
    
    return features

In [13]:
X_train, Y_train = [], []

for e in train_data[:20000]:
    stoppers_nice = get_true_stoppers(e['Paragraph'], e['Sentences'])
    stoppers_all  = get_all_stoppers(e['Paragraph'])
    
    for s in stoppers_all:
        X_train.append(features_endpoint(e['Paragraph'], s['Mark'], s['Pos']))
        Y_train.append(int(s in stoppers_nice))
        
X_train = np.asarray(X_train)
Y_train = np.asarray(Y_train)

In [14]:
X_test, Y_test = [], []

for e in train_data[20000:]:
    stoppers_nice = get_true_stoppers(e['Paragraph'], e['Sentences'])
    stoppers_all  = get_all_stoppers(e['Paragraph'])
    
    for s in stoppers_all:
        X_test.append(features_endpoint(e['Paragraph'], s['Mark'], s['Pos']))
        Y_test.append(int(s in stoppers_nice))
        
X_test = np.asarray(X_test)
Y_test = np.asarray(Y_test)

In [15]:
from collections import OrderedDict
from operator import itemgetter
from sklearn.ensemble import RandomForestClassifier

cl = RandomForestClassifier(n_estimators=100, max_depth=13)
cl.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=13, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [16]:
Y_pred = cl.predict(X_test)

In [17]:
from sklearn.metrics import roc_auc_score

roc_auc_score(Y_test, Y_pred)

0.98236317674730578