In [1]:
import pandas as pd
import numpy as np
import json
import io
import pickle

## Загрузка данных

In [5]:
def load(file_name):
    """Load and return the object pickled in ``file_name``.

    The file is opened in binary mode: pickle streams are bytes, and
    ``pickle.load`` on a text-mode handle fails on Python 3.

    SECURITY: unpickling can execute arbitrary code. Only load files that
    were produced by this project (see ``dump``).
    """
    with open(file_name, 'rb') as f:
        loaded = pickle.load(f)
    return loaded

def dump(obj, file_name):
    """Pickle ``obj`` into ``file_name``, overwriting any existing file."""
    with open(file_name, 'wb') as sink:
        sink.write(pickle.dumps(obj))

In [6]:
# Training set: the file holds one JSON document per line.
train_data = []
with io.open('train_data.json', 'r', encoding='utf8') as train_file:
    train_data.extend(json.loads(raw_line) for raw_line in train_file)

In [7]:
# Test set: the file holds one JSON document per line.
test_data = []
with io.open('test_data.json', 'r', encoding='utf8') as test_file:
    test_data.extend(json.loads(raw_line) for raw_line in test_file)

In [8]:
# Characters considered as candidate sentence-end marks when scanning the
# training paragraphs (both the true ends and the merely potential ones).
symbols = set([
    u'!',
    u'"',
    u'.',
    u'?',
    u'\xbb',
    u'\u2026',
])

def find_occurences(string, symbols):
    """Return the ascending positions in ``string`` whose character is in ``symbols``."""
    positions = []
    for position, character in enumerate(string):
        if character in symbols:
            positions.append(position)
    return positions

# For every training paragraph collect a triple:
#   (paragraph text, positions of the true sentence ends, positions of all candidate marks).
all_marks = []
for item in train_data:
    sentences = item[u'Sentences']
    paragraph = item[u'Paragraph']
    number_of_sentences = len(sentences)
    lengths = [len(sentence) for sentence in sentences]
    # Last character of sentence i sits at (total length of sentences 0..i) + i - 1,
    # i.e. this assumes exactly one separator character between consecutive sentences.
    end_positions = np.cumsum(lengths) + np.arange(number_of_sentences) - 1
    # Not used afterwards; the lookup raises IndexError if a computed end
    # position falls outside the paragraph.
    end_symbols = [paragraph[position] for position in end_positions]
    potential_end_positions = find_occurences(paragraph, symbols)
    all_marks.append((paragraph, end_positions, potential_end_positions))
    
    

In [9]:
def train_marks_to_test_format(all_marks):
    """Re-shape training marks into the Marks/Paragraph layout and build labels.

    Parameters
    ----------
    all_marks : iterable of sequences
        Each item holds, by position: ``[0]`` the paragraph text, ``[1]`` the
        positions of the true sentence ends, ``[2]`` the positions of all
        candidate marks.

    Returns
    -------
    (all_paragraphs, bool_by_index)
        ``all_paragraphs`` is a list of ``{u'Marks': [...], u'Paragraph': text}``
        dicts where every mark is ``{u'Index', u'Mark', u'Pos'}``.
        ``bool_by_index`` maps each mark's ``Index`` to whether that mark is a
        true sentence end. ``Index`` is a running counter over the whole
        corpus and starts at 1.
    """
    index = 0
    bool_by_index = {}
    all_paragraphs = []

    for item in all_marks:
        paragraph = item[0]
        # A set gives O(1) membership tests instead of scanning the whole
        # collection of true ends once per candidate mark.
        true_end_positions = set(item[1])
        marks = []

        for pos in item[2]:
            index += 1
            bool_by_index[index] = pos in true_end_positions
            marks.append({u'Index': index, u'Mark': paragraph[pos], u'Pos': pos})

        all_paragraphs.append({u'Marks': marks, u'Paragraph': paragraph})

    return all_paragraphs, bool_by_index
    

In [10]:
# Re-shape the training marks into the Marks/Paragraph layout consumed by
# test_format_to_df, and keep the Index -> "is a true sentence end" lookup.
train_marks_in_test_format, bool_by_index_train = train_marks_to_test_format(all_marks)

In [11]:
def test_format_to_df(data_list):
    list_of_dictionaries = []
    for i, item in enumerate(data_list):
        for mark in item[u'Marks']:
            d = mark
            d[u'Paragraph_id'] = i
            d['Paragraph'] = item[u'Paragraph']
            list_of_dictionaries.append(d)
            
    paragraphs_df = pd.DataFrame(list_of_dictionaries)
    
    return paragraphs_df


In [12]:
train_df = test_format_to_df(train_marks_in_test_format)
# Look the label up through each row's own 'Index' value. Assigning the dict
# directly is pandas-version dependent: its keys start at 1 while the frame's
# index starts at 0, so the labels can end up shifted by one row or replaced
# by the dict keys.
train_df['is_end'] = train_df['Index'].map(bool_by_index_train)
test_df = test_format_to_df(test_data)

## Фичи

In [14]:
def add_fives(df):
    """Add the context windows around every mark, in place.

    ``last_five`` is up to five characters immediately before ``Pos`` and
    ``next_five`` is up to five characters immediately after it, both taken
    from ``Paragraph``. Empty windows are stored as NaN. Returns ``df``.
    """
    before, after = [], []
    for pos, paragraph in zip(df['Pos'], df['Paragraph']):
        before.append(paragraph[max(0, pos - 5):pos])
        after.append(paragraph[pos + 1:min(pos + 6, len(paragraph))])

    df['last_five'] = before
    df['next_five'] = after

    # A mark at the very start / very end of a paragraph has no context.
    for column in ('next_five', 'last_five'):
        df.loc[df[column] == '', column] = np.nan
    return df

In [15]:
# Attach the (up to) five characters before and after every mark.
train_df = add_fives(train_df)
test_df = add_fives(test_df)

In [17]:
def scheme(series):
    """Collapse every string of ``series`` into a character-class "scheme".

    Each character belonging to one of the classes below is replaced by a
    one-letter placeholder (e.g. capital Cyrillic, lowercase Latin, digit);
    characters outside every class are kept as they are. NaN entries become
    empty strings.

    Every pattern is a regular-expression character class, so ``regex=True``
    is passed explicitly: pandas 2.0 changed the default of
    ``Series.str.replace`` to literal matching, which would silently turn all
    of these substitutions into no-ops. The ``regex`` keyword needs
    pandas >= 0.23.

    Parameters
    ----------
    series : pandas.Series of str (may contain NaN)

    Returns
    -------
    pandas.Series of str with the same index.
    """
    capital_cyrillic = u'[ЙЦУКЕНГШЩЗХЪЁФЫВАПРОЛДЖЭЯЧСМИТЬБЮ]'
    cyrillic = u'[йцукенгшщзхъёфывапролджэячсмитьбюўѢ]'
    chinese = u'[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]'
    japanese = u'[\u3000-\u303f\u3040-\u309f\u30a0-\u30ff\uff00-\uff9f\u4e00-\u9faf\u3400-\u4dbf]'
    capital_greek = u'[ΑΓΒΕΔΗΖΙΘΛΚΝΜΟΞΡΠΣΥΤΧΦΩΨ]'
    greek = u'[αγβεδηζιθλκνμοξρπσςυὐύτχφωψὰѲῶᾠέάἀὀ]'
    arabic = u'[\u0627-\u064a]'
    hebrew = u'[־-״]'
    korean = u'[천하\u3131-\ucb4c]'
    almost_latin = u'[ßáàãâǎåäçčéèëêíìἰïĭόіїɪǐīîñóòõōôöùūøûúýüÿńšəðІʀἙἘɣ]'
    capital_almost_latin = u'[ÁÀÃÂÅÄÇÉÈËÊÍÌÏÎÑÓÒÕÔÖÙØÛÚÝÜ]'
    # '+' needs no escaping inside a character class; the previous '\+' was an
    # invalid escape sequence in a non-raw string literal.
    math = u'[±×·¬+]'
    digits = u'[³²¹¾½¼∞]'
    strange = u'[গअোপदԿावাխ]'

    # Applied strictly in this order. The plain Latin passes must run before
    # any placeholder that is itself a Latin letter ('c', 'j', 'a', 'G', ...)
    # is introduced, otherwise those placeholders would be rewritten again.
    substitutions = [
        (capital_cyrillic, u'К'),
        (cyrillic, u'к'),
        ('[A-Z]', 'L'),
        ('[a-z]', 'l'),
        (capital_almost_latin, 'L'),
        (almost_latin, 'l'),
        (chinese, 'c'),
        (japanese, 'j'),
        (arabic, 'a'),
        (capital_greek, u'G'),
        (greek, u'g'),
        (korean, u'k'),
        (hebrew, u'h'),
        (math, 'm'),
        (strange, 's'),
        ('[0-9]', '0'),
        (digits, '0'),
    ]

    schemed_series = series.replace(np.nan, '')
    for pattern, placeholder in substitutions:
        schemed_series = schemed_series.str.replace(pattern, placeholder, regex=True)

    return schemed_series

def df_scheme(df):
    """Add ``next_five_scheme`` and ``last_five_scheme`` columns to ``df`` in place.

    Each new column is ``scheme`` applied to the matching context column.
    Returns ``df``.
    """
    for context_column in ('next_five', 'last_five'):
        df[context_column + '_scheme'] = scheme(df[context_column])
    return df

In [18]:
# Turn both context windows into character-class schemes.
train_df = df_scheme(train_df)
test_df = df_scheme(test_df)

In [20]:
def add_schemes(df):
    """Add shorter sub-windows of the five-character schemes to ``df`` in place.

    ``next_*`` columns are prefixes of ``next_five_scheme``; ``last_*``
    columns are fixed-offset slices of ``last_five_scheme`` ending at offset 5.

    NOTE(review): the ``last_*`` offsets are absolute, so when
    ``last_five_scheme`` is shorter than five characters the slice can come
    out empty (e.g. ``slice(4, 5)`` of a two-character string is ``''``)
    rather than the characters nearest to the mark. Left unchanged here.

    Returns ``df``.
    """
    windows = [
        # (new column, source column, start, stop)
        ('next_four_scheme', 'next_five_scheme', 0, 4),
        ('next_three_scheme', 'next_five_scheme', 0, 3),
        ('next_two_scheme', 'next_five_scheme', 0, 2),
        ('next_one_scheme', 'next_five_scheme', 0, 1),
        ('last_one_scheme', 'last_five_scheme', 4, 5),
        ('last_two_scheme', 'last_five_scheme', 3, 5),
        ('last_three_scheme', 'last_five_scheme', 2, 5),
        ('last_four_scheme', 'last_five_scheme', 1, 5),
    ]
    for target, source, start, stop in windows:
        df[target] = df[source].str.slice(start, stop)

    return df

In [21]:
# Derive the shorter scheme windows from the five-character schemes.
test_df = add_schemes(test_df)
train_df = add_schemes(train_df) 

In [24]:
# Scheme columns to one-hot encode. Each name appears exactly once: a repeated
# entry makes pd.get_dummies encode that column again and emit duplicate
# dummy columns. The order matches the first occurrences in the previous list.
schemes = [
    'next_five_scheme', 'next_four_scheme', 'next_three_scheme', 'next_two_scheme', 'next_one_scheme',
    'last_one_scheme', 'last_two_scheme', 'last_five_scheme', 'last_three_scheme', 'last_four_scheme',
]

In [None]:
# One-hot encode every scheme column. Train and test are encoded separately,
# so the two frames can end up with different sets of dummy columns.
test_df_ohe = pd.get_dummies(test_df, columns=schemes)
train_df_ohe = pd.get_dummies(train_df, columns=schemes)

In [None]:
def delete_double_columns(df):
    """Return ``df`` restricted to the first occurrence of every column label."""
    first_occurrence = ~df.columns.duplicated()
    return df.loc[:, first_occurrence]

In [None]:
# Keep only the first copy of any dummy column that was generated more than once.
test_df_ohe = delete_double_columns(test_df_ohe)
train_df_ohe = delete_double_columns(train_df_ohe)

In [None]:
# Pick the one-hot columns by their '<scheme>_' name prefix instead of by
# position: a positional offset silently selects the wrong columns as soon as
# a frame gains or loses a leading column.
scheme_prefixes = tuple(scheme_name + '_' for scheme_name in schemes)
test_ohe_features = test_df_ohe.columns[
    [column.startswith(scheme_prefixes) for column in test_df_ohe.columns]]
train_ohe_features = train_df_ohe.columns[
    [column.startswith(scheme_prefixes) for column in train_df_ohe.columns]]

In [None]:
def extract_frequent_features(df_ohe, ohe_features):
    """Return the unique labels among ``ohe_features`` whose column sum in ``df_ohe`` exceeds 1."""
    column_totals = df_ohe[ohe_features].sum()
    is_frequent = column_totals > 1
    return column_totals.index[is_frequent.values].unique()

In [None]:
freq_test_ohe_features = extract_frequent_features(test_df_ohe, test_ohe_features)
freq_train_ohe_features = extract_frequent_features(train_df_ohe, train_ohe_features)
# Keep only features that are frequent in both sets. sorted() makes the
# feature order reproducible; iterating a set of strings can give a different
# order in every Python 3 process.
common_freq_features = sorted(set(freq_train_ohe_features).intersection(freq_test_ohe_features))


In [30]:
# Hand-crafted (non-scheme) feature columns: one-hot mark type, position of
# the mark within its paragraph, and distances to the neighbouring marks.
plain_features = [
    u'Mark_!', u'Mark_"', u'Mark_.', u'Mark_?', u'Mark_»', u'Mark_…',
    u'is_last_mark', u'is_first_mark', u'only_mark',
    u'distance_to_next', u'distance_to_last',
    u'beginning',
]

In [18]:
def add_ohe_mark_features(df):
    """Return a new frame with ``Mark`` one-hot encoded as ``Mark_<char>`` columns.

    The original ``Mark`` column is kept, re-attached as the last column.
    ``df`` itself is not modified.
    """
    return pd.get_dummies(df, columns=['Mark']).assign(Mark=df['Mark'])

def add_paragraph_boundaries(df):
    """Flag the first, last and only mark of every paragraph.

    Adds three boolean columns to ``df`` in place and returns ``df``:

    * ``is_last_mark``  - the next row belongs to a different paragraph
      (always True for the final row);
    * ``is_first_mark`` - the previous row belongs to a different paragraph
      (always True for the first row);
    * ``only_mark``     - both of the above.

    Rows are assumed to be grouped by ``Paragraph_id`` in their current order,
    since neighbours are found by shifting the column by one row.

    The shifted ids are kept in local variables, so no temporary
    ``next_id`` / ``last_id`` columns are written into ``df``.
    """
    paragraph_id = df['Paragraph_id']
    # shift() leaves NaN at the open end; NaN != id is True, which marks the
    # first / last row of the frame as a boundary.
    next_id = paragraph_id.shift(-1)
    last_id = paragraph_id.shift(+1)

    df['is_last_mark'] = (paragraph_id != next_id)
    df['is_first_mark'] = (paragraph_id != last_id)
    df['only_mark'] = df['is_first_mark'] & df['is_last_mark']

    return df

def add_distances(df):
    """Add the distance (in characters) to the neighbouring marks, in place.

    ``distance_to_next`` is the next row's ``Pos`` minus this row's ``Pos``;
    ``distance_to_last`` is this row's ``Pos`` minus the previous row's.
    Where ``is_last_mark`` / ``is_first_mark`` is set, the corresponding
    distance is ``np.inf``. Returns ``df``.
    """
    position = df['Pos']

    gap_after = position.shift(-1) - position
    df['distance_to_next'] = gap_after.mask(df['is_last_mark'] == 1, np.inf)

    gap_before = position - position.shift(+1)
    df['distance_to_last'] = gap_before.mask(df['is_first_mark'] == 1, np.inf)

    return df

def next_last_distance_max(df, train_df):
    """Return the largest finite distances found across ``train_df`` and ``df``.

    Returns ``(max_distance_to_next, max_distance_to_last)``; ``np.inf``
    entries are ignored when taking each frame's maximum.
    """
    def finite_max(frame, column):
        # Turn +inf into NaN so that Series.max() skips it.
        return frame[column].replace(np.inf, np.nan).max()

    max_distance_to_next = max(finite_max(train_df, 'distance_to_next'),
                               finite_max(df, 'distance_to_next'))
    max_distance_to_last = max(finite_max(train_df, 'distance_to_last'),
                               finite_max(df, 'distance_to_last'))

    return max_distance_to_next, max_distance_to_last

def replace_inf_in_column(df, column, value):
    """Replace every ``np.inf`` in ``df[column]`` with ``value``, in place; return ``df``."""
    without_inf = df[column].replace(to_replace=np.inf, value=value)
    df[column] = without_inf
    return df

def add_elementary_features(df):
    """Add the per-mark features and return the resulting frame.

    Sets ``beginning`` (``Pos`` <= 1) on ``df``, then applies, in order,
    ``add_paragraph_boundaries``, ``add_distances`` and
    ``add_ohe_mark_features``.
    """
    df['beginning'] = df['Pos'].le(1)
    return add_ohe_mark_features(add_distances(add_paragraph_boundaries(df)))

def add_plain_features(train_df, test_df):
    """Add the elementary features to both frames and cap infinite distances.

    The ``np.inf`` distances are replaced by the largest finite distance seen
    in either frame, so both frames share the same cap.
    Returns ``(train_df, test_df)``.
    """
    train_df, test_df = add_elementary_features(train_df), add_elementary_features(test_df)

    next_max, last_max = next_last_distance_max(test_df, train_df)

    caps = (('distance_to_next', next_max), ('distance_to_last', last_max))
    for frame in (train_df, test_df):
        for column, cap in caps:
            # replace_inf_in_column writes into ``frame`` itself.
            replace_inf_in_column(frame, column, cap)

    return train_df, test_df

In [None]:
# Add the hand-crafted features (boundaries, distances, one-hot mark type)
# to the one-hot encoded frames used by the classifier.
train_df_ohe, test_df_ohe = add_plain_features(train_df_ohe, test_df_ohe)

In [37]:
# Dummy columns generated for an empty scheme string (no context available);
# they are excluded from the feature set.
nan_features = {'next_five_scheme_', 'next_four_scheme_', 'next_three_scheme_', 
                'next_two_scheme_', 'next_one_scheme_', 'last_one_scheme_',
                'last_two_scheme_', 'last_three_scheme_', 'last_four_scheme_',
                'last_five_scheme_'}
# sorted() fixes the column order; the iteration order of a set of strings is
# not stable between Python 3 processes.
features = sorted(set(plain_features + common_freq_features) - nan_features)

К сожалению, при обучении я не зафиксировала random_state, но сохранила в pickle порядок признаков и получившийся случайный лес.
Поэтому использовавшийся код обучения переведён в markdown и заменён загрузкой сохранённой модели. Надеюсь, это не страшно.

In [36]:
# Reference training cell: fits a fresh random forest on the pickled feature list.
common_freq_features = load('common_freq_features.pkl')
# NOTE(review): list(set(...)) does not guarantee a stable column order between
# Python 3 processes; the order used here must match the one any pickled model
# was fitted with - confirm before changing it.
features = list(set(plain_features + common_freq_features))

from sklearn.ensemble import RandomForestClassifier

# A fixed seed makes the fitted forest reproducible for a given feature order.
clf = RandomForestClassifier(random_state=0)

clf.fit(train_df_ohe[features].values, train_df_ohe['is_end'].astype(int).values) 

test_df_ohe['rf_predicted'] = clf.predict(test_df_ohe[features].values)


In [None]:
# Replace clf with the classifier stored in rfclassifier.pkl and predict with it.
# SECURITY: unpickling runs arbitrary code - only load a file you produced yourself.
# NOTE(review): the columns in `features` presumably have to be in the same
# order the pickled model was fitted with - confirm.
clf = load('rfclassifier.pkl')
test_df_ohe['rf_predicted'] = clf.predict(test_df_ohe[features].values)

## Добавляем эвристики

In [2]:
def rf_predict(df):
    """Combine the random-forest prediction with the heuristic overrides.

    Adds a boolean ``predicted`` column to ``df`` in place and returns ``df``:

    1. start from ``rf_predicted``;
    2. force True where ``ok_ending`` is True;
    3. force False where ``problematic_abbreviation`` is True (applied last,
       so it wins when both heuristics fire).

    Raises
    ------
    ValueError
        If ``rf_predicted`` contains missing values (for example after a
        misaligned assignment), instead of silently turning them into a
        prediction.
    """
    if df['rf_predicted'].isnull().any():
        raise ValueError("'rf_predicted' contains missing values")

    # Cast first so that writing True/False below does not mix booleans into
    # an integer column.
    df['predicted'] = df['rf_predicted'].astype(bool)
    # '== True' also treats NaN / non-boolean entries in the flags as "not set".
    df.loc[df['ok_ending'] == True, 'predicted'] = True
    df.loc[df['problematic_abbreviation'] == True, 'predicted'] = False
    return df

In [3]:
def add_problematic_abbreviations(df):
    """Flag marks that follow an abbreviation pattern listed below.

    Adds a boolean ``problematic_abbreviation`` column to ``df`` in place and
    returns ``df``. The patterns are matched against ``last_five`` (text
    before the mark) and, for some of them, ``next_five`` (text after it).

    Missing context (NaN) never matches: ``na=False`` is passed to every
    string test so that each intermediate result is a plain boolean Series
    and NaN cannot leak into the ``|`` / ``&`` combinations.
    """
    last_five = df['last_five']
    next_five = df['next_five']

    def last_matches(pattern):
        # Regex search in the text preceding the mark.
        return last_five.str.contains(pattern, regex=True, na=False)

    def next_matches(pattern):
        # Regex search in the text following the mark.
        return next_five.str.contains(pattern, regex=True, na=False)

    # Backslashes are doubled so the literals contain no invalid escape
    # sequences; the resulting patterns are the same as before.
    te = last_matches(u'т\\. ?[ке]$')

    languages = [u' англ', u' рус', u' рум', u' лат', u' яп', u' кит', u'(англ', u'(рус', u'(рум', u'(лат', u'(яп', u'(кит']
    language = pd.Series(False, index=df.index)
    for lang in languages:
        language = language | last_five.str.endswith(lang, na=False)

    red = last_matches(u'[ (\\.]ред$')
    sm = last_matches(u'[ (]См$')
    napr = last_matches(u'[ (]напр$')
    pic = last_matches(u'[ (]рис$') & next_matches(u'^\\d+')

    cityname = u'^ ?[ЙЦУКЕНГШЩЗХЪЁФЫВАПРОЛДЖЭЯЧСМИТЬБЮ]'
    city = last_five.str.endswith(u'в г', na=False) & next_matches(cityname)

    df['problematic_abbreviation'] = te | language | red | sm | napr | pic | city

    return df
    
def add_ok_endings(df):
    """Flag marks whose surrounding text matches a known sentence-end pattern.

    Adds a boolean ``ok_ending`` column to ``df`` in place and returns ``df``.
    Every pattern requires the following text to start with a space and a
    capital Cyrillic letter, combined with one of these in the text before
    the mark:

    * digits followed by ``г``;
    * ``Т``, one character, an optional space, then digits;
    * a closing ``»``, when the mark itself is a full stop (``Mark_.``).

    Missing context (NaN) never matches (``na=False``), and ``Mark_.`` is
    cast to bool so that ``&`` is a logical rather than a bitwise operation
    when the dummy column is stored as integers.
    """
    last_five = df['last_five']
    new_sentence = df['next_five'].str.contains(
        u'^ [ЙЦУКЕНГШЩЗХЪЁФЫВАПРОЛДЖЭЯЧСМИТЬБЮ]', regex=True, na=False)

    # Backslashes are doubled so the literals contain no invalid escape
    # sequences; the resulting patterns are the same as before.
    recipe = last_five.str.contains(u' ?\\d+г$', regex=True, na=False) & new_sentence
    # NOTE(review): the '.' after 'Т' is unescaped and matches any character;
    # a literal dot may have been intended - left unchanged.
    volume = last_five.str.contains(u' ?Т. ?\\d+$', regex=True, na=False) & new_sentence
    quote = last_five.str.endswith(u'»', na=False) & new_sentence & df['Mark_.'].astype(bool)

    df['ok_ending'] = recipe | volume | quote

    return df

In [16]:
# Flag marks that follow one of the abbreviation patterns.
test_df = add_problematic_abbreviations(test_df)
train_df = add_problematic_abbreviations(train_df)

In [19]:
# Add the hand-crafted features to the plain frames too; add_ok_endings
# reads the 'Mark_.' column created here.
train_df, test_df = add_plain_features(train_df, test_df)

In [20]:
# Flag marks that match one of the "definitely a sentence end" patterns.
test_df = add_ok_endings(test_df)
train_df = add_ok_endings(train_df)

In [23]:
# Take the random-forest predictions from the 'Mark' column of a previously saved submission file.
# NOTE(review): elsewhere in this notebook 'rf_predicted' is only written to test_df_ohe, not to
# test_df, so with this line commented out rf_predict(test_df) would find no 'rf_predicted' column.
test_df['rf_predicted'] = pd.read_csv('submission_rf_freq_schemes.csv')['Mark'] # comment to use computed rf_predicted

In [24]:
# Apply the heuristic overrides on top of the forest's predictions.
test_df = rf_predict(test_df)

# Submission file: mark Index as 'Id', final 0/1 decision as 'Mark'.
submission = pd.DataFrame(
    {'Id': test_df['Index'], 'Mark': test_df['predicted'].astype(int)},
    columns=['Id', 'Mark'],
)
submission.to_csv('solution_Kuznetsova.csv', index=False, header=True)