In [38]:
from catboost import Pool, CatBoostClassifier
from collections import defaultdict
from pymystem3 import Mystem
from os import walk

In [398]:
m = Mystem()
m.analyze('«Роснефть», «Ведомости», ыва')

[{'text': '«'},
 {'analysis': [{'gr': 'S,ед,жен,неод=(вин|им)', 'lex': 'роснефть', 'wt': 1}],
  'text': 'Роснефть'},
 {'text': '», «'},
 {'analysis': [{'gr': 'S,жен,неод=(пр,ед|вин,мн|дат,ед|род,ед|им,мн)',
    'lex': 'ведомость',
    'wt': 1}],
  'text': 'Ведомости'},
 {'text': '», '},
 {'analysis': [{'gr': 'S,муж,неод=(пр,мн|пр,ед|вин,мн|вин,ед|дат,мн|дат,ед|род,мн|род,ед|твор,мн|твор,ед|им,мн|им,ед)',
    'lex': 'ыва',
    'qual': 'bastard',
    'wt': 0.2000164368}],
  'text': 'ыва'},
 {'text': '\n'}]

In [57]:
def readTrain():
    (_, _, filenames) = next(walk('Collection3'))
    lines = []
    for name in filenames:
        if 'ann' in name:
            with open('Collection3/' + name) as f:
                lines += [line for line in f.readlines()]
    return lines

In [58]:
def readRaw():
    (_, _, filenames) = next(walk('Collection3'))
    lines = []
    for name in filenames:
        if 'ann' not in name:
            with open('Collection3/' + name) as f:
                lines += [line for line in f.readlines()]
    return lines

In [247]:
import gensim

w2v_fpath = "all.norm-sz100-w10-cb0-it1-min100.w2v"
w2v = gensim.models.KeyedVectors.load_word2vec_format(w2v_fpath, binary=True, unicode_errors='ignore')

In [549]:
def toW2vTag(tag):
    if tag == 'S':
        return 'NOUN'
    if tag == 'A':
        return 'ADJ'
    return tag

def analyzeWord(word, isUsed):
    features = []
    wordClean = re.sub(r'[^а-яА-Я]', r'', word)
    if len(wordClean) < 3:
        raise Exception()
    raw = m.analyze(wordClean)[0]['analysis'][0]
    tag = raw['gr'][0]
    isNoun = tag == 'S'
    features.append(isNoun)
    vowels = set('аоиеёэыуюя')
    isName = 'имя' in ['gr'] or 'фам' in raw['gr']
    features.append(isName)
    isCapital = np.mean([char.isupper() for char in word])
    vowelsCnt = np.mean([letter in vowels for letter in wordClean.lower()])
    features.append(isCapital)
    isInBrackets = word == wordClean
    features.append(isInBrackets)
    features.append(vowelsCnt)
    w2v_name = 0
    w2v_surname = 0
    w2v_org = 0
    w2v_person = 0
    w2v_pers = 0
    w2v_company = 0
    w2v_ved = 0
    w2v_corp = 0
    w2v_tag = toW2vTag(tag)
    try:
        w2v_word =  raw['lex'].lower()
    except:
        w2v_word = word.lower()
    try: w2v_pers = w2v.similarity('персона', w2v_word)
    except Exception as e:
        pass
    try: w2v_surname = w2v.similarity('политик', w2v_word)
    except Exception as e:
        pass
    try: w2v_name = w2v.similarity('имя', w2v_word)
    except: pass
    try: w2v_org = w2v.similarity('организация', w2v_word)
    except: pass
    try: w2v_person = w2v.similarity('человек', w2v_word)
    except: pass
    try: w2v_company = w2v.similarity('компания', w2v_word)
    except: pass
    try: w2v_ved = w2v.similarity('ведомство', w2v_word)
    except: pass
    try: w2v_corp = w2v.similarity('новости', w2v_word)
    except: pass
    features.append(w2v_org)
    features.append(w2v_pers)
    return features

In [550]:
import numpy as np

def analyzeSentence(sentence, isUsed):
    features = []
    for word in sentence.split():
        try:
            if np.random.random() < 0.3:
                features.append(analyzeWord(word, isUsed))
        except:
            continue
    return features

In [551]:
from sklearn.model_selection import train_test_split

train_data = []
train_label = []
isUsed = defaultdict(bool)
m = Mystem()

for line in readTrain():
    label = line.split()[1]
    for word in line.split()[4:]:
        try: 
            train_data.append(analyzeWord(word, isUsed))
            train_label.append(label)
        except Exception as e:
            continue
    
for line in readRaw():
    label = 'LOC'
    features = analyzeSentence(line, isUsed)
    train_data += features
    train_label += [label for word in features]

cat_features = [0]

X_train, X_test, y_train, y_test = train_test_split(train_data, train_label, test_size=0.2, random_state=777)

In [554]:
cat_features = [0, 1, 3]

train_dataset = Pool(data=X_train,
                     label=y_train,
                     cat_features=cat_features)
    
test_dataset = Pool(data=X_test,
                     label=y_test,
                     cat_features=cat_features)
# Initialize CatBoostClassifier
model = CatBoostClassifier(iterations=3000,
                           depth=7,
                           l2_leaf_reg= 10
                           , metric_period=100,
                           eval_metric='AUC',
                           loss_function='MultiClass')
# Fit model with `use_best_model=True`

model.fit(train_dataset,
          eval_set=test_dataset,
          use_best_model=True)

print("Count of trees in model = {}".format(model.tree_count_))

0:	test: 0.8920744	best: 0.8920744 (0)	total: 24.7ms	remaining: 1m 14s
100:	test: 0.9300495	best: 0.9300495 (100)	total: 1.27s	remaining: 36.6s
200:	test: 0.9385001	best: 0.9385001 (200)	total: 2.49s	remaining: 34.7s
300:	test: 0.9424559	best: 0.9424559 (300)	total: 3.73s	remaining: 33.5s
400:	test: 0.9453933	best: 0.9453933 (400)	total: 4.96s	remaining: 32.2s
500:	test: 0.9477265	best: 0.9477265 (500)	total: 6.21s	remaining: 31s
600:	test: 0.9492319	best: 0.9492319 (600)	total: 7.44s	remaining: 29.7s
700:	test: 0.9506031	best: 0.9506031 (700)	total: 8.67s	remaining: 28.4s
800:	test: 0.9516157	best: 0.9516157 (800)	total: 9.92s	remaining: 27.2s
900:	test: 0.9525345	best: 0.9525345 (900)	total: 11.2s	remaining: 26s
1000:	test: 0.9532962	best: 0.9532962 (1000)	total: 12.4s	remaining: 24.8s
1100:	test: 0.9539238	best: 0.9539238 (1100)	total: 13.7s	remaining: 23.6s
1200:	test: 0.9545369	best: 0.9545369 (1200)	total: 14.9s	remaining: 22.3s
1300:	test: 0.9550359	best: 0.9550359 (1300)	total:

In [543]:
model.feature_importances_

array([ 5.0572428 ,  4.44282262, 16.86103447,  6.09809877,  7.98568046,
        7.69773675,  6.78972031,  8.64950592,  8.19359677,  7.09281682,
        6.20103142,  7.26873953,  7.66197336])

In [371]:
def findOccurrences(s, ch):
    return [i for i, letter in enumerate(s) if letter == ch]

In [544]:
def processLine(line):
    words = line.split()
    features = []
    ans = ""
    if len(line) < 2:
        print('OK')
        return
    for word in words:
        try:
            if 'РБК' in word:
                pos = line.find(word)
                ans += str(pos) + ' ' + str(len(word)) + ' ORG '
                continue
            if re.search('[a-zA-Z]', word) is not None:
                pos = line.find(word)
                ans += str(pos) + ' ' + str(len(word)) + ' ORG '
                continue
            word = word.replace(',', '').replace('.', '').replace('?', '').replace('!', '').replace('\n', '')
            pred_class = model.predict(analyzeWord(word, isUsed))
            if pred_class[0] == 'ORG':
                pos = line.find(word)
                ans += str(pos) + ' ' + str(len(word)) + ' ORG '
            if pred_class[0] == 'PER':
                pos = line.find(word)
                ans += str(pos) + ' ' + str(len(word)) + ' PERSON  '  
        except Exception as e:
            #print(word)
            pass
    with open('out.txt', 'a') as f:
        f.write(ans + 'EOL\n')

In [545]:
def processTest():
    with open('dataset_40163_1.txt') as f:
        [processLine(line) for line in f]

In [556]:
import os

m = Mystem()
try:
    os.remove('out.txt')
except: pass
processTest()