In [1]:
%run header.ipynb

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Character class of Unicode ranges that this notebook treats as "emoji".
# One or more consecutive characters from any of the ranges form one match.
emoji_pattern = re.compile(
    "[{}]+".format("".join((
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
        "\U00002702-\U000027B0",
        # NOTE(review): U+24C2..U+1F251 is a very wide range that also spans
        # many non-emoji scripts -- confirm this is intended.
        "\U000024C2-\U0001F251",
    ))),
    flags=re.UNICODE,
)

# Advertisement detector.  Matches either
#   * a number, optional whitespace, then one of the currency alternatives
#     (includes the ASCII tokens `rub` and `$`), or
#   * one of the word-stem alternatives after the top-level `|`.
# NOTE(review): the non-ASCII alternatives look mis-encoded in this view;
# presumably Cyrillic currency abbreviations / "for sale" verb forms -- confirm
# the file is stored as UTF-8.
# Raw string: the regex escapes (\d, \s, \W, \$) are not valid Python string
# escapes and trigger invalid-escape warnings in a normal literal.
adv_pattern = re.compile(r'\d+\s?(—Ä(\W|$)|—Ä—É–±|rub|\$|—Ç–Ω–≥|—Ç–µ–Ω–≥–µ|–≥—Ä–Ω|–≥—Ä–∏–≤–µ–Ω)|–ø—Ä–æ–¥–∞[–µ—ë]—Ç—Å—è|–ø—Ä–æ–¥–∞[–º—é]', flags=re.UNICODE | re.IGNORECASE)
# Inline self-checks, executed every time the cell runs: each sample must be
# flagged by adv_pattern, except the `assert not` sample, which must not match.
# NOTE(review): the sample texts look mis-encoded in this view -- confirm the
# notebook is read as UTF-8 before relying on these checks.
assert adv_pattern.search('–¶–µ–Ω–∞ 1000 —Ç–Ω–≥.')
assert adv_pattern.search('2500 —Ä—É–±')
assert adv_pattern.search('800 –†–£–ë–õ–ï–ô! ')
assert adv_pattern.search('2700—Ä 22')
assert not adv_pattern.search('2700 —Ä–π–π–π')
assert adv_pattern.search('–ü—Ä–æ–¥–∞–º –∑–∏–ª-–∫–∞–º–∞–∑ 3500—Ä')
assert adv_pattern.search('–ü—Ä–æ–¥–∞—é –Ω–æ–≤—ã–π –¥–æ–º –≤ –ö—Ä—ã–º—É!!!!')
assert adv_pattern.search('–ü—Ä–æ–¥–∞–µ—Ç—Å—è –¥–æ–º –≤ —Ä-–Ω–µ')
assert adv_pattern.search('–ü—Ä–æ–¥–∞—ë—Ç—Å—è –∫–≤–∞—Ä—Ç–∏—Ä–∞')
    
def is_adv(string):
    """Return 1 if ``adv_pattern`` matches anywhere in ``string``, else 0."""
    found = adv_pattern.search(string)
    return int(found is not None)

# Phone-number detector: a start-of-string or non-word character, the prefix
# `+7` or `8`, then exactly ten digits, each of which may be preceded by an
# optional `-`, optional parentheses and optional whitespace, and finally a
# non-word character or end of string (so longer digit runs are rejected).
# Raw string: the regex escapes (\W, \+, \s, \(, \d) are not valid Python
# string escapes and trigger invalid-escape warnings in a normal literal.
phone_pattern = re.compile(r'(^|\W)(\+7|8)(-?(\s?\()?(\)\s?)?\s?\d){10}(\W|$)', flags=re.UNICODE | re.IGNORECASE)
# Inline self-checks; the commented-out samples are known gaps (numbers that
# do not start with +7 / 8).
assert phone_pattern.search(' —Ç–µ–ª.89144547633')
#assert phone_pattern.search('–û–±—Ä–∞—â–∞—Ç—å—Å—è –ø–æ —Ç–µ–ª. 0554106990.') # TODO
assert phone_pattern.search('8-913-03-555-99')
assert phone_pattern.search('8-924-500-8701')
assert phone_pattern.search('89304071675')
assert phone_pattern.search('8-908-659-50-35')
assert phone_pattern.search('+7 (3842) 36-99-83')
assert phone_pattern.search('+79787824200')
assert phone_pattern.search('8-918-090-73-28')
assert phone_pattern.search('89098589641')
assert phone_pattern.search('89098589641.')
assert phone_pattern.search('89098589641 ')
assert phone_pattern.search('—Ç.89098589641')
assert phone_pattern.search('89024058769')
assert phone_pattern.search('—Ç. 89881370418')
assert phone_pattern.search('–∑–≤–æ–Ω–∏—Ç—å –ø–æ —Ç–µ–ª–µ—Ñ–æ–Ω—É 89004176707')
assert phone_pattern.search('8-777-571-70-54')
assert not phone_pattern.search('89024058769777')
assert not phone_pattern.search('8889024058769')
#assert phone_pattern.search('095-247-04-64') # TODO

def has_phone(string):
    """Return 1 if ``phone_pattern`` matches anywhere in ``string``, else 0."""
    found = phone_pattern.search(string)
    return int(found is not None)

# Recipe detector.  Matches either a number, optional whitespace and one of
# the unit alternatives (each followed by a non-word character), or one of the
# two keyword alternatives after the top-level `|`.
# NOTE(review): the unit / keyword literals look mis-encoded in this view;
# presumably Cyrillic measurement abbreviations -- confirm the file is UTF-8.
# Raw string: the regex escapes (\d, \s, \W, \.) are not valid Python string
# escapes and trigger invalid-escape warnings in a normal literal.
recipe_pattern = re.compile(r'\d+\s?(–º–ª\W|–≥—Ä\W|—á\.\s?–ª\W|—Å—Ç\.\s?–ª\W)|–∏–Ω–≥—Ä–µ–¥–∏–µ–Ω—Ç—ã|—Ä–µ—Ü–µ–ø—Ç', flags=re.UNICODE | re.IGNORECASE)
# Inline self-checks (the sample strings stay normal literals: their \n is a
# real newline that satisfies the trailing \W).
assert recipe_pattern.search('100 –º–ª \n')
assert recipe_pattern.search('0,5 —á. –ª\n')
assert recipe_pattern.search('1 —Å—Ç.–ª\n')
assert recipe_pattern.search('—Å–ª–æ—ë–Ω–æ–µ —Ç–µ—Å—Ç–æ 400 –≥—Ä., ')

def is_recipe(string):
    """Return 1 if ``recipe_pattern`` matches anywhere in ``string``, else 0."""
    found = recipe_pattern.search(string)
    return int(found is not None)

def remove_emoji(string):
    """Return ``string`` with every run matched by ``emoji_pattern`` deleted."""
    stripped = emoji_pattern.sub(r'', string)
    return stripped

def create_text_features0(texts, result, doc2vec):
    """Add an ``embedding`` column to ``result`` (mutated in place, returns None).

    Each value is ``doc2vec.infer_vector`` applied to the corresponding entry
    of ``texts.preprocessed``.
    """
    tokens = texts.preprocessed
    vectors = tokens.apply(doc2vec.infer_vector)
    result['embedding'] = vectors

def qwe(x, model=None):
    """Pool worker: infer an embedding for every row of one batch.

    Args:
        x: DataFrame slice with a ``preprocessed`` column.
        model: object exposing ``infer_vector``.  When omitted, the
            notebook-level global ``doc2vec`` is used, which was the only
            behaviour before this parameter existed.

    Returns:
        Series indexed like ``x`` holding ``infer_vector(tokens)`` per row.
    """
    if model is None:
        model = doc2vec  # legacy path: fall back to the notebook global
    return x.preprocessed.apply(model.infer_vector)


def create_text_features0_batched(texts, doc2vec):
    """Infer embeddings for ``texts.preprocessed`` in parallel.

    The frame is cut into roughly one positional batch per worker and each
    batch is handed to ``qwe`` together with the model that was passed in.

    Args:
        texts: DataFrame with a ``preprocessed`` column.
        doc2vec: object exposing ``infer_vector``; it is sent to the workers,
            so it must be picklable.

    Returns:
        Series of inferred vectors, concatenated in the original row order.
    """
    n_rows = texts.shape[0]
    if n_rows == 0:
        # Nothing to distribute; pd.concat([]) would raise.
        return texts.preprocessed.apply(doc2vec.infer_vector)
    # At least one worker, even when cpu_count() is 1.
    cpus = max(1, cpu_count() // 2)
    batch_size = n_rows // cpus + 1
    # Positional slicing keeps every row regardless of the index labels.
    batches = [texts.iloc[start:start + batch_size] for start in range(0, n_rows, batch_size)]
    print('Batches count: %d' % len(batches))
    with Pool(cpus) as p:
        # starmap so the model argument is actually forwarded to the worker.
        ret = p.starmap(qwe, [(batch, doc2vec) for batch in batches])
    return pd.concat(ret)
    
def qwe2(x, doc2vec):
    """Pool worker: apply ``doc2vec.infer_vector`` to each row of ``x.preprocessed``."""
    return x.preprocessed.apply(doc2vec.infer_vector)


def create_text_features0_batched2(texts, doc2vec, batch_size=5):
    """Infer embeddings for ``texts.preprocessed`` in parallel, in small batches.

    Args:
        texts: DataFrame with a ``preprocessed`` column.
        doc2vec: object exposing ``infer_vector``; it is sent to the workers
            with every batch, so it must be picklable.
        batch_size: number of rows per batch (default 5, the value that was
            previously hard-coded).

    Returns:
        Series of inferred vectors, concatenated in the original row order.
    """
    n_rows = texts.shape[0]
    if n_rows == 0:
        # Nothing to distribute; pd.concat([]) would raise.
        return texts.preprocessed.apply(doc2vec.infer_vector)
    # Positional slicing keeps every row regardless of the index labels.
    batches = [(texts.iloc[start:start + batch_size], doc2vec) for start in range(0, n_rows, batch_size)]
    print('Batches count: %d' % len(batches))
    # At least one worker, even when cpu_count() is 1.
    with Pool(max(1, cpu_count() // 2)) as p:
        # starmap unpacks each (batch, model) tuple into qwe2's two
        # parameters; map would pass the tuple as a single argument.
        ret = p.starmap(qwe2, batches)
    return pd.concat(ret)

    
def create_text_features1(texts, result):
    """Add id/language and basic length/count columns to ``result`` in place.

    Columns written:
        objectId, lang  -- copied from ``texts``
        len             -- length of ``texts.text``
        p_len           -- length of ``texts.preprocessed``
        q_count         -- number of ``?`` characters
        upper_count     -- number of characters for which ``str.isupper`` is true
        emojis_count    -- number of characters removed by ``remove_emoji``
    """
    result['objectId'] = texts.objectId
    result['lang'] = texts.lang
    result['len'] = texts.text.apply(len)
    result['p_len'] = texts.preprocessed.apply(len)
    # Raw string: '\?' is a regex escape, not a Python string escape.
    result['q_count'] = texts.text.str.count(r'\?')
    # TODO: a 'sentances_count' feature was sketched here but never implemented.
    # Pure-Python count: np.char.isupper(list(s)) raises on an empty string
    # because an empty list becomes a non-string (float) array.
    result['upper_count'] = texts.text.apply(lambda s: sum(map(str.isupper, s)))
    result['emojis_count'] = texts.text.apply(lambda s: len(s) - len(remove_emoji(s)))
    
def create_text_features2(texts, result):
    """Add link-related occurrence counts of ``texts.text`` to ``result`` in place.

    Columns written: youtube_count (``youtu.be`` / ``youtube.com``),
    links_count (substring ``http``), ok_videos_count (``ok.ru/video``),
    ok_groups_count (``ok.ru/group``).
    """
    # Raw strings: '\.' is a regex escape, not a Python string escape.
    result['youtube_count'] = texts.text.str.count(r'youtu\.be|youtube\.com')
    result['links_count'] = texts.text.str.count('http')
    result['ok_videos_count'] = texts.text.str.count(r'ok\.ru/video')
    result['ok_groups_count'] = texts.text.str.count(r'ok\.ru/group')
    
def create_text_features3(texts, result):
    """Add punctuation counts and the adv/recipe flags to ``result`` in place.

    Columns written: hashes_count (``#``), quotes_count (occurrences of the
    quote literal below), mdots_count (non-overlapping ``..`` pairs),
    brackets_balance (absolute difference between ``(`` and ``)`` counts),
    is_adv and is_recipe (0/1 flags from the respective helpers).
    """
    result['hashes_count'] = texts.text.str.count('#')
    # NOTE(review): this literal looks mis-encoded in this view; presumably an
    # opening guillemet -- confirm the file is UTF-8.
    result['quotes_count'] = texts.text.str.count('¬´')
    # Raw strings: '\.', '\(' and '\)' are regex escapes, not Python escapes.
    result['mdots_count'] = texts.text.str.count(r'\.\.')
    result['brackets_balance'] = (texts.text.str.count(r'\(') - texts.text.str.count(r'\)')).abs()
    result['is_adv'] = texts.text.apply(is_adv)
    result['is_recipe'] = texts.text.apply(is_recipe)
    
def create_text_features4(texts, result):
    """Add ``has_phone``, ``is_poll`` and ``e_count`` columns to ``result`` in place.

    has_phone -- 0/1 flag from ``has_phone``
    is_poll   -- boolean: the text ends with the literal suffix below
    e_count   -- number of ``!`` characters
    """
    text = texts.text
    result['has_phone'] = text.apply(has_phone)
    result['is_poll'] = text.str.endswith('–û–ø—Ä–æ—Å—ã')
    result['e_count'] = text.str.count('!')
    
    
def create_text_features(texts, doc2vec):
    """Build the full feature frame for ``texts``.

    Runs stages 0-4 in order against one fresh DataFrame, printing a progress
    line before each stage, and returns that frame.  Stage 0 needs the
    ``doc2vec`` model; the remaining stages only take the texts and the frame.
    """
    result = pd.DataFrame({})
    print('create_text_features 0...')
    create_text_features0(texts, result, doc2vec)
    later_stages = (
        create_text_features1,
        create_text_features2,
        create_text_features3,
        create_text_features4,
    )
    for stage_no, build in enumerate(later_stages, start=1):
        print('create_text_features %d...' % stage_no)
        build(texts, result)
    return result

In [3]:
# Load the test-set texts from the parquet dataset under `input_path` into a
# pandas DataFrame (`input_path` and `parquet` are not defined in this cell --
# presumably provided by header.ipynb; verify).
test_texts = parquet.read_table(input_path + '/texts/textsTest/').to_pandas()

  labels, = index.labels


In [4]:
# Load the training-set texts from the parquet dataset under `input_path` into
# a pandas DataFrame (`input_path` and `parquet` are not defined in this cell
# -- presumably provided by header.ipynb; verify).
train_texts = parquet.read_table(input_path + '/texts/textsTrain').to_pandas()

In [6]:
# Train one Doc2Vec model (dm=0, 15-dimensional vectors) on the token lists of
# both the test and the training texts, then persist it.
# All documents share the single tag 'tag'.  The tags argument must be a list:
# a bare string would be iterated character by character ('t', 'a', 'g').
doc2vec = Doc2Vec([TaggedDocument(lines, ['tag']) for lines in test_texts.preprocessed] +
                  [TaggedDocument(lines, ['tag']) for lines in train_texts.preprocessed],
                dm=0, vector_size=15, window=5, min_count=2, workers=8)
# NOTE(review): delete_temporary_training_data may not exist in newer gensim
# releases -- confirm against the installed version.
doc2vec.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
doc2vec.save(output_path + '/doc2vec_all_0_15_5_2')
#doc2vec = Doc2Vec.load(output_path + '/doc2vec_all_23_5_2')

In [54]:
# Incremental patching of previously pickled feature frames.
# The `if False:` branch is disabled: it would re-infer the `embedding` column
# on top of the "...features2" pickles and write "...features3".
# The `if True:` branch is active: it loads the "...features4" pickles, adds
# the stage-4 columns via create_text_features4, and writes "...features5".
# NOTE(review): assumes each pickled frame is index-aligned with the matching
# *_texts frame -- confirm.  pd.read_pickle must only be used on trusted files.
if False:
    train_texts_features = pd.read_pickle(output_path + '/train_text_features2')
    train_texts_features['embedding'] = train_texts.preprocessed.apply(doc2vec.infer_vector)
    train_texts_features.to_pickle(output_path + '/train_text_features3')
    test_texts_features = pd.read_pickle(output_path + '/test_text_features2')
    test_texts_features['embedding'] = test_texts.preprocessed.apply(doc2vec.infer_vector)
    test_texts_features.to_pickle(output_path + '/test_text_features3')
if True:
    test_texts_features = pd.read_pickle(output_path + '/test_text_features4')
    create_text_features4(test_texts, test_texts_features)
    test_texts_features.to_pickle(output_path + '/test_text_features5')
    train_texts_features = pd.read_pickle(output_path + '/train_text_features4')
    create_text_features4(train_texts, train_texts_features)
    train_texts_features.to_pickle(output_path + '/train_text_features5')

In [70]:
%%time
# Build the complete feature frame for the training texts (all stages).
train_text_features = create_text_features(train_texts, doc2vec)

create_text_features 0...
create_text_features 1...
create_text_features 2...
create_text_features 3...


In [71]:
%%time
# Persist the training feature frame as a pickle under `output_path`.
train_text_features.to_pickle(output_path + '/train_text_features')

CPU times: user 22.5 s, sys: 12.4 s, total: 34.9 s
Wall time: 42.1 s


In [73]:
# Drop the notebook's references to the training frames.
# NOTE(review): cells further down the file still read `train_texts`; they only
# work if run before this cell or after reloading the data.
del train_text_features
del train_texts

In [75]:
%%time
# Build the complete feature frame for the test texts (all stages).
test_text_features = create_text_features(test_texts, doc2vec)

create_text_features 0...
create_text_features 1...
create_text_features 2...
create_text_features 3...
CPU times: user 11min 34s, sys: 8.25 s, total: 11min 42s
Wall time: 11min 47s


In [76]:
%%time
# Persist the test feature frame as a pickle under `output_path`.
test_text_features.to_pickle(output_path + '/test_text_features')

CPU times: user 3.67 s, sys: 394 ms, total: 4.06 s
Wall time: 4.07 s


In [5]:
# Manual inspection: for index labels 10400..10599 of test_texts, print a
# separator line with the label and the is_adv / is_recipe flags, followed by
# the raw text, to eyeball how the regex features behave.
for i in range(10400, 10600):
    t = test_texts.loc[i,:].text
    print('\n\n---------------------- %d    %d %d ----------\n\n' % (i, is_adv(t), is_recipe(t)))
    print(t)



---------------------- 10400    0 0 ----------


–ö–∞–∫ –æ–±—É—Å—Ç—Ä–æ–∏—Ç—å –≥–∞—Ä–¥–µ—Ä–æ–±–Ω—É—é: 21 –∏–¥–µ—è –¥–ª—è –æ–±—ã—á–Ω—ã—Ö –∏ —Å–∞–º—ã—Ö –º–∞–ª–µ–Ω—å–∫–∏—Ö –∫–æ–º–Ω–∞—Ç‚Ä¶ http://sondor.ru/kak-obystroit-garderobnyu-21-ideia-dlia-obychnyh-i-samyh-malenkih-komnat.html –ì–ª–∞–≤–Ω–æ–µ, —á—Ç–æ –±—Ä–æ—Å–∞–µ—Ç—Å—è –Ω–∞–º –≤¬†–≥–ª–∞–∑–∞ –ø—Ä–∏ –ø—Ä–æ—Å–º–æ—Ç—Ä–µ –∏–Ω–æ—Å—Ç—Ä–∞–Ω–Ω—ã—Ö —Ñ–∏–ª—å–º–æ–≤¬†‚Äî —Ñ–∞–Ω—Ç–∞—Å—Ç–∏—á–µ—Å–∫–∞—è –≥–∞—Ä–¥–µ—Ä–æ–±–Ω–∞—è –∫–æ–º–Ω–∞—Ç–∞ –≤¬†–∫–≤–∞—Ä—Ç–∏—Ä–µ. –ò¬†–∫–∞–∂–¥–∞—è –∂–µ–Ω—â–∏–Ω–∞ –Ω–∞—á–∏–Ω–∞–µ—Ç –¥—É–º–∞—Ç—å¬†‚Äî ¬´—Ö–æ—á—É... –ö–∞–∫ –æ–±—É—Å—Ç—Ä–æ–∏—Ç—å –≥–∞—Ä–¥–µ—Ä–æ–±–Ω—É—é: 21 –∏–¥–µ—è –¥–ª—è –æ–±—ã—á–Ω—ã—Ö –∏ —Å–∞–º—ã—Ö –º–∞–ª–µ–Ω—å–∫–∏—Ö –∫–æ–º–Ω–∞—Ç‚Ä¶ | –ó–∞—Ä—è–¥–∏—Å—å –ø–æ–∑–∏—Ç–∏–≤–æ–º


---------------------- 10401    0 0 ----------


Doi bƒÉrba»õi au ajuns √Æn vizorul ofi»õerilor »ôi procurorilor anticorup»õie, fiind bƒÉnui»õi de trafic de influen»õƒÉ. Ace»ôtia ar fi pretins »ôi primit de mijloace financiare ce nu li se cu

In [7]:
# (rows, columns) of the training texts frame.
train_texts.shape

(3410916, 4)

In [11]:
# Number of distinct objectId values; compare with the row count to see how
# many objects have more than one text row.
train_texts.objectId.unique().shape

(3352714,)

In [41]:
# Total occurrences of `.mpg`, `.mp4` or `.mp3` in the first 100000 training
# texts.  Raw string: '\.' is a regex escape, not a Python string escape.
train_texts.head(100000).text.str.count(r'\.(mpg|mp4|mp3)').sum()

207

In [7]:
# Display the training texts frame (objectId, lang, text, preprocessed).
train_texts

Unnamed: 0,objectId,lang,text,preprocessed
0,11181946,ru,"–ü–∏—Ç–∫–µ—Ä–Ω–∏—è\r\n\r\n–û—á–µ–Ω—å –∏–Ω—Ç–µ—Ä–µ—Å–Ω–æ–µ —Ä–∞—Å—Ç–µ–Ω–∏–µ, –ø—Ä...","[–ø–∏—Ç–∫–µ—Ä–Ω, –æ—á–µ–Ω, –∏–Ω—Ç–µ—Ä–µ—Å–Ω, —Ä–∞—Å—Ç–µ–Ω, –ø—Ä–æ–∏–∑—Ä–∞—Å—Ç–∞, ..."
1,12040268,Unknown,"–Ø—Ö—Ç—ã, –æ–ª–∏–≥–∞—Ä—Ö–∏, –ø—Ä–æ—Å—Ç–∏—Ç—É—Ç–∫–∏: —Å–µ–∫—Å-–æ—Ö–æ—Ç–Ω–∏—Ü–∞ —Ä–∞–∑...","[—è—Ö—Ç, –æ–ª–∏–≥–∞—Ä—Ö, –ø—Ä–æ—Å—Ç–∏—Ç—É—Ç–∫, —Å–µ–∫—Å, –æ—Ö–æ—Ç–Ω–∏—Ü, —Ä–∞–∑–æ..."
2,14050867,ru,"–ö—Ç–æ-—Ç–æ –≥–∏–±–Ω–µ—Ç –≤ –±–æ—é, –ø–æ–¥—Ä—ã–≤–∞—è —Å–µ–±—è –≥—Ä–∞–Ω–∞—Ç–æ–π, –∞...","[–≥–∏–±–Ω–µ—Ç, –±–æ, –ø–æ–¥—Ä—ã–≤, –≥—Ä–∞–Ω–∞—Ç, –∫–æ–≥, –≤–µ–¥—É—Ç, –±–æ, –≤..."
3,17023591,ru,–û—Ç–Ω–æ—à–µ–Ω–∏—è: –∫–æ–≥–¥–∞ –ø—Ä–æ–∏—Å—Ö–æ–¥–∏—Ç –≤—ã–±–æ—Ä? –°–≤—è—Ç–æ—Å–ª–∞–≤ –†...,"[–æ—Ç–Ω–æ—à–µ–Ω, –ø—Ä–æ–∏—Å—Ö–æ–¥, –≤—ã–±–æ—Ä, —Å–≤—è—Ç–æ—Å–ª–∞, —Ä–∞–π–∫, —á–∏—Ç..."
4,18389833,Unknown,ok.ru/group/51094392012955 ok.ru/giflive ok.ru...,[]
5,24344214,ru,–§–ö ¬´–†–æ—Ç–æ—Ä-–í–æ–ª–≥–æ–≥—Ä–∞–¥¬ª - ¬´–°–ø–∞—Ä—Ç–∞–∫-2¬ª (–ú–æ—Å–∫–≤–∞) 1:...,"[—Ñ–∫, —Ä–æ—Ç–æ—Ä, –≤–æ–ª–≥–æ–≥—Ä–∞–¥, —Å–ø–∞—Ä—Ç–∞–∫, –º–æ—Å–∫–≤, –≥–æ–ª, –∞–ø..."
6,24544853,ru,–ò. –ê–Ω–∏—Å–∏–º–æ–≤ - –ò—Å—Ç–æ—Ä–∏—è –≤–∑—è—Ç–∞—è –∏–∑ –∂–∏–∑–Ω–∏ (—Å—Ç–∏—Ö–∏) ...,"[–∞–Ω–∏—Å–∏–º, –∏—Å—Ç–æ—Ä, –≤–∑—è—Ç, –∂–∏–∑–Ω, —Å—Ç–∏—Ö]"
7,24583002,ru,"–ö–∞–∫ –±—ã –≤—ã–≥–ª—è–¥–µ–ª–∏ —Ö–∏–º–∏—á–µ—Å–∫–∏–µ —ç–ª–µ–º–µ–Ω—Ç—ã, –µ—Å–ª–∏ –±—ã ...","[–≤—ã–≥–ª—è–¥–µ–ª, —Ö–∏–º–∏—á–µ—Å–∫, —ç–ª–µ–º–µ–Ω—Ç, –ª—é–¥—å–º]"
8,26053581,ru,–û—á–µ–Ω—å —Å—Ç—Ä–∞–Ω–Ω—ã–µ –¥–µ–ª–∞ –ø—Ä–æ–∏—Å—Ö–æ–¥—è—Ç —Å –¢–æ–º–æ–º –•–∞—Ä–¥–∏ –≤...,"[–æ—á–µ–Ω, —Å—Ç—Ä–∞–Ω, –¥–µ–ª, –ø—Ä–æ–∏—Å—Ö–æ–¥, —Ç–æ–º, —Ö–∞—Ä–¥, –ø–µ—Ä–≤, ..."
9,26853081,ru,–î–ï–†–ï–í–Ø–ù–ù–ê–Ø –õ–ï–°–¢–ù–ò–¶–ê - –∏–∑ –Ω–∞—Ç—É—Ä–∞–ª—å–Ω–æ–≥–æ –¥–µ—Ä–µ–≤–∞ ...,"[–¥–µ—Ä–µ–≤—è, –ª–µ—Å—Ç–Ω–∏—Ü, –Ω–∞—Ç—É—Ä–∞–ª—å–Ω, –¥–µ—Ä–µ–≤, –æ—Å—Ç–∞—ë—Ç, –ø—Ä..."


In [14]:
# Number of rows per objectId (the `lang` column holds the count after the
# aggregation), sorted so the most duplicated objects come first.
train_texts[['objectId', 'lang']].groupby('objectId').agg({'lang': 'count'})['lang'].reset_index() \
    .sort_values(['lang'], ascending=[False])

Unnamed: 0,objectId,lang
31766,487941,277
42767,556010,255
6477,339044,185
41284,546672,106
2,1283,73
44768,568009,59
41877,550628,55
23134,437460,55
1,677,41
1343547,18229844,33


In [11]:
# All training rows belonging to one specific objectId, kept for inspection.
a = train_texts[train_texts.objectId == 19465260]

In [12]:
# Print the text of every selected row, each followed by a separator line.
for i in range(a.shape[0]):
    print(a.iloc[i,:].text)
    print('----------------------------------------------')

Scorpions- "Is There Anybody There" 1979 TV 50955946725 https://youtu.be/R49lfk9Y2b0
----------------------------------------------


In [21]:
# Show every training row for objectId 1195330 (an object with many text rows).
train_texts[train_texts.objectId == 1195330]

Unnamed: 0,objectId,lang,text,preprocessed
165288,1195330,Unknown,160—Ä—É–±. - –ö–û–õ–¨–¶–û. –û–°–¢–ê–¢–ö–ò –†–ê–ó–ú–ï–†–û–í:\n3–º–º: 18.8...,"[—Ä—É–±, –∫–æ–ª—å—Ü, –æ—Å—Ç–∞—Ç–∫, —Ä–∞–∑–º–µ—Ä, –º–º, –º–º, xuping, –ø..."
613512,1195330,Unknown,160—Ä—É–±. - –ö–û–õ–¨–¶–û. –û–°–¢–ê–¢–ö–ò –†–ê–ó–ú–ï–†–û–í:\n3–º–º: 18(1...,"[—Ä—É–±, –∫–æ–ª—å—Ü, –æ—Å—Ç–∞—Ç–∫, —Ä–∞–∑–º–µ—Ä, –º–º, –º–º, xuping, –ø..."
646510,1195330,Unknown,"150—Ä—É–±. - –ö–û–õ–¨–¶–û –û–ë–†–£–ß–ê–õ–¨–ù–û–ï ""XUPING"":\n3–º–º: 1...","[—Ä—É–±, –∫–æ–ª—å—Ü, –æ–±—Ä—É—á–∞–ª—å–Ω, xuping, –º–º, –º–º, xuping..."
1106002,1195330,Unknown,"150—Ä—É–±. - –ö–û–õ–¨–¶–û –û–ë–†–£–ß–ê–õ–¨–ù–û–ï ""XUPING"":\n3–º–º: 1...","[—Ä—É–±, –∫–æ–ª—å—Ü, –æ–±—Ä—É—á–∞–ª—å–Ω, xuping, –º–º, –º–º, xuping..."
1115746,1195330,Unknown,"150—Ä—É–±. - –ö–û–õ–¨–¶–û –û–ë–†–£–ß–ê–õ–¨–ù–û–ï ""XUPING"":\n3–º–º: 1...","[—Ä—É–±, –∫–æ–ª—å—Ü, –æ–±—Ä—É—á–∞–ª—å–Ω, xuping, –º–º, –º–º, xuping..."
1605411,1195330,ru,"150—Ä—É–±. - –ö–û–õ–¨–¶–û –û–ë–†–£–ß–ê–õ–¨–ù–û–ï ""XUPING"":\n3–º–º: 1...","[—Ä—É–±, –∫–æ–ª—å—Ü, –æ–±—Ä—É—á–∞–ª—å–Ω, xuping, –º–º, –º–º, xuping..."
1648239,1195330,ru,"150—Ä—É–±. - –ö–û–õ–¨–¶–û –û–ë–†–£–ß–ê–õ–¨–ù–û–ï ""XUPING"":\n3–º–º: 1...","[—Ä—É–±, –∫–æ–ª—å—Ü, –æ–±—Ä—É—á–∞–ª—å–Ω, xuping, –º–º, –º–º, xuping..."
1691453,1195330,Unknown,"150—Ä—É–±. - –ö–û–õ–¨–¶–û –û–ë–†–£–ß–ê–õ–¨–ù–û–ï ""XUPING"":\n3–º–º: 1...","[—Ä—É–±, –∫–æ–ª—å—Ü, –æ–±—Ä—É—á–∞–ª—å–Ω, xuping, –º–º, –º–º, xuping..."
1708836,1195330,Unknown,"150—Ä—É–±. - –ö–û–õ–¨–¶–û –û–ë–†–£–ß–ê–õ–¨–ù–û–ï ""XUPING"":\n3–º–º: 1...","[—Ä—É–±, –∫–æ–ª—å—Ü, –æ–±—Ä—É—á–∞–ª—å–Ω, xuping, –º–º, –º–º, xuping..."
1757533,1195330,ru,"150—Ä—É–±. - –ö–û–õ–¨–¶–û –û–ë–†–£–ß–ê–õ–¨–ù–û–ï ""XUPING"":\n3–º–º: 1...","[—Ä—É–±, –∫–æ–ª—å—Ü, –æ–±—Ä—É—á–∞–ª—å–Ω, xuping, –º–º, –º–º, xuping..."
