In [1]:
%run header.ipynb

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [5]:
emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)

adv_pattern = re.compile('\d+\s?(р(\W|$)|руб|rub|\$|тнг|тенге|грн|гривен)', flags=re.UNICODE | re.IGNORECASE)
assert adv_pattern.search('Цена 1000 тнг.')
assert adv_pattern.search('2500 руб')
assert adv_pattern.search('800 РУБЛЕЙ! ')
assert adv_pattern.search('2700р 22')
assert not adv_pattern.search('2700 рййй')
assert adv_pattern.search('Продам зил-камаз 3500р')
    
def is_adv(string):
    return 1 if adv_pattern.search(string) else 0

recipe_pattern = re.compile('\d+\s?(мл\W|гр\W|ч\.\s?л\W|ст\.\s?л\W)|ингредиенты', flags=re.UNICODE | re.IGNORECASE)
assert recipe_pattern.search('100 мл \n')
assert recipe_pattern.search('0,5 ч. л\n')
assert recipe_pattern.search('1 ст.л\n')
assert recipe_pattern.search('слоёное тесто 400 гр., ')

def is_recipe(string):
    return 1 if len(string) > 300 and recipe_pattern.search(string) else 0

def remove_emoji(string):
    return emoji_pattern.sub(r'', string) # e.g: 😊😉😉👇ヅジ

def create_text_features0(texts, result, doc2vec):
    result['embedding'] = texts.preprocessed.apply(doc2vec.infer_vector)

def qwe(x):
    return x.preprocessed.apply(doc2vec.infer_vector)
    
def create_text_features0_batched(texts, doc2vec):
    cpus = int(cpu_count() / 2)
    batch_size = int(texts.shape[0] / cpus + 1)
    batches = [texts.loc[x:(x + batch_size - 1),] for x in range(0, texts.shape[0], batch_size)]
    print('Batches count: %d' % len(batches))
    with Pool(cpus) as p:
        ret = p.map(qwe, batches)
    return pd.concat(ret)
    
def qwe2(x, doc2vec):
    return x.preprocessed.apply(doc2vec.infer_vector)
    
def create_text_features0_batched2(texts, doc2vec):
    batch_size = 5
    batches = [(texts.loc[x:(x + batch_size - 1),], doc2vec) for x in range(0, texts.shape[0], batch_size)]
    print('Batches count: %d' % len(batches))
    with Pool(int(cpu_count() / 2)) as p:
        ret = p.map(qwe2, batches)
    return pd.concat(ret)

    
def create_text_features1(texts, result):
    result['objectId'] = texts.objectId
    result['lang'] = texts.lang
    result['len'] = texts.text.apply(len)
    result['p_len'] = texts.preprocessed.apply(len)
    result['q_count'] = texts.text.str.count('\?')
#        'sentances_count': texts.text.apply(lambda s: s.count('?'))
    result['upper_count'] = texts.text.apply(lambda s: np.char.isupper(list(s)).sum())
    result['emojis_count'] = texts.text.apply(lambda s: len(s) - len(remove_emoji(s)))
    
def create_text_features2(texts, result):
    result['youtube_count'] = texts.text.str.count('youtu\.be|youtube\.com')
    result['links_count'] = texts.text.str.count('http')
    result['ok_videos_count'] = texts.text.str.count('ok\.ru/video')
    result['ok_groups_count'] = texts.text.str.count('ok\.ru/group')
    
def create_text_features3(texts, result):
    result['is_adv'] = texts.text.apply(is_adv)
    result['is_recipe'] = texts.text.apply(is_recipe)
    
def create_text_features(texts, doc2vec):
    result = pd.DataFrame({})
    print('create_text_features 0...')
    create_text_features0(texts, result, doc2vec)
    print('create_text_features 1...')
    create_text_features1(texts, result)
    print('create_text_features 2...')
    create_text_features2(texts, result)
    print('create_text_features 3...')
    create_text_features3(texts, result)
    return result

In [3]:
test_texts = parquet.read_table(input_path + '/texts/textsTest/').to_pandas()

  labels, = index.labels


In [4]:
train_texts = parquet.read_table(input_path + '/texts/textsTrain').to_pandas()

In [6]:
doc2vec = Doc2Vec([TaggedDocument(lines,'tag') for lines in test_texts.preprocessed] + 
                  [TaggedDocument(lines,'tag') for lines in train_texts.preprocessed],
                dm=0, vector_size=15, window=5, min_count=2, workers=8)
doc2vec.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
doc2vec.save(output_path + '/doc2vec_all_0_15_5_2')
#doc2vec = Doc2Vec.load(output_path + '/doc2vec_all_23_5_2')

In [15]:
if True:
    ss = train_texts.head(16)
    ee = create_text_features0_batched(ss, doc2vec)

Batches count: 4


In [16]:
ee

0     [-0.078174375, 0.030068325, 0.56915706, 0.3946...
1     [0.017158048, -0.0030374476, 0.40244743, 0.064...
2     [-0.11980743, 0.033034455, 0.2846151, -0.07599...
3     [-0.062238418, 0.108684815, 0.6980249, 0.15739...
4     [0.004067792, 0.017932447, 0.008563614, 0.0037...
5     [-0.08109753, 0.020276783, 1.1092168, -0.30892...
6     [0.014924731, -0.025568401, -0.030348564, 0.02...
7     [-0.06420441, -0.034833983, 0.07442447, 0.0956...
8     [-0.07508321, -0.06523436, 0.3909412, 0.084444...
9     [-0.11827853, -0.025395041, 0.363615, 0.167166...
10    [-0.017185628, 0.15236217, 0.83913946, 0.01233...
11    [-0.016982712, -0.008890788, 0.12905064, -0.00...
12    [-0.20114426, -0.26288527, 0.332214, 0.5784132...
13    [-0.07853914, -0.24068944, 1.440104, -0.001245...
14    [-0.0030433275, -0.009110031, 0.1449405, -0.03...
15    [-0.027161788, 0.04473272, 0.23370464, 0.10221...
Name: preprocessed, dtype: object

In [17]:
train_texts.shape

(3410916, 4)

In [18]:
if True:
    train_texts_features = pd.read_pickle(output_path + '/train_text_features2')
    train_texts_features['embedding'] = create_text_features0_batched(train_texts, doc2vec)
    train_texts_features.to_pickle(output_path + '/train_text_features3')
    test_texts_features = pd.read_pickle(output_path + '/test_text_features2')
    test_texts_features['embedding'] = create_text_features0_batched(test_texts, doc2vec)
    test_texts_features.to_pickle(output_path + '/test_text_features3')    

Batches count: 4


Process ForkPoolWorker-12:
Process ForkPoolWorker-11:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/local/Cellar/python/3.7.2_2/Frameworks/Python.framework/Versions/3.7/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/usr/local/Cellar/python/3.7.2_2/Frameworks/Python.framework/Versions/3.7/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/usr/local/Cellar/python/3.7.2_2/Frameworks/Python.framework/Versions/3.7/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/Cellar/python/3.7.2_2/Frameworks/Python.framework/Versions/3.7/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/Cellar/python/3.7.2_2/Frameworks/Python.framework/Versions/3.7/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    task = get()
  File "/usr/lo

KeyboardInterrupt: 

In [7]:
if True:
    train_texts_features = pd.read_pickle(output_path + '/train_text_features2')
    train_texts_features['embedding'] = train_texts.preprocessed.apply(doc2vec.infer_vector)
    train_texts_features.to_pickle(output_path + '/train_text_features3')
    test_texts_features = pd.read_pickle(output_path + '/test_text_features2')
    test_texts_features['embedding'] = test_texts.preprocessed.apply(doc2vec.infer_vector)
    test_texts_features.to_pickle(output_path + '/test_text_features3')

In [70]:
%%time
train_text_features = create_text_features(train_texts, doc2vec)

create_text_features 0...
create_text_features 1...
create_text_features 2...
create_text_features 3...


In [71]:
%%time
train_text_features.to_pickle(output_path + '/train_text_features')

CPU times: user 22.5 s, sys: 12.4 s, total: 34.9 s
Wall time: 42.1 s


In [73]:
del train_text_features
del train_texts

In [75]:
%%time
test_text_features = create_text_features(test_texts, doc2vec)

create_text_features 0...
create_text_features 1...
create_text_features 2...
create_text_features 3...
CPU times: user 11min 34s, sys: 8.25 s, total: 11min 42s
Wall time: 11min 47s


In [76]:
%%time
test_text_features.to_pickle(output_path + '/test_text_features')

CPU times: user 3.67 s, sys: 394 ms, total: 4.06 s
Wall time: 4.07 s


In [64]:
for i in range(200, 400):
    t = train_texts.loc[i,:].text
    print('\n\n---------------------- %d    %d %d ----------\n\n' % (i, is_adv(t), is_recipe(t)))
    print(t)



---------------------- 200    0 0 ----------


ok.ru/group/50398920507545 ok.ru/ohdaprikoliy ok.ru/group/52160294486084 ok.ru/group52160294486084 ok.ru/group/52170409640018 ok.ru/domkuhna ok.ru/group/52127890931891 ok.ru/diaary.happiness ok.ru/dizaynsvoimi ok.ru/group/52845864288376


---------------------- 201    0 0 ----------


Kadına saygılı ol. 
Çünkü O insanoğlunun anasıdır. 
Hz Ali..(r.a)
Kadınlar günümüz kutlu olsun ..!!!


---------------------- 202    0 0 ----------


Изготовления дверей, ворот, кованые изделия и сварочные работы полу автоматом. тел. 89043003257 Алексей


---------------------- 203    0 0 ----------


ok.ru/group/52805420581093 ok.ru/udivinever BMW ///M5  M6 ok.ru/vkusnoti4e ok.ru/group/52860568600696 ok.ru/taynyizag ok.ru/group/51928248877197 ok.ru/group/54129214095360 ok.ru/stranafaktov ok.ru/group/52466256183382 ok.ru/funcats


---------------------- 204    0 0 ----------


ok.ru/group/51838555848919 ok.ru/flatme ok.ru/group/52466256183382 ok.ru/funcats 