In [1]:
%run header.ipynb

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.
Using TensorFlow backend.


In [2]:
# Compiled character class used by remove_emoji(): one or more consecutive
# characters falling in any of the code-point ranges listed below.
# NOTE(review): the last range (U+24C2..U+1F251) is very wide and also covers
# many non-emoji characters (e.g. kana/CJK) - confirm this is intended.
emoji_pattern = re.compile(
    "[" + "".join((
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
        u"\U00002702-\U000027B0",  # U+2702..U+27B0
        u"\U000024C2-\U0001F251",  # U+24C2..U+1F251
    )) + "]+",
    flags=re.UNICODE,
)

# Regex used by is_adv() to flag advert-like texts. It matches either
#   * digits, optional whitespace, then one of several currency tokens
#     (the ASCII alternatives `rub` and `$` are visible; the rest are non-ASCII), or
#   * one of two keyword alternatives (each ending in a character class).
# Matching is case-insensitive (re.IGNORECASE).
# NOTE(review): the pattern is a normal (non-raw) string that relies on `\d`,
# `\s`, `\W` and `\$` being passed through unchanged; these are unrecognised
# string escapes and trigger a warning on recent Pythons - prefer an r'' literal.
# The asserts below are inline regression checks and run every time the cell runs.
adv_pattern = re.compile('\d+\s?(—Ä(\W|$)|—Ä—É–±|rub|\$|—Ç–Ω–≥|—Ç–µ–Ω–≥–µ|–≥—Ä–Ω|–≥—Ä–∏–≤–µ–Ω)|–ø—Ä–æ–¥–∞[–µ—ë]—Ç—Å—è|–ø—Ä–æ–¥–∞[–º—é]', flags=re.UNICODE | re.IGNORECASE)
assert adv_pattern.search('–¶–µ–Ω–∞ 1000 —Ç–Ω–≥.')
assert adv_pattern.search('2500 —Ä—É–±')
assert adv_pattern.search('800 –†–£–ë–õ–ï–ô! ')
assert adv_pattern.search('2700—Ä 22')
# Negative case: the first currency alternative must be followed by a non-word char or end of string.
assert not adv_pattern.search('2700 —Ä–π–π–π')
assert adv_pattern.search('–ü—Ä–æ–¥–∞–º –∑–∏–ª-–∫–∞–º–∞–∑ 3500—Ä')
assert adv_pattern.search('–ü—Ä–æ–¥–∞—é –Ω–æ–≤—ã–π –¥–æ–º –≤ –ö—Ä—ã–º—É!!!!')
assert adv_pattern.search('–ü—Ä–æ–¥–∞–µ—Ç—Å—è –¥–æ–º –≤ —Ä-–Ω–µ')
assert adv_pattern.search('–ü—Ä–æ–¥–∞—ë—Ç—Å—è –∫–≤–∞—Ä—Ç–∏—Ä–∞')
    
def is_adv(string):
    """Return 1 if ``adv_pattern`` matches anywhere in ``string``, otherwise 0."""
    found = adv_pattern.search(string)
    return int(found is not None)

# Regex used by has_phone() to detect phone numbers. Structure:
#   (^|\W)            - start of string or a non-word character
#   (\+7|8)           - the prefix `+7` or `8`
#   (...\d){10}       - exactly ten digits, each optionally preceded by a dash,
#                       an opening/closing parenthesis and/or whitespace
#   (\W|$)            - a non-word character or end of string
# The boundary groups are what reject longer digit runs (see the two `assert not` lines).
# NOTE(review): re.IGNORECASE has no visible effect here (the pattern contains no letters).
# NOTE(review): non-raw string with regex escapes (`\W`, `\+`, `\s`, `\(`, `\d`);
# prefer an r'' literal to avoid invalid-escape warnings.
# The asserts below are inline regression checks; the commented-out ones are
# known-unsupported formats (numbers without the `+7`/`8` prefix).
phone_pattern = re.compile('(^|\W)(\+7|8)(-?(\s?\()?(\)\s?)?\s?\d){10}(\W|$)', flags=re.UNICODE | re.IGNORECASE)
assert phone_pattern.search(' —Ç–µ–ª.89144547633')
#assert phone_pattern.search('–û–±—Ä–∞—â–∞—Ç—å—Å—è –ø–æ —Ç–µ–ª. 0554106990.') # TODO
assert phone_pattern.search('8-913-03-555-99')
assert phone_pattern.search('8-924-500-8701')
assert phone_pattern.search('89304071675')
assert phone_pattern.search('8-908-659-50-35')
assert phone_pattern.search('+7 (3842) 36-99-83')
assert phone_pattern.search('+79787824200')
assert phone_pattern.search('8-918-090-73-28')
assert phone_pattern.search('89098589641')
assert phone_pattern.search('89098589641.')
assert phone_pattern.search('89098589641 ')
assert phone_pattern.search('—Ç.89098589641')
assert phone_pattern.search('89024058769')
assert phone_pattern.search('—Ç. 89881370418')
assert phone_pattern.search('–∑–≤–æ–Ω–∏—Ç—å –ø–æ —Ç–µ–ª–µ—Ñ–æ–Ω—É 89004176707')
assert phone_pattern.search('8-777-571-70-54')
# Too many digits after / before the prefix: must not be reported as a phone number.
assert not phone_pattern.search('89024058769777')
assert not phone_pattern.search('8889024058769')
#assert phone_pattern.search('095-247-04-64') # TODO

def has_phone(string):
    """Return 1 if ``phone_pattern`` matches anywhere in ``string``, otherwise 0."""
    found = phone_pattern.search(string)
    return int(found is not None)

# Regex used by is_recipe() to flag recipe-like texts. It matches either
#   * digits, optional whitespace, one of several unit-like tokens, then a
#     non-word character, or
#   * one of two standalone keyword alternatives.
# Matching is case-insensitive (re.IGNORECASE).
# NOTE(review): the unit branch requires a trailing `\W`, so a unit at the very
# end of the string does not match (adv_pattern uses `(\W|$)` for its first
# alternative) - confirm this is intended.
# NOTE(review): non-raw string with regex escapes (`\d`, `\s`, `\.`, `\W`);
# prefer an r'' literal to avoid invalid-escape warnings.
# The asserts below are inline regression checks and run every time the cell runs.
recipe_pattern = re.compile('\d+\s?(–º–ª|–≥—Ä|—á\.\s?–ª|—Å—Ç\.\s?–ª|—Å—Ç–∞–∫–∞–Ω|—Å—Ç–∞–∫–∞–Ω–∞|–≥—Ä–∞–º–º)\W|–∏–Ω–≥—Ä–µ–¥–∏–µ–Ω—Ç—ã|—Ä–µ—Ü–µ–ø—Ç', flags=re.UNICODE | re.IGNORECASE)
assert recipe_pattern.search('100 –º–ª \n')
assert recipe_pattern.search('0,5 —á. –ª\n')
assert recipe_pattern.search('1 —Å—Ç.–ª\n')
assert recipe_pattern.search('—Å–ª–æ—ë–Ω–æ–µ —Ç–µ—Å—Ç–æ 400 –≥—Ä., ')
assert recipe_pattern.search('5 —Å—Ç–∞–∫–∞–Ω–∞ ')
assert recipe_pattern.search('5 –≥—Ä–∞–º–º ')

def is_recipe(string):
    """Return 1 if ``recipe_pattern`` matches anywhere in ``string``, otherwise 0."""
    found = recipe_pattern.search(string)
    return int(found is not None)

def remove_emoji(string):
    """Return ``string`` with every run of characters matched by ``emoji_pattern`` deleted."""
    # The original inline note listed a few emoticons and two kana characters
    # as examples of what gets stripped.
    cleaned = emoji_pattern.sub('', string)
    return cleaned

def create_text_features0(texts, result, doc2vec):
    """Write one inferred vector per row of ``texts`` into ``result['embedding']``.

    ``doc2vec.infer_vector`` is called once for every value of the
    ``preprocessed`` column; ``result`` is modified in place and nothing is
    returned.
    """
    token_lists = texts['preprocessed']
    result['embedding'] = token_lists.apply(doc2vec.infer_vector)

def qwe(x):
    """Infer a vector for every row of the batch ``x`` and return them as a Series.

    Uses the module-level ``doc2vec`` object (it is not a parameter here).
    NOTE(review): a later cell in this notebook rebinds the name ``qwe`` to an
    unrelated helper, so this definition is only valid until that cell runs.
    """
    token_lists = x['preprocessed']
    return token_lists.apply(doc2vec.infer_vector)
    
def create_text_features0_batched(texts, doc2vec):
    """Infer doc2vec vectors for ``texts`` in parallel, one batch per worker.

    Parameters:
        texts: DataFrame with a ``preprocessed`` column.
        doc2vec: model whose ``infer_vector`` is applied to every row; it is
            sent to the workers together with each batch.

    Returns:
        The per-batch Series produced by ``qwe2``, concatenated in batch order.

    Prints the number of batches before starting the pool.
    """
    # Half of the reported CPUs, but never fewer than one (Pool(0) is an error).
    cpus = max(1, int(cpu_count() / 2))
    n_rows = texts.shape[0]
    batch_size = int(n_rows / cpus + 1)
    # Positional slicing, so batching does not depend on the index labels
    # being a default 0..n-1 range.
    batches = [texts.iloc[start:start + batch_size] for start in range(0, n_rows, batch_size)]
    print('Batches count: %d' % len(batches))
    with Pool(cpus) as p:
        # Pass the model explicitly instead of relying on a global `doc2vec`
        # and on the name `qwe`, which a later cell rebinds.
        ret = p.starmap(qwe2, [(batch, doc2vec) for batch in batches])
    return pd.concat(ret)
    
def qwe2(x, doc2vec):
    """Apply ``doc2vec.infer_vector`` to each value of ``x.preprocessed`` and return the Series."""
    token_lists = x['preprocessed']
    vectors = token_lists.apply(doc2vec.infer_vector)
    return vectors
    
def create_text_features0_batched2(texts, doc2vec, batch_size=5):
    """Infer doc2vec vectors for ``texts`` in parallel using fixed-size batches.

    Parameters:
        texts: DataFrame with a ``preprocessed`` column.
        doc2vec: model whose ``infer_vector`` is applied to every row.
        batch_size: number of rows per batch (defaults to the previously
            hard-coded value of 5).

    Returns:
        The per-batch Series produced by ``qwe2``, concatenated in batch order.

    Prints the number of batches before starting the pool.
    """
    # Positional slicing, so batching does not depend on the index labels.
    batches = [(texts.iloc[start:start + batch_size], doc2vec)
               for start in range(0, texts.shape[0], batch_size)]
    print('Batches count: %d' % len(batches))
    # Never ask for zero workers (cpu_count() == 1 would otherwise give Pool(0)).
    with Pool(max(1, int(cpu_count() / 2))) as p:
        # Each batch is a (frame, model) tuple: starmap unpacks it into the two
        # positional arguments of qwe2, whereas map would pass the tuple as one.
        ret = p.starmap(qwe2, batches)
    return pd.concat(ret)

    
def create_text_features1(texts, result):
    """Add identifier, language and basic length/count columns to ``result`` in place.

    Columns written: ``objectId`` and ``lang`` (copied from ``texts``),
    ``len`` (length of ``text``), ``p_len`` (length of ``preprocessed``),
    ``q_count`` (number of '?' in ``text``), ``upper_count`` (number of
    upper-case characters) and ``emojis_count`` (characters removed by
    ``remove_emoji``). Nothing is returned.
    """
    result['objectId'] = texts.objectId
    result['lang'] = texts.lang
    result['len'] = texts.text.apply(len)
    result['p_len'] = texts.preprocessed.apply(len)
    # Raw literal: the backslash belongs to the regex, not to the Python string.
    result['q_count'] = texts.text.str.count(r'\?')
    # Per-character str.isupper() count; unlike np.char.isupper(list(s)) this
    # also works for an empty string (an empty list becomes a float array there).
    result['upper_count'] = texts.text.apply(lambda s: sum(map(str.isupper, s)))
    result['emojis_count'] = texts.text.apply(lambda s: len(s) - len(remove_emoji(s)))
    
def create_text_features2(texts, result):
    """Add link-related occurrence counts of ``texts.text`` to ``result`` in place.

    Columns written: ``youtube_count`` (``youtu.be`` or ``youtube.com``),
    ``links_count`` (the substring ``http``), ``ok_videos_count``
    (``ok.ru/video``) and ``ok_groups_count`` (``ok.ru/group``).
    Nothing is returned.
    """
    # Raw literals keep `\.` as a regex escape instead of an invalid string escape.
    result['youtube_count'] = texts.text.str.count(r'youtu\.be|youtube\.com')
    result['links_count'] = texts.text.str.count('http')
    result['ok_videos_count'] = texts.text.str.count(r'ok\.ru/video')
    result['ok_groups_count'] = texts.text.str.count(r'ok\.ru/group')
    
def create_text_features3(texts, result):
    """Add punctuation counts and the advert/recipe flags to ``result`` in place.

    Columns written: ``hashes_count`` ('#'), ``quotes_count`` (opening quote
    mark literal), ``mdots_count`` (non-overlapping pairs of dots),
    ``brackets_balance`` (absolute difference between '(' and ')' counts),
    ``is_adv`` and ``is_recipe`` (0/1 from the helpers of the same name).
    Nothing is returned.
    """
    result['hashes_count'] = texts.text.str.count('#')
    result['quotes_count'] = texts.text.str.count('¬´')
    # Raw literals keep the backslashes as regex escapes rather than invalid
    # string escapes.
    result['mdots_count'] = texts.text.str.count(r'\.\.')
    result['brackets_balance'] = (texts.text.str.count(r'\(') - texts.text.str.count(r'\)')).abs()
    result['is_adv'] = texts.text.apply(is_adv)
    result['is_recipe'] = texts.text.apply(is_recipe)
    
def create_text_features4(texts, result):
    """Add phone/poll flags, '!' and newline counts, and end-of-text punctuation flags.

    Writes ``has_phone``, ``is_poll``, ``e_count``, ``lines_count``, ``e_end``,
    ``q_end`` and ``d_end`` into ``result`` in place; nothing is returned.
    """
    raw = texts.text
    result['has_phone'] = raw.apply(has_phone)
    result['is_poll'] = raw.str.endswith('–û–ø—Ä–æ—Å—ã')
    result['e_count'] = raw.str.count('!')
    result['lines_count'] = raw.str.count('\n')
    # Same check three times: does the text, ignoring trailing whitespace,
    # end with the given punctuation mark?
    for column, mark in (('e_end', '!'), ('q_end', '?'), ('d_end', '.')):
        result[column] = raw.apply(lambda x, mark=mark: x.rstrip().endswith(mark))
    
def create_text_features5(texts, result):
    """Write ``md_count`` into ``result``: occurrences of date-word stems in ``texts.text``.

    The pattern is a plain alternation of literal stems (first line: twelve
    alternatives, second line: nine alternatives); matching is case-sensitive.
    ``result`` is modified in place and nothing is returned.

    The first alternative of the second line was misspelled (one wrong letter)
    and therefore could never match the word as it appears in the data; it is
    spelled correctly here.
    """
    result['md_count'] = texts.text.str.count('—è–Ω–≤–∞—Ä|—Ñ–µ–≤—Ä–∞–ª|–º–∞—Ä—Ç|–∞–ø—Ä–µ–ª|–º–∞–π|–∏—é–Ω|–∏—é–ª|–∞–≤–≥—É—Å—Ç|—Å–µ–Ω—Ç—è–±—Ä|–æ–∫—Ç—è–±—Ä|–Ω–æ—è–±—Ä|–¥–µ–∫–∞–±—Ä' +
                                  '|–ø–æ–Ω–µ–¥–µ–ª—å–Ω–∏–∫|–≤—Ç–æ—Ä–Ω–∏–∫|—Å—Ä–µ–¥–∞|—Å—Ä–µ–¥—É|—á–µ—Ç–≤–µ—Ä–≥|–ø—è—Ç–Ω–∏—Ü|—Å—É–±–±–æ—Ç–∞|—Å—É–±–±–æ—Ç—É|–≤–æ—Å–∫—Ä–µ—Å–µ')
    
    
def create_text_features(texts, doc2vec):
    """Build the full text-feature frame by running stages 0-5 in order.

    Each stage fills columns of a fresh DataFrame in place; a progress line is
    printed before every stage. Returns the populated DataFrame.
    """
    result = pd.DataFrame({})
    # (progress message, stage) pairs; the lambdas defer both the name lookup
    # and the call until the stage is actually reached.
    stages = (
        ('create_text_features 0...', lambda: create_text_features0(texts, result, doc2vec)),
        ('create_text_features 1...', lambda: create_text_features1(texts, result)),
        ('create_text_features 2...', lambda: create_text_features2(texts, result)),
        ('create_text_features 3...', lambda: create_text_features3(texts, result)),
        ('create_text_features 4...', lambda: create_text_features4(texts, result)),
        ('create_text_features 5...', lambda: create_text_features5(texts, result)),
    )
    for banner, run_stage in stages:
        print(banner)
        run_stage()
    return result

In [4]:
test_texts = parquet.read_table(input_path + '/texts/textsTest/').to_pandas()
train_texts = parquet.read_table(input_path + '/texts/textsTrain').to_pandas()

  labels, = index.labels


In [3]:
train_texts_shape_0 = parquet.read_table(input_path + '/texts/textsTrain').to_pandas().shape[0]

  labels, = index.labels


In [3]:
train_texts_shape_0 = 3410916

In [5]:
train_texts_shape_0 = train_texts.shape[0]

In [6]:
%%time
# Fit a unigram TF-IDF vectorizer on all documents and keep the result as float32.
# Each token list in `preprocessed` is joined with single spaces into one string.
# Row order of `matrix`: all train documents first, then all test documents.
vectorizer = TfidfVectorizer(ngram_range=(1,1), min_df=2)
matrix = vectorizer.fit_transform([' '.join(txt) for txt in train_texts.preprocessed] + 
                                  [' '.join(txt) for txt in test_texts.preprocessed]).astype(np.float32)
# Bare expression so the notebook displays the matrix summary.
matrix

CPU times: user 4min 36s, sys: 55.8 s, total: 5min 32s
Wall time: 6min 17s


In [7]:
del train_texts
del test_texts
gc.collect()

41

In [11]:
del feats
gc.collect()

20

In [8]:
sparse.save_npz(output_path + "/sparse_1gram.npz", matrix)

In [4]:
matrix = sparse.load_npz(output_path + "/sparse_1gram.npz")

In [5]:
%%time
svder = TruncatedSVD(n_components=120, n_iter=40)
feats = svder.fit_transform(matrix)
pd.DataFrame(feats[0:5])

CPU times: user 39min 53s, sys: 4min 7s, total: 44min 1s
Wall time: 37min 32s


In [6]:
# Split the SVD output by row position and pickle each part as a one-column
# ('embeddings') DataFrame: the first `train_texts_shape_0` rows go to the
# train file, the remaining rows to the test file.
# NOTE(review): this presumes the rows of `feats` are ordered train-then-test
# (i.e. the order in which `matrix` was built) - verify after re-running cells.
pd.DataFrame({'embeddings': list(feats[0:train_texts_shape_0])}).to_pickle(output_path + '/train_120_1_svd')
pd.DataFrame({'embeddings': list(feats[train_texts_shape_0:])}).to_pickle(output_path + '/test_120_1_svd')

In [10]:
feats.dtype

dtype('float32')

In [10]:
# Train a Doc2Vec model (dm=0, 100-dimensional) on test followed by train
# documents, drop temporary training data, and save it.
# Every document gets its own integer tag wrapped in a list. Passing the plain
# string 'tag' made gensim treat it as the sequence of tags 't', 'a', 'g',
# so all documents shared the same three tags.
# NOTE(review): with unique tags and keep_doctags_vectors=True the model keeps
# one vector per document; expect a larger model in memory and on disk.
all_token_lists = list(test_texts.preprocessed) + list(train_texts.preprocessed)
doc2vec = Doc2Vec([TaggedDocument(lines, [doc_id]) for doc_id, lines in enumerate(all_token_lists)],
                dm=0, vector_size=100, window=5, min_count=2, workers=8)
del all_token_lists  # large temporary; the model holds what it needs
doc2vec.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
doc2vec.save(output_path + '/doc2vec_all_0_100_5_2')
#doc2vec = Doc2Vec.load(output_path + '/doc2vec_all_23_5_2')

In [11]:
train_texts_features_embedding = train_texts.preprocessed.apply(doc2vec.infer_vector)
test_texts_features_embedding = test_texts.preprocessed.apply(doc2vec.infer_vector)

In [12]:
kclusterer = KMeansClusterer(3, distance=nltk.cluster.util.cosine_distance, repeats=15)
assigned_clusters3_15 = kclusterer.cluster(
    np.concatenate((train_texts_features_embedding, test_texts_features_embedding)), assign_clusters=True)
print('1')

1


In [15]:
pd.DataFrame({'assigned_clusters3_15': assigned_clusters3_15}).to_pickle(output_path + '/assigned_clusters3_15')

In [16]:
pd.DataFrame({'embeddings': train_texts_features_embedding}).to_pickle(output_path + '/train_100_embs')
pd.DataFrame({'embeddings': test_texts_features_embedding}).to_pickle(output_path + '/test_100_embs')

In [13]:
kclusterer = KMeansClusterer(6, distance=nltk.cluster.util.cosine_distance, repeats=15)
assigned_clusters6_15 = kclusterer.cluster(
    np.concatenate((train_texts_features_embedding, test_texts_features_embedding)), assign_clusters=True)
print('1')

KeyboardInterrupt: 

In [20]:
# Incremental update of the pickled feature frames.
# Disabled branch (kept for reference): loads the '...features2' pickles, adds
# an 'embedding' column inferred with doc2vec, and writes '...features3'.
if False:
    train_texts_features = pd.read_pickle(output_path + '/train_text_features2')
    train_texts_features['embedding'] = train_texts.preprocessed.apply(doc2vec.infer_vector)
    train_texts_features.to_pickle(output_path + '/train_text_features3')
    test_texts_features = pd.read_pickle(output_path + '/test_text_features2')
    test_texts_features['embedding'] = test_texts.preprocessed.apply(doc2vec.infer_vector)
    test_texts_features.to_pickle(output_path + '/test_text_features3')
# Active branch: loads the '...features7' pickles, lets create_text_features5
# add its column in place, and writes the result as '...features8'
# (test first, then train). Leaves both frames bound in the notebook namespace.
if True:
    test_texts_features = pd.read_pickle(output_path + '/test_text_features7')
    create_text_features5(test_texts, test_texts_features)
    test_texts_features.to_pickle(output_path + '/test_text_features8')
    train_texts_features = pd.read_pickle(output_path + '/train_text_features7')
    create_text_features5(train_texts, train_texts_features)
    train_texts_features.to_pickle(output_path + '/train_text_features8')

In [70]:
%%time
train_text_features = create_text_features(train_texts, doc2vec)

create_text_features 0...
create_text_features 1...
create_text_features 2...
create_text_features 3...


In [71]:
%%time
train_text_features.to_pickle(output_path + '/train_text_features')

CPU times: user 22.5 s, sys: 12.4 s, total: 34.9 s
Wall time: 42.1 s


In [73]:
del train_text_features
del train_texts

In [75]:
%%time
test_text_features = create_text_features(test_texts, doc2vec)

create_text_features 0...
create_text_features 1...
create_text_features 2...
create_text_features 3...
CPU times: user 11min 34s, sys: 8.25 s, total: 11min 42s
Wall time: 11min 47s


In [76]:
%%time
test_text_features.to_pickle(output_path + '/test_text_features')

CPU times: user 3.67 s, sys: 394 ms, total: 4.06 s
Wall time: 4.07 s


In [11]:
def qwe(x):
    """Return True if ``x``, ignoring trailing whitespace, ends with '.' or '?'.

    The original tested ``endswith('?')`` twice; the redundant check is removed
    without changing the result.
    NOTE(review): the duplicate may have been meant to be '!' - confirm.
    NOTE(review): this rebinds ``qwe``, which an earlier cell defines as a
    doc2vec batch helper.
    """
    return x.rstrip().endswith(('.', '?'))

In [13]:
test_texts.text.apply(qwe).mean()

0.18356570857106072

In [15]:
test_texts.text.apply(lambda x: x.rstrip().endswith('!')).mean()

0.08159507273671808

In [5]:
test_texts

Unnamed: 0,objectId,lang,text,preprocessed
0,517288,ru,–ö–≤–∞—Ä—Ç–∏—Ä–Ω–∏–∫ –ù–¢–í —É –ú–∞—Ä–≥—É–ª–∏—Å–∞_ –≥—Ä—É–ø–ø–∞ –ü–∏–ª–æ—Ç,"[–∫–≤–∞—Ä—Ç–∏—Ä–Ω–∏–∫, –Ω—Ç–≤, –º–∞—Ä–≥—É–ª–∏—Å, –≥—Ä—É–ø–ø, –ø–∏–ª–æ—Ç]"
1,9501964,ru,`–†–û–î–ò–¢–ï–õ–Ø–ú`- –û–ß–ï–ù–¨ –¢–†–û–ì–ê–¢–ï–õ–¨–ù–ê–Ø –ü–ï–°–ù–Ø!!! –ê—Ä—Ç—É—Ä...,"[—Ä–æ–¥–∏—Ç–µ–ª, –æ—á–µ–Ω, —Ç—Ä–æ–≥–∞—Ç–µ–ª—å–Ω, –ø–µ—Å–Ω, –∞—Ä—Ç—É—Ä, —Ö–∞–ª–∞—Ç]"
2,23007371,ru,–°—É—Ö–æ–π –°—É–ø–µ—Ä–¥–∂–µ—Ç-100 –æ—Ç–∫–∞–∂–µ—Ç—Å—è –æ—Ç –∑–∞–ø–∞–¥–Ω—ã—Ö –∫–æ–º–ø...,"[—Å—É—Ö, —Å—É–ø–µ—Ä–¥–∂–µ—Ç, –æ—Ç–∫–∞–∂–µ—Ç, –∑–∞–ø–∞–¥–Ω, –∫–æ–º–ø–ª–µ–∫—Ç, —Ç–µ..."
3,38353886,Unknown,"–°–≥–æ—Ä–µ–ª —Å–∞—Ä–∞–π, –≥–æ—Ä–∏ –∏ —Ö–∞—Ç–∞","[—Å–≥–æ—Ä–µ–ª, —Å–∞—Ä–∞, –≥–æ—Ä, —Ö–∞—Ç]"
4,21192138,ru,–ñ–∏–≤–æ–¥—ë—Ä—ã –æ—Ç—Ä—É–±–∏–ª–∏ –ª–∞–ø—ã —Å–æ–±–∞–∫–µ 355576121950 ok....,"[–∂–∏–≤–æ–¥—ë—Ä, –æ—Ç—Ä—É–±, –ª–∞–ø, —Å–æ–±–∞–∫]"
5,26415073,ru,ok.ru/group/53061241012348 ok.ru/interessnosti...,"[–∞–∂—É—Ä–Ω, –º–∞–∫–æ–≤, –∫—É–ª–∏—á, —Ç–∞–∫, –Ω–µ–∂–Ω, –≤–æ–∑–¥—É—à–Ω, –º—è–∫–∏..."
6,36734526,ru,ok.ru/group58228491681845 ok.ru/group/58228491...,"[–¥—Ä—É–≥, —É—Å–Ω—É–ª, –±–∞—Ä, —Ç–µ–±, –Ω—É–∂–Ω, —Ç–∞—â, –¥–æ–º, –æ—á–µ–Ω, ..."
7,8699823,ru,"–ù–∏–∫–∏—Ñ–æ—Ä–æ–≤ –¥–µ–Ω—å\n–î–∞—Ç–∞ –≤ 2018 –≥–æ–¥—É:\t\t26 –º–∞—Ä—Ç–∞,...","[–Ω–∏–∫–∏—Ñ–æ—Ä, –¥–µ–Ω, –¥–∞—Ç, –≥–æ–¥, –º–∞—Ä—Ç, –ø–æ–Ω–µ–¥–µ–ª—å–Ω–∏–∫, –¥—Ä..."
8,12236843,Unknown,ok.ru/group/52742023348461 ok.ru/group52742023...,[]
9,38393782,ru,"–î–ù–† –∏ –õ–ù–†, –Ω–æ–≤–æ—Å—Ç–∏: –í–æ–π—Å–∫–∞ –±–µ—Ä—É—Ç ¬´–≤ –∫–æ–ª—å—Ü–æ¬ª –≤ ...","[–¥–Ω—Ä, –ª–Ω—Ä, –Ω–æ–≤–æ—Å—Ç, –≤–æ–π—Å–∫, –±–µ—Ä—É—Ç, –∫–æ–ª—å—Ü, –¥–æ–Ω–±–∞—Å..."


In [11]:
# Manual inspection: print test texts 50400..54999, each under a header line
# with the row label and the names of the detectors that fired on it.
# NOTE(review): this prints thousands of documents; the recorded run hit the
# notebook's IOPub data rate limit.
for i in range(50400, 55000):
    t = test_texts.loc[i,:].text
    # Detectors are evaluated in the order phone, adv, recipe.
    tags = [label
            for label, detector in (('phone', has_phone), ('adv', is_adv), ('recipe', is_recipe))
            if detector(t)]
    header = '\n\n---------------------- %d    %s ----------\n\n' % (i, ' '.join(tags))
    print(header)
    print(t)



---------------------- 50400     ----------


http://dushadevushki.me/2017/11/30/privychka-spat-v-obnimku/?utm_source=contentmoney&utm_medium=99Dt20qc8eBC http://dushadevushki.me/2017/11/30/privychka-spat-v-obnimku/ –ï—Å–ª–∏ —É –≤–∞—Å –ø—Ä–∏–≤—ã—á–∫–∞ —Å–ø–∞—Ç—å –æ—Ç–¥–µ–ª—å–Ω–æ, —Ç–æ –≤–∞–º —Å—Ç–æ–∏—Ç –ø–µ—Ä–µ—Å–º–æ—Ç—Ä–µ—Ç—å —Å–≤–æ–∏ –ø—Ä–∏–≤—ã—á–∫–∏. –°–ø–∞—Ç—å –≤ –æ–±–Ω–∏–º–∫—É –∏ –≥–æ–ª—ã—à–æ–º –º–æ–∂–µ—Ç –ø—Ä–∏–Ω–µ—Å—Ç–∏ –±–æ–ª—å—à—É—é –ø–æ–ª—å–∑—É –ù–∞—Å–∫–æ–ª—å–∫–æ –ø–æ–ª–µ–∑–Ω–∞ –ø—Ä–∏–≤—ã—á–∫–∞ —Å–ø–∞—Ç—å –≤ –æ–±–Ω–∏–º–∫—É?


---------------------- 50401     ----------


–¢–≤–æ—Ä—á–µ—Å–∫–∞—è –º–∞—Å—Ç–µ—Ä—Å–∫–∞—è "–ê–ù–ê". –¶–≤–µ—Ç—ã –∏–∑ —à–µ–ª–∫–∞. –†—É—á–Ω–∞—è —Ä–∞–±–æ—Ç–∞. #—Ü–≤–µ—Ç—ã #—Ü–≤–µ—Ç—ã–∏–∑—à–µ–ª–∫–∞ #—Ü–≤–µ—Ç—ã—Ä—É—á–Ω–∞—è—Ä–∞–±–æ—Ç–∞ #—Ä—É—á–Ω–∞—è—Ä–∞–±–æ—Ç–∞


---------------------- 50402    adv ----------


–ë–ª—É–∑–∞ ¬´–®–ê–ù–ï–õ–¨¬ª
–¶–≤–µ—Ç: –±–æ—Ä–¥–æ
–†–∞–∑–º–µ—Ä—ã: 50-64
–¶–µ–Ω–∞:   1700 —Ä.

–Æ–±–∫–∞ ¬´–õ–ï–ô–õ–ê¬ª
–¶–≤–µ—Ç: —á—ë—Ä–Ω—ã–π
–†–∞–∑–º–µ—Ä

–í–æ–∑–º–æ–∂–Ω–æ—Å—Ç—å –¥–∞–µ—Ç—Å—è —Ç–µ–º, –∫—Ç–æ –º–µ—á—Ç–∞–µ—Ç... –ß—É–¥–æ –¥–∞–µ—Ç—Å—è —Ç–µ–º, –∫—Ç–æ –≤–µ—Ä–∏—Ç...


---------------------- 50926     ----------


¬´***–î–æ–±—Ä—ã–π –≤–µ—á–µ—Ä!–õ—é–±–≤–∏ –∏ —Å—á–∞—Å—Ç—å—è!***¬ªhttp://www.playcast.ru/view/11202976/7a7d1439f0732d5542d775aee2bfb3cc62022217pl


---------------------- 50927     ----------


–î–õ–Ø –ú–Ø–°–ê: –∫—Ä–∞—Å–Ω—ã–π, —á–µ—Ä–Ω—ã–π, –¥—É—à–∏—Å—Ç—ã–π –ø–µ—Ä–µ—Ü –∏–ª–∏ –≥–≤–æ–∑–¥–∏–∫–∞, –º–∞–π–æ—Ä–∞–Ω, —Ç–∏–º—å—è–Ω, —Ç–º–∏–Ω, –∫—É—Ä–∫—É–º–∞, –ª—É–∫, –æ—Ä–µ–≥–∞–Ω–æ.

–î–õ–Ø –ü–¢–ò–¶–´: —Ç–∏–º—å—è–Ω, –º–∞–π–æ—Ä–∞–Ω, —Ä–æ–∑–º–∞—Ä–∏–Ω, —à–∞–ª—Ñ–µ–π, —á–∞–±—Ä–µ—Ü, –±–∞–∑–∏–ª–∏–∫.

–î–õ–Ø –†–´–ë–´: –ª–∞–≤—Ä–æ–≤—ã–π –ª–∏—Å—Ç, –±–µ–ª—ã–π –ø–µ—Ä–µ—Ü, –∏–º–±–∏—Ä—å, –¥—É—à–∏—Å—Ç—ã–π –ø–µ—Ä–µ—Ü, –ª—É–∫, –∫–æ—Ä–∏–∞–Ω–¥—Ä, –ø–µ—Ä–µ—Ü —á–∏–ª–∏, –≥–æ—Ä—á–∏—Ü–∞, —É–∫—Ä–æ–ø, —Ç–∏–º—å—è–Ω.

–î–õ–Ø –ì–†–ò–õ–Ø: –∫—Ä–∞—Å–Ω—ã–π –ø–µ—Ä–µ—Ü, –¥—É—à–∏—Å—Ç—ã–π –ø–µ—Ä–µ—Ü, –∫–∞—Ä–¥–∞–º–æ–Ω, —Ç–∏–º—å—è–Ω, –º–∞–π–æ—Ä–∞–Ω, –º—É—Å–∫–∞—Ç–Ω—ã–π –æ—Ä–µ


–ü–∏–Ω–µ—Ç–∫–∏-–±–∞—à–º–∞—á–∫–∏
–û–ø–∏—Å–∞–Ω–∏–µ: http://pinetki-bashmachki.vjagu.ru/23/188/6718/


---------------------- 51262     ----------


https://item.taobao.com/item.htm?spm=2013.1.0.0.516534d5OYWwyx&id=563960878477&scm=1007.12144.81309.42296_42296&pvid=fa255e02-08d8-4892-b3cb-adb845526ad5&utparam=%7B%22x_object_type%22%3A%22item%22%2C%22x_object_id%22%3A563960878477%7D&utparam=%7B%22x_obj


---------------------- 51263     ----------


–û–ª–µ–≥ –ì–∞–∑–º–∞–Ω–æ–≤, –∑–∞—á–µ–º —Ç—ã –æ—Ç–ø—Ä–∞–≤–∏–ª —Ä–µ–±—è—Ç –≤ –ê–¢–û? RSA-R7kYXr4 ok.ru/video/1291089482013


---------------------- 51264     ----------


–ù–µ–≤–æ–∑–º–æ–∂–Ω–æ —Å–¥–µ—Ä–∂–∞—Ç—å —É–ª—ã–±–∫—É )


---------------------- 51265     ----------


Havas qildim piyolaga. Choynak unga boshin egar. Lablari uning doim. Go'zal lablarga tegar.


---------------------- 51266     ----------


’Ä’∏÷Ç’¥’∏÷Ä’°’µ’´’∂,’∂’∏÷Ä ’Ω’´’©÷Ñ’∏’¥ ’°’¥’•’∂’°’¨’°’æ ’§’•÷Ä’°’Ω’°’∂’∂’•÷Ä’´ ’¥’°’Ω’∂’°’Ø÷Å’∏÷Ç’©’µ’°’¥’¢..‘¥’´’ø’•÷Ñ’ù http://zarmanq.c


–°–°–°–† –∑–∏–º–∞ 1985. –°–≤–∞–¥—å–±–∞ Ôøº


---------------------- 51655     ----------


–ò–≥—Ä–∞ –≤ —Ñ—É—Ç–±–æ–ª —Ç–∞—á–∫–∞–º–∏ http://cdn-ok.com/embed?id=160530533&format=1&sig=b86e5ec668ae6af8&rand=1522421290 https://cdn-ok.com/embed?id=160530533&format=1&sig=b86e5ec668ae6af8&rand=1522421290


---------------------- 51656     ----------


–ê–ª—å–±–∞ ‚Ä¢ 10 –º–µ—Å—Ç, –ó–ê–ü–†–ï–©–ï–ù–ù–´–• –∫ –ü–û–°–ï–©–ï–ù–ò–Æ http://out.pladform.ru/player?pl=18079&type=html5&videoid=101978500 ok.ru/video/242340664381


---------------------- 51657    recipe ----------


–ë–ê–õ–¨–ó–ê–ú –°–í–Ø–©–ï–ù–ù–ò–ö–ê
 –û–¥–Ω–∞–∂–¥—ã –≤ –ø–æ–µ–∑–¥–µ –º–æ–∏–º —Å–æ—Å–µ–¥–æ–º –æ–∫–∞–∑–∞–ª—Å—è —Å–≤—è—â–µ–Ω–Ω–∏–∫. –û–Ω —É–≤–∏–¥–µ–ª, —á—Ç–æ —è –ø—å—é —Å–µ—Ä–¥–µ—á–Ω—ã–µ –∫–∞–ø–ª–∏, —Ä–∞—Å—Å–ø—Ä–æ—Å–∏–ª –æ –±–æ–ª–µ–∑–Ω—è—Ö –∏ –ø–æ—Ä–µ–∫–æ–º–µ–Ω–¥–æ–≤–∞–ª —Ä–µ—Ü–µ–ø—Ç –±–∞–ª—å–∑–∞–º–∞.
–ù–∞–¥–æ –≤–∑—è—Ç—å:
- –≤–µ—Ç–æ—á–∫–∏ –º–∞–ª–∏–Ω—ã (–ø–µ—Ä–≤–æ–≥–æ –≥–æ–¥–∞), –≤–∏—à–Ω–∏, —á–µ—Ä–µ–º—É—Ö–∏
- –ø–æ —Å—Ç–æ–ª–æ–≤–æ–π –ª

–ó–≤–æ–Ω–∏—Ç–µ 80256437768 (–í–∞–π–±–µ—Ä –∏ —Ç.–¥.). –í –Ω–∞–ª–∏—á–∏–∏ –∏ –ø–æ–¥ –∑–∞–∫–∞–∑ –≤—Å–µ —Ä–∞–∑–º–µ—Ä—ã!!!–í–æ–∑–º–æ–∂–Ω–∞ –¥–æ—Å—Ç–∞–≤–∫–∞ –ø–æ –†–ë –≤ —Ç–µ—á–µ–Ω–∏–∏ 3-5 –¥–Ω–µ–π!!! –û–ø—Ç –∏ —Ä–æ–∑–Ω–∏—Ü–∞.–í–æ–∑–º–æ–∂–Ω–∞ –æ–ø–ª–∞—Ç–∞ –ø–æ –∫–∞—Ä—Ç–µ –•–∞–ª–≤–∞.–¶–µ–Ω–∞ 80 —Ä—É–±–ª–µ–π.


---------------------- 52011     ----------


–ò—é–ª—å, –∂–∞—Ä–∫–æ, –≥—Ä–æ–∑–∞ –≤—Å—é –Ω–æ—á—å. –ó–∞–∫—Ä—ã–≤–∞—é –±–∞—Ä, —É–∂–µ —á–µ—Ç—ã—Ä–µ —É—Ç—Ä–∞ –≥–¥–µ-—Ç–æ, –¥–æ—Å—Ç–∞–ª –∫–ª—é—á–∏ ‚Äî —Å–æ–±–∏—Ä–∞—é—Å—å –Ω–∞ —Å–∏–≥–Ω–∞–ª–∫—É –ø–æ—Å—Ç–∞–≤–∏—Ç—å. –ò —Ç—É—Ç –∑–∞–±–µ–≥–∞–µ—Ç —á—É–≤–∞–∫ - –æ–¥–µ—Ç—ã–π —Ç–æ–ª—å–∫–æ –≤ —á–µ—Ä–Ω—ã–π —Ö–∞–ª–∞—Ç —Å –∫–∞–ø—é—à–æ–Ω–æ–º –Ω–∞ –≥–æ–ª–æ–µ —Ç–µ–ª–æ, –≤ –±–∞–Ω–Ω—ã—Ö —Ç–∞–ø–æ—á–∫–∞—Ö –∏ –∫–æ–∂–∞–Ω—ã—Ö –ø–µ—Ä—á–∞—Ç–∫–∞—Ö. –ì–æ–≤–æ—Ä–∏—Ç, —á—Ç–æ –µ–º—É –Ω–∞–¥–æ —Å–ø—Ä—è—Ç–∞—Ç—å—Å—è, –ø—Ä–æ—Å–∏—Ç –∑–∞–∫—Ä—ã—Ç—å –¥–≤–µ—Ä—å. –Ø –ø–æ–Ω—è–ª —Ç–æ–ª—å–∫–æ, —á—Ç–æ –∑–∞ –Ω–∏–º –∫—Ç–æ-—Ç–æ –≥–æ–Ω–∏—Ç—Å—è, –∏ –∑–∞–∫—Ä—ã–ª –∂–∞–ª—é–∑–∏. –ù–∞–ª–∏–ª –µ–º—É –≤–∏—Å–∫


–†–æ–¥–∏—Ç–µ–ª–∏ –æ—Ç–≤–æ–¥–∏–ª–∏ –¥–µ—Ç–µ–π –≤ –¥–µ—Ç.—Å–∞–¥ –∏ –Ω–µ –∑–Ω–∞–ª–∏,—á—Ç–æ —Ç–∞–º –ø—Ä–æ–∏—Å—Ö–æ–¥–∏—Ç!ÔøºÔøº 
–ü–æ–∫–∞ –≤ —Å–∞–¥–∏–∫ –Ω–µ –ø—Ä–∏—à–ª–∞ –Ω–æ–≤–∞—è –Ω—è–Ω–µ—á–∫–∞..Ôøº. –ø—Ä–æ–¥–æ–ª–∂–µ–Ω–∏–µ –≤ –≥—Ä—É–ø–ø–µ https://ok.ru/group55349135605783


---------------------- 52426     ----------


–õ–∏—Ü–µ–Ω–∑–∏–æ–Ω–Ω—ã–µ –ú—É–∂—Å–∫–∏–µ, –ñ–µ–Ω—Å–∫–∏–µ –∞—Ä–æ–º–∞—Ç—ã –ì–æ–ª–ª–∞–Ω–¥—Å–∫–æ–≥–æ –ø—Ä–æ–∏–∑–≤–æ–¥—Å—Ç–≤–∞. –î–æ—Å—Ç—É–ø–Ω–∞—è —Ü–µ–Ω–∞! –ò–¥–µ–Ω—Ç–∏—á–Ω–æ—Å—Ç—å –æ—Ä–∏–≥–∏–Ω–∞–ª—É! –°—Ç–æ–π–∫–æ—Å—Ç—å 6-8 —á–∞—Å–æ–≤! –ü–æ–¥—Ä–æ–±–Ω–µ–µ –≤ –∏–Ω—Ç–µ—Ä–Ω–µ—Ç –º–∞–≥–∞–∑–∏–Ω–µ http://natalina.in.ua/shop/elitnaja_parfjumerija/index.html


---------------------- 52427     ----------


–°–ø–∞—Å–∏–±–æ –∑–∞ –≤–∞—à—É –≤–µ—Ä—É –∏ –ø–æ–¥–¥–µ—Ä–∂–∫—É! 403525405258 ok.ru/video/638974888522


---------------------- 52428     ----------


–ú–£–†–ê–ï–í. –í–´–î–í–û–†–ï–ù–ò–ï –†–û–°–°–ò–ô–°–ö–ò–• –î–ò–ü–õ–û–ú–ê–¢–û–í, –≠–¢–û –§–õ–ï–®–ú–û–ë –ò –ü–û–†–û–®–ï–ù–ö–û –•–û–ß–ï–¢ –ù–†–ê


–ê–ª—å–±–∞ ‚Ä¢ 10 –°–£–ú–ê–°–®–ï–î–®–ò–•, –∫–æ—Ç–æ—Ä—ã–µ –ó–ê–ü–û–°–¢–ò–õ–ò –®–û–ö–ò–†–£–Æ–©–ò–ï –≤–µ—â–∏ –≤ –°–û–¶–°–ï–¢–Ø–• http://out.pladform.ru/player?pl=18079&type=html5&videoid=101968178 ok.ru/video/241899607259


---------------------- 52784    adv ----------


—Å—Ç—Ä–∞–∑—ã –≥–æ—Ä–Ω—ã–π —Ö—Ä—É—Å—Ç–∞–ª—å –ø—Ä–∏—à–∏–≤–Ω—ã–µ 4 –º–º 1 —Ä—É–±./—à—Ç.


---------------------- 52785    recipe ----------


–õ–æ–¥–æ—á–∫–∏ –∏–∑ —Å–ª–æ–µ–Ω–æ–≥–æ —Ç–µ—Å—Ç–∞ —Å –∫–∞—Ä—Ç–æ—Ñ–µ–ª–µ–º, –º—è—Å–æ–º –∏ –æ–≥—É—Ä—á–∏–∫–∞–º–∏ –≤ –¥–æ–º–∞—à–Ω–∏—Ö —É—Å–ª–æ–≤–∏—è—Ö!  –†–µ—Ü–µ–ø—Ç —Å —Ñ–æ—Ç–æ: >>>  http://http://r1.0-ede.ru/lodochki-iz-sloenogo-testa-s-kartofelem-myasom-i-ogurchikami/ –ë–æ–ª—å—à–µ –≤–∫—É—Å–Ω—ã—Ö —Ä–µ—Ü–µ–ø—Ç–æ–≤ –≤ –≥—Ä—É–ø–ø–µ:
https://ok.ru/group/56838620905523 ok.ru/proshuk ok.ru/group/56838620905523


---------------------- 52786     ----------


–û—Å—Ça–≤–ª—è—è —à—Äa–º—ã –¥—Ä—É–≥–∏–º —Å–≤–æ–∏ –Ω–µ –∏–∑–ªe—á–∏—à—å.


---------------------- 52787     ----------


—Ö—É–∂–µ –≤—Å–µ–≥–æ 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [17]:
train_texts.text.str.count('—è–Ω–≤–∞—Ä|—Ñ–µ–≤—Ä–∞–ª|–º–∞—Ä—Ç|–∞–ø—Ä–µ–ª|–º–∞–π|–∏—é–Ω|–∏—é–ª|–∞–≤–≥—É—Å—Ç|—Å–µ–Ω—Ç—è–±—Ä|–æ–∫—Ç—è–±—Ä|–Ω–æ—è–±—Ä|–¥–µ–∫–∞–±—Ä' + 
                                  '|–ø–æ–Ω–µ–¥–µ–Ω—å–Ω–∏–∫|–≤—Ç–æ—Ä–Ω–∏–∫|—Å—Ä–µ–¥–∞|—Å—Ä–µ–¥—É|—á–µ—Ç–≤–µ—Ä–≥|–ø—è—Ç–Ω–∏—Ü|—Å—É–±–±–æ—Ç–∞|—Å—É–±–±–æ—Ç—É|–≤–æ—Å–∫—Ä–µ—Å–µ') \
        .clip(upper=1).mean()

0.0892525644137821

In [15]:
679252 / train_texts.shape[0]

0.19914064139955368

In [7]:
train_texts.shape

(3410916, 4)

In [11]:
train_texts.objectId.unique().shape

(3352714,)

In [41]:
train_texts.head(100000).text.str.count('\.(mpg|mp4|mp3)').sum()

207

In [7]:
train_texts

Unnamed: 0,objectId,lang,text,preprocessed
0,11181946,ru,"–ü–∏—Ç–∫–µ—Ä–Ω–∏—è\r\n\r\n–û—á–µ–Ω—å –∏–Ω—Ç–µ—Ä–µ—Å–Ω–æ–µ —Ä–∞—Å—Ç–µ–Ω–∏–µ, –ø—Ä...","[–ø–∏—Ç–∫–µ—Ä–Ω, –æ—á–µ–Ω, –∏–Ω—Ç–µ—Ä–µ—Å–Ω, —Ä–∞—Å—Ç–µ–Ω, –ø—Ä–æ–∏–∑—Ä–∞—Å—Ç–∞, ..."
1,12040268,Unknown,"–Ø—Ö—Ç—ã, –æ–ª–∏–≥–∞—Ä—Ö–∏, –ø—Ä–æ—Å—Ç–∏—Ç—É—Ç–∫–∏: —Å–µ–∫—Å-–æ—Ö–æ—Ç–Ω–∏—Ü–∞ —Ä–∞–∑...","[—è—Ö—Ç, –æ–ª–∏–≥–∞—Ä—Ö, –ø—Ä–æ—Å—Ç–∏—Ç—É—Ç–∫, —Å–µ–∫—Å, –æ—Ö–æ—Ç–Ω–∏—Ü, —Ä–∞–∑–æ..."
2,14050867,ru,"–ö—Ç–æ-—Ç–æ –≥–∏–±–Ω–µ—Ç –≤ –±–æ—é, –ø–æ–¥—Ä—ã–≤–∞—è —Å–µ–±—è –≥—Ä–∞–Ω–∞—Ç–æ–π, –∞...","[–≥–∏–±–Ω–µ—Ç, –±–æ, –ø–æ–¥—Ä—ã–≤, –≥—Ä–∞–Ω–∞—Ç, –∫–æ–≥, –≤–µ–¥—É—Ç, –±–æ, –≤..."
3,17023591,ru,–û—Ç–Ω–æ—à–µ–Ω–∏—è: –∫–æ–≥–¥–∞ –ø—Ä–æ–∏—Å—Ö–æ–¥–∏—Ç –≤—ã–±–æ—Ä? –°–≤—è—Ç–æ—Å–ª–∞–≤ –†...,"[–æ—Ç–Ω–æ—à–µ–Ω, –ø—Ä–æ–∏—Å—Ö–æ–¥, –≤—ã–±–æ—Ä, —Å–≤—è—Ç–æ—Å–ª–∞, —Ä–∞–π–∫, —á–∏—Ç..."
4,18389833,Unknown,ok.ru/group/51094392012955 ok.ru/giflive ok.ru...,[]
5,24344214,ru,–§–ö ¬´–†–æ—Ç–æ—Ä-–í–æ–ª–≥–æ–≥—Ä–∞–¥¬ª - ¬´–°–ø–∞—Ä—Ç–∞–∫-2¬ª (–ú–æ—Å–∫–≤–∞) 1:...,"[—Ñ–∫, —Ä–æ—Ç–æ—Ä, –≤–æ–ª–≥–æ–≥—Ä–∞–¥, —Å–ø–∞—Ä—Ç–∞–∫, –º–æ—Å–∫–≤, –≥–æ–ª, –∞–ø..."
6,24544853,ru,–ò. –ê–Ω–∏—Å–∏–º–æ–≤ - –ò—Å—Ç–æ—Ä–∏—è –≤–∑—è—Ç–∞—è –∏–∑ –∂–∏–∑–Ω–∏ (—Å—Ç–∏—Ö–∏) ...,"[–∞–Ω–∏—Å–∏–º, –∏—Å—Ç–æ—Ä, –≤–∑—è—Ç, –∂–∏–∑–Ω, —Å—Ç–∏—Ö]"
7,24583002,ru,"–ö–∞–∫ –±—ã –≤—ã–≥–ª—è–¥–µ–ª–∏ —Ö–∏–º–∏—á–µ—Å–∫–∏–µ —ç–ª–µ–º–µ–Ω—Ç—ã, –µ—Å–ª–∏ –±—ã ...","[–≤—ã–≥–ª—è–¥–µ–ª, —Ö–∏–º–∏—á–µ—Å–∫, —ç–ª–µ–º–µ–Ω—Ç, –ª—é–¥—å–º]"
8,26053581,ru,–û—á–µ–Ω—å —Å—Ç—Ä–∞–Ω–Ω—ã–µ –¥–µ–ª–∞ –ø—Ä–æ–∏—Å—Ö–æ–¥—è—Ç —Å –¢–æ–º–æ–º –•–∞—Ä–¥–∏ –≤...,"[–æ—á–µ–Ω, —Å—Ç—Ä–∞–Ω, –¥–µ–ª, –ø—Ä–æ–∏—Å—Ö–æ–¥, —Ç–æ–º, —Ö–∞—Ä–¥, –ø–µ—Ä–≤, ..."
9,26853081,ru,–î–ï–†–ï–í–Ø–ù–ù–ê–Ø –õ–ï–°–¢–ù–ò–¶–ê - –∏–∑ –Ω–∞—Ç—É—Ä–∞–ª—å–Ω–æ–≥–æ –¥–µ—Ä–µ–≤–∞ ...,"[–¥–µ—Ä–µ–≤—è, –ª–µ—Å—Ç–Ω–∏—Ü, –Ω–∞—Ç—É—Ä–∞–ª—å–Ω, –¥–µ—Ä–µ–≤, –æ—Å—Ç–∞—ë—Ç, –ø—Ä..."


In [14]:
train_texts[['objectId', 'lang']].groupby('objectId').agg({'lang': 'count'})['lang'].reset_index() \
    .sort_values(['lang'], ascending=[False])

Unnamed: 0,objectId,lang
31766,487941,277
42767,556010,255
6477,339044,185
41284,546672,106
2,1283,73
44768,568009,59
41877,550628,55
23134,437460,55
1,677,41
1343547,18229844,33


In [11]:
a = train_texts[train_texts.objectId == 19465260]

In [12]:
for i in range(a.shape[0]):
    print(a.iloc[i,:].text)
    print('----------------------------------------------')

Scorpions- "Is There Anybody There" 1979 TV 50955946725 https://youtu.be/R49lfk9Y2b0
----------------------------------------------


In [21]:
train_texts[train_texts.objectId == 1195330]

Unnamed: 0,objectId,lang,text,preprocessed
165288,1195330,Unknown,160—Ä—É–±. - –ö–û–õ–¨–¶–û. –û–°–¢–ê–¢–ö–ò –†–ê–ó–ú–ï–†–û–í:\n3–º–º: 18.8...,"[—Ä—É–±, –∫–æ–ª—å—Ü, –æ—Å—Ç–∞—Ç–∫, —Ä–∞–∑–º–µ—Ä, –º–º, –º–º, xuping, –ø..."
613512,1195330,Unknown,160—Ä—É–±. - –ö–û–õ–¨–¶–û. –û–°–¢–ê–¢–ö–ò –†–ê–ó–ú–ï–†–û–í:\n3–º–º: 18(1...,"[—Ä—É–±, –∫–æ–ª—å—Ü, –æ—Å—Ç–∞—Ç–∫, —Ä–∞–∑–º–µ—Ä, –º–º, –º–º, xuping, –ø..."
646510,1195330,Unknown,"150—Ä—É–±. - –ö–û–õ–¨–¶–û –û–ë–†–£–ß–ê–õ–¨–ù–û–ï ""XUPING"":\n3–º–º: 1...","[—Ä—É–±, –∫–æ–ª—å—Ü, –æ–±—Ä—É—á–∞–ª—å–Ω, xuping, –º–º, –º–º, xuping..."
1106002,1195330,Unknown,"150—Ä—É–±. - –ö–û–õ–¨–¶–û –û–ë–†–£–ß–ê–õ–¨–ù–û–ï ""XUPING"":\n3–º–º: 1...","[—Ä—É–±, –∫–æ–ª—å—Ü, –æ–±—Ä—É—á–∞–ª—å–Ω, xuping, –º–º, –º–º, xuping..."
1115746,1195330,Unknown,"150—Ä—É–±. - –ö–û–õ–¨–¶–û –û–ë–†–£–ß–ê–õ–¨–ù–û–ï ""XUPING"":\n3–º–º: 1...","[—Ä—É–±, –∫–æ–ª—å—Ü, –æ–±—Ä—É—á–∞–ª—å–Ω, xuping, –º–º, –º–º, xuping..."
1605411,1195330,ru,"150—Ä—É–±. - –ö–û–õ–¨–¶–û –û–ë–†–£–ß–ê–õ–¨–ù–û–ï ""XUPING"":\n3–º–º: 1...","[—Ä—É–±, –∫–æ–ª—å—Ü, –æ–±—Ä—É—á–∞–ª—å–Ω, xuping, –º–º, –º–º, xuping..."
1648239,1195330,ru,"150—Ä—É–±. - –ö–û–õ–¨–¶–û –û–ë–†–£–ß–ê–õ–¨–ù–û–ï ""XUPING"":\n3–º–º: 1...","[—Ä—É–±, –∫–æ–ª—å—Ü, –æ–±—Ä—É—á–∞–ª—å–Ω, xuping, –º–º, –º–º, xuping..."
1691453,1195330,Unknown,"150—Ä—É–±. - –ö–û–õ–¨–¶–û –û–ë–†–£–ß–ê–õ–¨–ù–û–ï ""XUPING"":\n3–º–º: 1...","[—Ä—É–±, –∫–æ–ª—å—Ü, –æ–±—Ä—É—á–∞–ª—å–Ω, xuping, –º–º, –º–º, xuping..."
1708836,1195330,Unknown,"150—Ä—É–±. - –ö–û–õ–¨–¶–û –û–ë–†–£–ß–ê–õ–¨–ù–û–ï ""XUPING"":\n3–º–º: 1...","[—Ä—É–±, –∫–æ–ª—å—Ü, –æ–±—Ä—É—á–∞–ª—å–Ω, xuping, –º–º, –º–º, xuping..."
1757533,1195330,ru,"150—Ä—É–±. - –ö–û–õ–¨–¶–û –û–ë–†–£–ß–ê–õ–¨–ù–û–ï ""XUPING"":\n3–º–º: 1...","[—Ä—É–±, –∫–æ–ª—å—Ü, –æ–±—Ä—É—á–∞–ª—å–Ω, xuping, –º–º, –º–º, xuping..."


In [9]:
train_texts[train_texts.objectId == 24762087]

Unnamed: 0,objectId,lang,text,preprocessed
222919,24762087,ru,"–ü—Ä–∞–≤–æ—Å–ª–∞–≤–Ω—ã–π ‚Ä† –∫–∞–ª–µ–Ω–¥–∞—Ä—å. –í—Ç–æ—Ä–Ω–∏–∫, 27 –º–∞—Ä—Ç–∞, 2...","[–ø—Ä–∞–≤–æ—Å–ª–∞–≤–Ω, –∫–∞–ª–µ–Ω–¥–∞—Ä, –≤—Ç–æ—Ä–Ω–∏–∫, –º–∞—Ä—Ç, –º–∞—Ä—Ç, —Å—Ç..."
323206,24762087,ru,"–ü—Ä–∞–≤–æ—Å–ª–∞–≤–Ω—ã–π ‚Ä† –∫–∞–ª–µ–Ω–¥–∞—Ä—å.¬†–ß–µ—Ç–≤–µ—Ä–≥, 29 –º–∞—Ä—Ç–∞, 2...","[–ø—Ä–∞–≤–æ—Å–ª–∞–≤–Ω, –∫–∞–ª–µ–Ω–¥–∞—Ä, —á–µ—Ç–≤–µ—Ä–≥, –º–∞—Ä—Ç, –º–∞—Ä—Ç, —Å—Ç..."
364810,24762087,ru,"–ü—Ä–∞–≤–æ—Å–ª–∞–≤–Ω—ã–π ‚Ä† –∫–∞–ª–µ–Ω–¥–∞—Ä—å. –ü—è—Ç–Ω–∏—Ü–∞, 23 –º–∞—Ä—Ç–∞, ...","[–ø—Ä–∞–≤–æ—Å–ª–∞–≤–Ω, –∫–∞–ª–µ–Ω–¥–∞—Ä, –ø—è—Ç–Ω–∏—Ü, –º–∞—Ä—Ç, –º–∞—Ä—Ç, —Å—Ç,..."
507452,24762087,ru,"–ü—Ä–∞–≤–æ—Å–ª–∞–≤–Ω—ã–π ‚Ä† –∫–∞–ª–µ–Ω–¥–∞—Ä—å. –í–æ—Å–∫—Ä–µ—Å–µ–Ω—å–µ, 25 –º–∞—Ä...","[–ø—Ä–∞–≤–æ—Å–ª–∞–≤–Ω, –∫–∞–ª–µ–Ω–¥–∞—Ä, –≤–æ—Å–∫—Ä–µ—Å–µ–Ω, –º–∞—Ä—Ç, –º–∞—Ä—Ç, ..."
533885,24762087,ru,"–ü—Ä–∞–≤–æ—Å–ª–∞–≤–Ω—ã–π ‚Ä† –∫–∞–ª–µ–Ω–¥–∞—Ä—å. –°—Ä–µ–¥–∞, 28 –º–∞—Ä—Ç–∞, 201...","[–ø—Ä–∞–≤–æ—Å–ª–∞–≤–Ω, –∫–∞–ª–µ–Ω–¥–∞—Ä, —Å—Ä–µ–¥, –º–∞—Ä—Ç, –º–∞—Ä—Ç, —Å—Ç, —Å..."
1082010,24762087,ru,"–ü—Ä–∞–≤–æ—Å–ª–∞–≤–Ω—ã–π ‚Ä† –∫–∞–ª–µ–Ω–¥–∞—Ä—å. –°—É–±–±–æ—Ç–∞, 17 –º–∞—Ä—Ç–∞, 2...","[–ø—Ä–∞–≤–æ—Å–ª–∞–≤–Ω, –∫–∞–ª–µ–Ω–¥–∞—Ä, —Å—É–±–±–æ—Ç, –º–∞—Ä—Ç, –≥, –±–ª–≥–≤, ..."
1140628,24762087,ru,"–ü—Ä–∞–≤–æ—Å–ª–∞–≤–Ω—ã–π ‚Ä†¬†–∫–∞–ª–µ–Ω–¥–∞—Ä—å.¬†–í–æ—Å–∫—Ä–µ—Å–µ–Ω—å–µ, 18 –º–∞—Ä—Ç...","[–ø—Ä–∞–≤–æ—Å–ª–∞–≤–Ω, –∫–∞–ª–µ–Ω–¥–∞—Ä, –≤–æ—Å–∫—Ä–µ—Å–µ–Ω, –º–∞—Ä—Ç, –º–∞—Ä—Ç, ..."
1813732,24762087,ru,"–ü—Ä–∞–≤–æ—Å–ª–∞–≤–Ω—ã–π ‚Ä† –∫–∞–ª–µ–Ω–¥–∞—Ä—å. –°—Ä–µ–¥–∞, 21 –º–∞—Ä—Ç–∞, 201...","[–ø—Ä–∞–≤–æ—Å–ª–∞–≤–Ω, –∫–∞–ª–µ–Ω–¥–∞—Ä, —Å—Ä–µ–¥, –º–∞—Ä—Ç, –≥, –∏–∫–æ–Ω, –±–æ..."
2152473,24762087,ru,–°–†–û–ß–ù–´–ô –°–ë–û–† –°–†–ï–î–°–¢–í –ù–ê –í–û–°–°–¢–ê–ù–û–í–õ–ï–ù–ò–ï –°–ü–ê–°–û -...,"[—Å—Ä–æ—á–Ω, —Å–±–æ—Ä, —Å—Ä–µ–¥—Å—Ç–≤, –≤–æ—Å—Å—Ç–∞–Ω–æ–≤–ª–µ–Ω, —Å–ø–∞—Å, –ø—Ä–µ..."
2445603,24762087,ru,"–ü—Ä–∞–≤–æ—Å–ª–∞–≤–Ω—ã–π ‚Ä† –∫–∞–ª–µ–Ω–¥–∞—Ä—å. –°—É–±–±–æ—Ç–∞, 24 –º–∞—Ä—Ç–∞, ...","[–ø—Ä–∞–≤–æ—Å–ª–∞–≤–Ω, –∫–∞–ª–µ–Ω–¥–∞—Ä, —Å—É–±–±–æ—Ç, –º–∞—Ä—Ç, –≥, –ø–æ—Ö–≤–∞,..."
