In [1]:
%load_ext autoreload
%autoreload 1
%aimport my

import sys
import numpy as np
import pandas as pd
import os
import gc

import my
from my import p
import text_utils

pd.set_option("max_colwidth", 25)
pd.set_option("display.precision", 1)
pd.options.display.float_format = "{:.3f}".format

from IPython.display import HTML, display
     
dir_out = "out/"
dir_data = 'data/'
os.makedirs(dir_out, exist_ok = True)
RANDOM_STATE = 34

np.random.seed(RANDOM_STATE)
N_CPU = os.cpu_count()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Загрузка файлов с данными

In [2]:
train = pd.read_csv(dir_data+'train_dataset_train.csv',parse_dates=['publish_date'])
test = pd.read_csv(dir_data+'test_dataset_test.csv',parse_dates=['publish_date'])
train[:1]

Unnamed: 0,document_id,title,publish_date,session,authors,ctr,category,tags,views,depth,full_reads_percent
0,624ac09c9a7947db3d80c...,Европейский банк разв...,2022-04-04 10:29:44,IDE7mtH4RBqGn-8MXfGffQ,[],1.58,5409f11ce063da9c8b588a18,['55928d339a794751dc8...,20460,1.134,35.85


In [3]:
import ast

def str_to_list(s):
    return ast.literal_eval(s)


def encode_list_col(li, encoder, prefix=''):
    if not li:
        return f'{prefix}100000'
    return prefix + f' {prefix}'.join((np.sort(encoder.transform(li)) + 1).astype(str))

### Label Encoders

Fit'им энкодеры, чтобы из таких записей '55928d339a794751dc8' можно было сделать более удобные: например для тагов 't1', для авторов 'a1', для категорий просто цифры 1...9

In [4]:
df = pd.concat([train,test]).reset_index(drop=True)
p(df.shape)
df[:1]

(10000, 11) 
 ~


Unnamed: 0,document_id,title,publish_date,session,authors,ctr,category,tags,views,depth,full_reads_percent
0,624ac09c9a7947db3d80c...,Европейский банк разв...,2022-04-04 10:29:44,IDE7mtH4RBqGn-8MXfGffQ,[],1.58,5409f11ce063da9c8b588a18,['55928d339a794751dc8...,20460.0,1.134,35.85


In [5]:
from sklearn.preprocessing import LabelEncoder
encoders = {}
encoders['category_enc'] = LabelEncoder().fit(df['category'])
encoders['session_enc'] = LabelEncoder().fit(df['session'])
my.save_pickle(dir_data+'encs.pcl', encoders)

In [6]:
df.dtypes

title                   object
publish_date    datetime64[ns]
session                 object
authors                 object
ctr                    float64
category                object
tags                    object
dtype: object

In [7]:
df['authors'] = df['authors'].apply(str_to_list)
df['tags'] = df['tags'].apply(str_to_list)

In [8]:
all_authors = list(set(df['authors'].sum()))
all_tags = list(set(df['tags'].sum()))
len(all_authors), len(all_tags)

(102, 6520)

In [9]:
encoders['authors_enc'] = LabelEncoder().fit(all_authors)
encoders['tags_enc'] = LabelEncoder().fit(all_tags)
my.save_pickle(dir_data+'encs.pcl', encoders)

In [10]:
df['authors'] = df['authors'].apply(encode_list_col,args=(encoders['authors_enc'],'a'))
df['tags'] = df['tags'].apply(encode_list_col,args=(encoders['tags_enc'],'t'))
df[:3]

Unnamed: 0,title,publish_date,session,authors,ctr,category,tags
0,Европейский банк разв...,2022-04-04 10:29:44,IDE7mtH4RBqGn-8MXfGffQ,a100000,1.58,5409f11ce063da9c8b588a18,t3 t762 t3501 t3657
1,Кремль назвал регуляр...,2022-02-18 10:00:39,KtVJsteHStO5oditt3Uvzw,a6 a23,1.853,5409f11ce063da9c8b588a12,t92 t2214 t5494
2,Госсекретарь Швеции з...,2022-02-12 04:24:02,hk7puWJwSziw0m3sfTkKWA,a100000,0.0,5409f11ce063da9c8b588a12,t36 t91 t823


In [11]:
encoders['list_authors_enc'] = LabelEncoder().fit(df['authors'])
encoders['list_tags_enc'] = LabelEncoder().fit(df['tags'])
my.save_pickle(dir_data+'encs.pcl', encoders)

In [12]:
df['authors'].nunique(),df['tags'].nunique()

(633, 9468)

### Точные предсказания
Путем визуального осмотра установлено, что первые 24 символа document_id определяют часть URL страницы новости. И так как в тесте есть такие же 24 символа document_id, как и в трейне, то для нескольких новостей из теста есть дубликаты в трейне, из которых можно взять точные предсказания. Есть образцы с одинаковым путем, но разным ctr, и по ctr можно соотнести их между собой точно. Такое предположение также подтверждается полным соответствием категории, тагов, авторов, и таргетов, почти всегда одинаковый стр и похожее название, (отличаются датой публикации), если мы посмотрим только на дубликаты среди трейна.
Сделаем таблицу для новостей из теста, имеющих дубликаты в трейне:

In [6]:
df['path'] = df['document_id'].str[:24]

# всего 136 дубликатов на трейн и тест
dupls = df[df.duplicated(subset=['path'],keep=False)].sort_values('ctr')
p(dupls.shape)
dupls[:5]

(136, 12) 
 ~


Unnamed: 0,document_id,title,publish_date,session,authors,ctr,category,tags,views,depth,full_reads_percent,path
5693,621a39ba9a79472784f02...,Какие места на Украин...,2022-05-26 10:12:01,uzbF3bemQHyNHWFEVMC8bQ,[],0.0,5409f11ce063da9c8b588a12,['5409f191e063daa0f40...,70873.0,1.113,15.581,621a39ba9a79472784f029d4
3213,621a39ba9a79472784f02...,Какие места на Украин...,2022-05-19 09:57:05,1UsUPXuzR0CKpx63H0qNOQ,[],0.0,5409f11ce063da9c8b588a12,['5409f191e063daa0f40...,70873.0,1.113,15.581,621a39ba9a79472784f029d4
8380,621a39ba9a79472784f02...,Какие места на Украин...,2022-04-08 11:28:18,_hvIU0ZcStKn-km2-kPOmw,[],0.0,5409f11ce063da9c8b588a12,['5409f191e063daa0f40...,,,,621a39ba9a79472784f029d4
6247,621a39ba9a79472784f02...,Какие места на Украин...,2022-05-23 09:44:02,ytL7Y-dhSYm55oHJbra_dg,[],0.0,5409f11ce063da9c8b588a12,['5409f191e063daa0f40...,70873.0,1.113,15.581,621a39ba9a79472784f029d4
5908,621a39ba9a79472784f02...,Какие места на Украин...,2022-05-24 10:41:48,347x3OJRQy-WukszVEW-GA,[],0.0,5409f11ce063da9c8b588a12,['5409f191e063daa0f40...,70873.0,1.113,15.581,621a39ba9a79472784f029d4


In [19]:
# дубликаты в трейне
dupls_from_train = dupls[~dupls['views'].isna()].drop_duplicates(subset=['path','ctr']).rename(columns={'document_id':'document_id_train','title':'title_train'}).set_index('path')
p(dupls_from_train.shape)
dupls_from_train[:2]

(41, 11) 
 ~


Unnamed: 0_level_0,document_id_train,title_train,publish_date,session,authors,ctr,category,tags,views,depth,full_reads_percent
path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
621a39ba9a79472784f029d4,621a39ba9a79472784f02...,Какие места на Украин...,2022-05-26 10:12:01,uzbF3bemQHyNHWFEVMC8bQ,[],0.0,5409f11ce063da9c8b588a12,['5409f191e063daa0f40...,70873.0,1.113,15.581
609d11a69a79477cef513697,609d11a69a79477cef513...,Как Запад и Россия по...,2022-04-08 15:53:09,err0XKLMQFGLne6KPYAZEQ,['542f8085cbb20f3a1e3...,0.523,5409f11ce063da9c8b588a12,['596cb0199a794738666...,7601.0,1.081,20.563


In [21]:
dupls_from_train.ctr.nunique()

41

In [23]:
# дубликаты в тесте
dupls_from_test = dupls.loc[dupls['views'].isna(),['document_id','path','ctr']].rename(columns={'document_id':'document_id_test'})
p(dupls_from_test.shape)
dupls_from_test[:2]

(37, 3) 
 ~


Unnamed: 0,document_id_test,path,ctr
8380,621a39ba9a79472784f02...,621a39ba9a79472784f029d4,0.0
7539,621a39ba9a79472784f02...,621a39ba9a79472784f029d4,0.0


In [25]:
# образцам из теста находим дубликат в трейне
accurate_preds = dupls_from_test.merge(dupls_from_train,left_on='ctr',right_on='ctr')
p(accurate_preds.shape)
accurate_preds[:5]

(32, 13) 
 ~


Unnamed: 0,document_id_test,path,ctr,document_id_train,title_train,publish_date,session,authors,category,tags,views,depth,full_reads_percent
0,621a39ba9a79472784f02...,621a39ba9a79472784f029d4,0.0,621a39ba9a79472784f02...,Какие места на Украин...,2022-05-26 10:12:01,uzbF3bemQHyNHWFEVMC8bQ,[],5409f11ce063da9c8b588a12,['5409f191e063daa0f40...,70873.0,1.113,15.581
1,621a39ba9a79472784f02...,621a39ba9a79472784f029d4,0.0,621a39ba9a79472784f02...,Какие места на Украин...,2022-05-26 10:12:01,uzbF3bemQHyNHWFEVMC8bQ,[],5409f11ce063da9c8b588a12,['5409f191e063daa0f40...,70873.0,1.113,15.581
2,609d11a69a79477cef513...,609d11a69a79477cef513697,0.523,609d11a69a79477cef513...,Как Запад и Россия по...,2022-04-08 15:53:09,err0XKLMQFGLne6KPYAZEQ,['542f8085cbb20f3a1e3...,5409f11ce063da9c8b588a12,['596cb0199a794738666...,7601.0,1.081,20.563
3,609d11a69a79477cef513...,609d11a69a79477cef513697,0.599,609d11a69a79477cef513...,Как Запад и Россия по...,2022-04-25 16:04:46,FjtKTKH7S9Oxncnerr-7xA,['542f8085cbb20f3a1e3...,5409f11ce063da9c8b588a12,['596cb0199a794738666...,14217.0,1.06,21.953
4,6082cca79a79471b74bf1...,6082cca79a79471b74bf1eda,0.709,6082cca79a79471b74bf1...,Как менялась ключевая...,2022-04-08 10:31:13,YlQoBTIqQ7qIP9WhN40PPw,['5b8403979a794771079...,5409f11ce063da9c8b588a18,['5409f174e063daa0f40...,22374.0,1.186,44.61


In [26]:
cols = ['document_id_test', 'views', 'depth', 'full_reads_percent']
accurate_preds = accurate_preds[cols].sort_values('views')
accurate_preds

Unnamed: 0,document_id_test,views,depth,full_reads_percent
2,609d11a69a79477cef513...,7601.0,1.081,20.563
24,627518769a794781b88bc...,12258.0,1.061,19.261
12,628dc8fa9a79471596fd5...,12586.0,1.07,47.958
3,609d11a69a79477cef513...,14217.0,1.06,21.953
21,625568df9a794741e114a...,14931.0,1.067,30.219
20,625568df9a794741e114a...,14931.0,1.067,30.219
9,625568df9a794741e114a...,17893.0,1.075,34.628
8,61f401bf9a79478e78ad4...,20337.0,1.17,19.039
4,6082cca79a79471b74bf1...,22374.0,1.186,44.61
11,5f92bea39a79475229897...,28497.0,1.164,29.445


In [27]:
accurate_preds.to_csv(dir_data+'accurate_preds.csv',index=False)

In [None]:
# Определим функцию, которая будет в готовом submission корректировать предсказания модели, имеющимися точными предсказаниями
def correct_preds(sub):
    # если модель предсказала отрицательные или очень маленькие значения - это наверняка ошибка - исправляем
    sub.loc[sub['views'] < 500, 'views'] = 500

    all_dupls = pd.read_csv(dir_data+'accurate_preds.csv')

    for i in all_dupls['document_id_test']:
        sub.loc[sub['document_id'] == i, ['views', 'depth', 'full_reads_percent']
                ] = all_dupls.loc[all_dupls['document_id_test'] == i, ['views', 'depth', 'full_reads_percent']].to_numpy().flatten()

    return sub

### Обработка данных:
- label encodings
- извлечение признаков
- удаление выбросов
- очистка и обработка текста


In [4]:
train = pd.read_csv(dir_data+'train_dataset_train.csv',parse_dates=['publish_date'])
test = pd.read_csv(dir_data+'test_dataset_test.csv',parse_dates=['publish_date'])

encoders = my.load_pickle(dir_data+"encs.pcl")
encoders

{'session_enc': LabelEncoder(),
 'authors_enc': LabelEncoder(),
 'tags_enc': LabelEncoder(),
 'list_authors_enc': LabelEncoder(),
 'list_tags_enc': LabelEncoder(),
 'category_enc': LabelEncoder()}

In [5]:
def number_words(text):
    n = 0
    for word in text.split():
        if len(word)>1:
            n+=1
    return n

def have_upper_words(text):
    for word in text.split():
        if len(word)>1 and word.isupper():
            return 1
    return 0

def number_big_words(text):
    n = 0
    for word in text[1:].split():
        if len(word)>1 and word[0].isupper():
            n+=1
    return n

In [6]:
def prepro(df):
    df.drop(columns='session',inplace=True)
    
    df['title'] = df['title'].str.split('\n', n=1).str[0]
    df['path'] = df['document_id'].str[:24]

    df['authors'] = df['authors'].apply(str_to_list)
    df['tags'] = df['tags'].apply(str_to_list)

    # столбцы с ff_ - это фичи
    df['ff_num_authors'] = df['authors'].apply(len)
    df['ff_num_tags'] = df['tags'].apply(len)

    encoders = my.load_pickle(dir_data+"encs.pcl")

    # кодируем в строковые последовательности
    df['authors'] = df['authors'].apply(
        encode_list_col, args=(encoders['authors_enc'],'a'))
    df['tags'] = df['tags'].apply(
        encode_list_col, args=(encoders['tags_enc'],'t'))
    

    map_encode = [('category', 'category_enc')]
    my.encode_cols(df, map_encode)

    df['ff_enc_authors'] = encoders["list_authors_enc"].transform(
        df['authors']).astype(np.int32) + 1

    df['ff_month'] = df['publish_date'].dt.month
    df['ff_hour'] = df['publish_date'].dt.hour
    df['ff_day'] = df['publish_date'].dt.day
    df['ff_day_week'] = df['publish_date'].dt.dayofweek
    df['ff_day_year'] = df['publish_date'].dt.dayofyear

    if 'views' in df.columns:
        # if train
        df = df.sort_values('publish_date').copy()

        # удаляем выбросы: статьи со слишком большими просмотрами, имеющими много дубликатов, с некорректным временем
        rules = (~df.title.str.contains('Какие места на Украине взяли')) & (df['publish_date'].dt.year == 2022) & (
            df['publish_date'].dt.dayofyear < 160) & (~df.views.isin([2554204, 518294, 2398050]))

        df = df[rules]
        df.drop_duplicates(subset=['title', 'views'], inplace=True)
        df = df.copy()

        # редкие категории добавляем к близким по смыслу
        df.loc[df.category == 7,'category'] = 1
        df.loc[df.category == 8,'category'] = 3
        df.loc[df.category == 9,'category'] = 6

        # срезаем выбросы в depth и full_reads_percent
        df.loc[df.depth >= 1.5,'depth'] = 1.5
        df.loc[df.full_reads_percent >= 67.,'full_reads_percent'] = 67.

        # делаем столбцы для фолдов на основе категорий
        df['folds_cat'] = df.category
        # df['folds_day'] = df['publish_date'].dt.dayofyear // 20
        df['folds_cat_and_day'] = df.category + (df['publish_date'].dt.dayofyear // 20) * 10
  

    df['ff_have_money'] = df.title.str.contains('[€£₣$₽]').astype(np.int8)
    df['ff_have_number'] = df.title.str.contains('\d').astype(np.int8)
    df['ff_have_quotes'] = df.title.str.contains('«').astype(np.int8)
    df['ff_have_pct'] = df.title.str.contains('%').astype(np.int8)

    df['ff_num_symbols'] = df.title.str.len()
    df['ff_num_words'] = df.title.apply(number_words)
    df['ff_have_upper_words'] = df.title.apply(have_upper_words)
    df['ff_number_big_words'] = df.title.apply(number_big_words)

    df['clean_lem_title'] = text_utils.clean_text(df["title"],do_lemma=True)
    df['clean_no_lem_title'] = text_utils.clean_text(df["title"])

    return df

In [7]:
train = prepro(train)
test = prepro(test)
train[:2]

Unnamed: 0,document_id,title,publish_date,authors,ctr,category,tags,views,depth,full_reads_percent,...,ff_have_money,ff_have_number,ff_have_quotes,ff_have_pct,ff_num_symbols,ff_num_words,ff_have_upper_words,ff_number_big_words,clean_lem_title,clean_no_lem_title
4092,61f401bf9a79478e78ad4...,Какие ограничения вво...,2022-01-29 06:00:22,a1 a44,1.284,6,t27 t483 t1575 t6297 ...,20337,1.17,19.039,...,0,0,1,0,67,8,0,0,ограничение вводить р...,какие ограничения вво...
5386,5f8017369a794784407eb...,Как в России развивае...,2022-01-29 09:00:53,a44 a59 a60,2.33,6,t27 t3494 t4860 t6297...,129984,1.292,40.749,...,0,0,0,0,52,6,1,3,россия развиваться эп...,россии развивается эп...


In [8]:
train.to_parquet(dir_data+'train_v1.p',index=False)
test.to_parquet(dir_data+'test_v1.p',index=False)

нахождение порядкового номера новости по отношению к другим новостям в течение дня

In [None]:
# train = pd.read_parquet(dir_data+'train_v1.p')
# test = pd.read_parquet(dir_data+'test_v1.p')

In [9]:
def get_all_news():
    cols = ['document_id','publish_date','category']
    train = pd.read_csv(dir_data+'train_dataset_train.csv',parse_dates=['publish_date'],usecols=cols)
    test = pd.read_csv(dir_data+'test_dataset_test.csv',parse_dates=['publish_date'],usecols=cols)
    return pd.concat([train,test]).sort_values('publish_date').reset_index(drop=True)
all_news =get_all_news()
p(all_news.shape)
all_news[:3]

(10000, 3) 
 ~


Unnamed: 0,document_id,publish_date,category
0,55c3984d9a79470f2c260...,2017-08-07 13:00:27,5433e5decbb20f277b20eca9
1,5a8eb3149a7947e3e34bb...,2018-02-22 14:28:30,5433e5decbb20f277b20eca9
2,605321259a7947663a3b6...,2021-04-12 11:01:26,5433e5decbb20f277b20eca9


In [10]:
all_news['ff_pos_in_day'] = all_news.groupby(all_news['publish_date'].dt.dayofyear)['publish_date'].rank().astype(np.int16)
all_news['ff_pos_in_cat_day'] = all_news.groupby([all_news['publish_date'].dt.dayofyear,'category'])['publish_date'].rank().astype(np.int16)
all_news = all_news.drop(columns=['publish_date','category']).set_index('document_id')
all_news[:3]

Unnamed: 0_level_0,ff_pos_in_day,ff_pos_in_cat_day
document_id,Unnamed: 1_level_1,Unnamed: 2_level_1
55c3984d9a79470f2c260465BKmAn_fHQruT38beqDIE_A,1,1
5a8eb3149a7947e3e34bba2dbgzlzJ2GTQacZgtv4WEb1A,1,1
605321259a7947663a3b6ab2vgBnAMPBRIOYrUrK0JeEiQ,1,1


Нахождение количества новостей, вышедших через 30, 60, 120 ... минут после выхода текущей новости, так как из-за них новость опускается в списках на сайте, и получает меньше показов:

In [11]:
df_counts = get_all_news()
# переводим дату в секунды
df_counts['publish_date'] = df_counts['publish_date'].astype(int) / 10**9

counts_news = []
timings = np.array([30,60,120,300,720,1440]) * 60 # in seconds

for t in df_counts['publish_date']:

    count_one_news = []
    
    for lim in timings:
        ct = ((df_counts['publish_date'] > t) & (df_counts['publish_date'] < (t + lim))).sum()
        count_one_news.append(ct)

    counts_news.append(count_one_news)

counts_news = pd.DataFrame(counts_news,columns=['ff_' + f'count_after_{(lim/60)}' for lim in timings])
p(counts_news.shape)
counts_news[:10]

(10000, 6) 
 ~


Unnamed: 0,ff_count_after_30.0,ff_count_after_60.0,ff_count_after_120.0,ff_count_after_300.0,ff_count_after_720.0,ff_count_after_1440.0
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0
5,0,0,0,0,0,0
6,0,0,0,0,0,0
7,0,0,1,1,1,1
8,0,0,0,0,0,0
9,0,0,0,1,1,1


In [15]:
counts_news.describe()

Unnamed: 0,ff_count_after_30.0,ff_count_after_60.0,ff_count_after_120.0,ff_count_after_300.0,ff_count_after_720.0,ff_count_after_1440.0
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,2.546,5.291,10.78,26.599,58.37,114.987
std,1.819,2.959,5.196,11.437,20.384,28.627
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,3.0,7.0,17.0,42.0,98.0
50%,2.0,5.0,10.0,25.0,55.5,119.0
75%,4.0,7.0,14.0,35.0,74.0,133.0
max,11.0,18.0,31.0,63.0,120.0,179.0


так как есть дни по которым дано мало новостей, а на самом деле вряд ли были такие дни, когда на сайте было меньше 60 новостей, то присваиваем новостям в такие дни средние значения по датасету:

In [26]:
mean_counts = counts_news.loc[counts_news['ff_count_after_1440.0']>60].mean().to_list()
mean_counts

[2.573614937251301,
 5.344250586674829,
 10.892357922660953,
 26.879093970003062,
 59.04132231404959,
 116.60299969390879]

In [28]:
counts_news.loc[counts_news['ff_count_after_1440.0']<=60] = mean_counts
counts_news[:3]

Unnamed: 0,ff_count_after_30.0,ff_count_after_60.0,ff_count_after_120.0,ff_count_after_300.0,ff_count_after_720.0,ff_count_after_1440.0
0,2.574,5.344,10.892,26.879,59.041,116.603
1,2.574,5.344,10.892,26.879,59.041,116.603
2,2.574,5.344,10.892,26.879,59.041,116.603


In [31]:
counts_news.describe()

Unnamed: 0,ff_count_after_30.0,ff_count_after_60.0,ff_count_after_120.0,ff_count_after_300.0,ff_count_after_720.0,ff_count_after_1440.0
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,2.574,5.344,10.892,26.879,59.041,116.603
std,1.803,2.926,5.122,11.235,19.734,26.104
min,0.0,0.0,0.0,5.0,20.0,61.0
25%,1.0,3.0,7.0,17.0,42.0,102.0
50%,2.0,5.0,10.0,26.0,57.0,119.0
75%,4.0,7.0,14.0,35.0,74.0,133.0
max,11.0,18.0,31.0,63.0,120.0,179.0


In [29]:
df_counts.index, counts_news.index

(RangeIndex(start=0, stop=10000, step=1),
 RangeIndex(start=0, stop=10000, step=1))

In [32]:
df_counts[:3]

Unnamed: 0,document_id,publish_date,category
0,55c3984d9a79470f2c260...,1502110827.0,5433e5decbb20f277b20eca9
1,5a8eb3149a7947e3e34bb...,1519309710.0,5433e5decbb20f277b20eca9
2,605321259a7947663a3b6...,1618225286.0,5433e5decbb20f277b20eca9


In [33]:
df_counts = pd.concat([df_counts[['document_id']],counts_news],axis=1).set_index('document_id')
df_counts[:3]

Unnamed: 0_level_0,ff_count_after_30.0,ff_count_after_60.0,ff_count_after_120.0,ff_count_after_300.0,ff_count_after_720.0,ff_count_after_1440.0
document_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
55c3984d9a79470f2c260465BKmAn_fHQruT38beqDIE_A,2.574,5.344,10.892,26.879,59.041,116.603
5a8eb3149a7947e3e34bba2dbgzlzJ2GTQacZgtv4WEb1A,2.574,5.344,10.892,26.879,59.041,116.603
605321259a7947663a3b6ab2vgBnAMPBRIOYrUrK0JeEiQ,2.574,5.344,10.892,26.879,59.041,116.603


In [34]:
print((all_news.index == df_counts.index).mean())
all_news = pd.concat([all_news,df_counts],axis=1)
all_news[:2]

1.0


Unnamed: 0_level_0,ff_pos_in_day,ff_pos_in_cat_day,ff_count_after_30.0,ff_count_after_60.0,ff_count_after_120.0,ff_count_after_300.0,ff_count_after_720.0,ff_count_after_1440.0
document_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
55c3984d9a79470f2c260465BKmAn_fHQruT38beqDIE_A,1,1,2.574,5.344,10.892,26.879,59.041,116.603
5a8eb3149a7947e3e34bba2dbgzlzJ2GTQacZgtv4WEb1A,1,1,2.574,5.344,10.892,26.879,59.041,116.603


In [35]:
all_news.describe()

Unnamed: 0,ff_pos_in_day,ff_pos_in_cat_day,ff_count_after_30.0,ff_count_after_60.0,ff_count_after_120.0,ff_count_after_300.0,ff_count_after_720.0,ff_count_after_1440.0
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,59.468,23.757,2.574,5.344,10.892,26.879,59.041,116.603
std,37.777,20.992,1.803,2.926,5.122,11.235,19.734,26.104
min,1.0,1.0,0.0,0.0,0.0,5.0,20.0,61.0
25%,28.0,6.0,1.0,3.0,7.0,17.0,42.0,102.0
50%,56.0,17.0,2.0,5.0,10.0,26.0,57.0,119.0
75%,88.0,38.0,4.0,7.0,14.0,35.0,74.0,133.0
max,171.0,113.0,11.0,18.0,31.0,63.0,120.0,179.0


In [36]:
train = train.merge(all_news,left_on='document_id',right_index=True).sort_values('publish_date')
p(train.shape)
train[:2]

(6937, 39) 
 ~


Unnamed: 0,document_id,title,publish_date,authors,ctr,category,tags,views,depth,full_reads_percent,...,clean_lem_title,clean_no_lem_title,ff_pos_in_day,ff_pos_in_cat_day,ff_count_after_30.0,ff_count_after_60.0,ff_count_after_120.0,ff_count_after_300.0,ff_count_after_720.0,ff_count_after_1440.0
4092,61f401bf9a79478e78ad4...,Какие ограничения вво...,2022-01-29 06:00:22,a1 a44,1.284,6,t27 t483 t1575 t6297 ...,20337,1.17,19.039,...,ограничение вводить р...,какие ограничения вво...,1,1,2.574,5.344,10.892,26.879,59.041,116.603
5386,5f8017369a794784407eb...,Как в России развивае...,2022-01-29 09:00:53,a44 a59 a60,2.33,6,t27 t3494 t4860 t6297...,129984,1.292,40.749,...,россия развиваться эп...,россии развивается эп...,2,2,2.574,5.344,10.892,26.879,59.041,116.603


In [37]:
temp = all_news.reindex(test['document_id']).reset_index(drop=True)
test = pd.concat([test,temp],axis=1)
p(test.shape)
test[:2]

(3000, 34) 
 ~


Unnamed: 0,document_id,title,publish_date,authors,ctr,category,tags,path,ff_num_authors,ff_num_tags,...,clean_lem_title,clean_no_lem_title,ff_pos_in_day,ff_pos_in_cat_day,ff_count_after_30.0,ff_count_after_60.0,ff_count_after_120.0,ff_count_after_300.0,ff_count_after_720.0,ff_count_after_1440.0
0,61f9569a9a794794245a8...,«Крайне провокационна...,2022-02-01 17:02:44,a100000,0.0,1,t36 t39 t116 t5455 t5524,61f9569a9a794794245a82ab,0,5,...,крайне провокационный...,крайне провокационная...,115,48,2.0,8.0,13.0,26.0,54.0,153.0
1,628c22b89a79470e553f5...,Власти Херсонской обл...,2022-05-24 00:50:55,a100000,1.598,1,t452 t869 t1589 t6449,628c22b89a79470e553f594b,0,4,...,власть херсонский обл...,власти херсонской обл...,2,2,1.0,1.0,5.0,15.0,75.0,143.0


In [38]:
train.to_parquet(dir_data+'train_v2.p',index=False)
test.to_parquet(dir_data+'test_v2.p',index=False)

Добавление данных, полученных парсингом страниц новостей

Дальше файл pages_info.p или берется готовый из папки data репозитория, или предварительно парсится в другом ноутбуке.

In [39]:
train = pd.read_parquet(dir_data+'train_v2.p')
test = pd.read_parquet(dir_data+'test_v2.p')

pages_info = pd.read_parquet(dir_data+'pages_info.p')
pages_info[:3]

Unnamed: 0_level_0,ww_number_links,ww_have_overview,ww_have_ticker,ww_len_overview,ww_num_symbols,ww_num_symbols_until_item,ww_pro_in_article,ww_num_inline_items,ww_have_iframe,ww_number_ps,ww_num_symbols_until_pro,ww_have_video_recommend,ww_have_video_in,ww_num_symbols_until_video_in,ww_have_image,text,ww_feat1,ww_feat2,ww_feat3
path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
623e3fe49a7947150616c930,6,1,0,126,2133,956,1,1,0,10,1534,0,1,873,1,информация издание са...,0.448,0.719,0.409
61fb1d219a79473037413488,3,0,0,0,1303,600,1,1,0,8,1303,0,1,515,1,рефрижераторный судно...,0.46,0.999,0.395
628c71589a794723047f6d37,0,1,0,151,2657,2657,0,0,0,7,2657,0,1,844,1,защита алексей наваль...,1.0,1.0,0.318


In [40]:
train = train.merge(pages_info,left_on='path',right_index=True).sort_values('publish_date')
p(train.shape)
train[:2]

(6937, 58) 
 ~


Unnamed: 0,document_id,title,publish_date,authors,ctr,category,tags,views,depth,full_reads_percent,...,ww_number_ps,ww_num_symbols_until_pro,ww_have_video_recommend,ww_have_video_in,ww_num_symbols_until_video_in,ww_have_image,text,ww_feat1,ww_feat2,ww_feat3
0,61f401bf9a79478e78ad4...,Какие ограничения вво...,2022-01-29 06:00:22,a1 a44,1.284,6,t27 t483 t1575 t6297 ...,20337,1.17,19.039,...,36,2127,0,1,1349,1,заболеваемость covid ...,0.408,0.298,0.189
1,5f8017369a794784407eb...,Как в России развивае...,2022-01-29 09:00:53,a44 a59 a60,2.33,6,t27 t3494 t4860 t6297...,129984,1.292,40.749,...,2,452,0,0,10000,0,число выявить сутки с...,0.998,0.998,22.075


In [41]:
temp = pages_info.reindex(test['path']).reset_index(drop=True)
test = pd.concat([test,temp],axis=1)
p(test.shape)
test[:2]

(3000, 53) 
 ~


Unnamed: 0,document_id,title,publish_date,authors,ctr,category,tags,path,ff_num_authors,ff_num_tags,...,ww_number_ps,ww_num_symbols_until_pro,ww_have_video_recommend,ww_have_video_in,ww_num_symbols_until_video_in,ww_have_image,text,ww_feat1,ww_feat2,ww_feat3
0,61f9569a9a794794245a8...,«Крайне провокационна...,2022-02-01 17:02:44,a100000,0.0,1,t36 t39 t116 t5455 t5524,61f9569a9a794794245a82ab,0,5,...,3,876,0,0,10000,0,постпред россия оон в...,0.999,0.999,11.403
1,628c22b89a79470e553f5...,Власти Херсонской обл...,2022-05-24 00:50:55,a100000,1.598,1,t452 t869 t1589 t6449,628c22b89a79470e553f594b,0,4,...,9,2317,0,1,1380,1,русский стать основны...,0.456,0.765,0.456


In [42]:
train.to_parquet(dir_data+'train.p',index=False)
test.to_parquet(dir_data+'test.p',index=False)