## Загрузим нужные библиотеки

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

### Reproducibility block

In [2]:
# seed the RNG for all devices (both CPU and CUDA)
#torch.manual_seed(1984)

#Disabling the benchmarking feature causes cuDNN to deterministically select an algorithm, 
#possibly at the cost of reduced performance.
#torch.backends.cudnn.benchmark = False

# for custom operators,
import random
random.seed(5986721)

# 
np.random.seed(62225)

#sklearn take seed from a line abowe

Выполним загрузу датсета

In [3]:
DIR_DATA  = os.path.join(os.getcwd(), 'data')
#DIR_TRAIN = os.path.join(DIR_DATA, 'train')
#DIR_TEST  = os.path.join(DIR_DATA, 'test')
DIR_SUBM  = os.path.join(os.getcwd(), 'subm')

In [4]:
df_train = pd.read_csv(os.path.join(DIR_DATA, 'train.csv'), index_col= 0)
df_test = pd.read_csv(os.path.join(DIR_DATA, 'test.csv'), index_col= 0)

Заменим категорию и автора на число

In [5]:
df_train["category"] = df_train["category"].astype('category')
df_train["category"] = df_train["category"].cat.codes
df_train["category"] = df_train["category"].astype('int')

In [6]:
df_train["authors"] = df_train["authors"].astype('category')
df_train["authors"] = df_train["authors"].cat.codes
df_train["authors"] = df_train["authors"].astype('int')

In [7]:
df_train['day'] = pd.to_datetime(df_train['publish_date']).dt.strftime("%d").astype(int)
df_train['mounth'] = pd.to_datetime(df_train['publish_date']).dt.strftime("%m").astype(int)

Всего 9 категорий статей

In [8]:
df_train.category.value_counts()

0    3988
5    1456
3     667
1     338
4     283
2     265
7       1
6       1
8       1
Name: category, dtype: int64

## Выделим выборки

In [9]:
X = df_train.drop(["views","depth","full_reads_percent","title","publish_date", "session", "tags"], axis = 1)
y = df_train[["views","depth","full_reads_percent"]]

In [10]:
X.head()

Unnamed: 0_level_0,authors,ctr,category,day,mounth
document_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
624ac09c9a7947db3d80c98eIDE7mtH4RBqGn-8MXfGffQ,560,1.58,2,4,4
620f6b899a7947701cf489e1KtVJsteHStO5oditt3Uvzw,38,1.853,0,18,2
620730cf9a7947ab96a44e27hk7puWJwSziw0m3sfTkKWA,560,0.0,0,12,2
6262a5889a79470b78c9ca307UKY2SSZTjCcjhwBzxw37w,560,0.0,0,22,4
626678929a79477ca0101568wuMYES90REuV5YhrN75IXg,560,0.0,5,25,4


In [11]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

## Подбор модели

In [12]:
#regr = RandomForestRegressor(random_state=0)
regr = RandomForestRegressor()

Обучим модель

In [13]:
regr.fit(X_train, y_train)

RandomForestRegressor()

Предскажем значения

In [14]:
pred = regr.predict(X_test)

## Оценка точности

In [15]:
score_views = r2_score(y_test["views"], pred[:,0])
score_depth = r2_score(y_test["depth"], pred[:,1])
score_frp = r2_score(y_test["full_reads_percent"], pred[:,2])

In [16]:
score = 0.4 * score_views + 0.3 * score_depth + 0.3 * score_frp

score

0.5858316431480963

In [17]:
0.5548928782936524

0.5548928782936524

Предсказание для теста

In [29]:
df_test["category"] = df_test["category"].astype('category')
df_test["category"] = df_test["category"].cat.codes
df_test["category"] = df_test["category"].astype('int')

In [30]:
df_test["authors"] = df_test["authors"].astype('category')
df_test["authors"] = df_test["authors"].cat.codes
df_test["authors"] = df_test["authors"].astype('int')

In [31]:
df_test['day']    = pd.to_datetime(df_test['publish_date']).dt.strftime("%d").astype(int)
df_test['mounth'] = pd.to_datetime(df_test['publish_date']).dt.strftime("%m").astype(int)

In [36]:
X_pred = df_test.drop(["title","publish_date", "session", "tags"], axis = 1)

In [40]:
X_pred.head(3)

Unnamed: 0_level_0,authors,ctr,category,day,mounth
document_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
61f9569a9a794794245a82abJ0AvX96vTAaQCiWVbzoMdw,325,0.0,0,1,2
628c22b89a79470e553f594bQS5CqzXYRnmDdR2LaSreEw,325,1.598,0,24,5
627cb3249a7947ebdd752865XVsoyrUOT8OJJg2_finJhw,325,2.006,0,12,5


In [41]:
pred = regr.predict(X_pred)

submission

In [45]:
subm = pd.read_csv(os.path.join(DIR_SUBM, 'sample_solution.csv'))
subm.shape

(3000, 4)

In [46]:
subm.head()

Unnamed: 0,document_id,views,depth,full_reads_percent
0,61f9569a9a794794245a82abJ0AvX96vTAaQCiWVbzoMdw,1,1,1
1,628c22b89a79470e553f594bQS5CqzXYRnmDdR2LaSreEw,1,1,1
2,627cb3249a7947ebdd752865XVsoyrUOT8OJJg2_finJhw,1,1,1
3,628618629a7947d4927eb812upfii3whSSuMXCqcqF8VbQ,1,1,1
4,620e76109a7947235623695b5hzCiIHdSYKQIr8WAM18bw,1,1,1


In [53]:
subm.document_id = df_test.index
subm.views = pred[:,0]
subm.depth = pred[:,1]
subm.full_reads_percent = pred[:,2]


#y_test["views"], pred[:,0]
#y_test["depth"], pred[:,1]
#y_test["full_reads_percent"], pred[:,2]

In [54]:
subm.to_csv(os.path.join(DIR_SUBM, '0_baseline.csv'), index = False)