In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [31]:
data_train = pd.read_csv('kagle_train.csv', sep=',', parse_dates=['start_time'])
data_test = pd.read_csv('regr_test.csv', sep=',', parse_dates=['start_time'])

In [32]:
data_train.describe()

Unnamed: 0,id,price,item_id,item_views
count,376687.0,376687.0,376687.0,376687.0
mean,211784.700144,246549.8,4.606439e+18,3.483459
std,122355.459807,11261630.0,2.663233e+18,1.019757
min,0.0,0.0,66945950000000.0,0.0
25%,105786.5,500.0,2.296781e+18,2.70805
50%,211758.0,2000.0,4.606254e+18,3.367296
75%,317745.5,9990.0,6.912617e+18,4.094345
max,423771.0,5677568000.0,9.223325e+18,8.328693


In [33]:
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 376687 entries, 0 to 376686
Data columns (total 13 columns):
id             376687 non-null int64
start_time     376687 non-null datetime64[ns]
title          376687 non-null object
price          376687 non-null int64
item_id        376687 non-null int64
owner_type     376687 non-null object
category       376687 non-null object
subcategory    376687 non-null object
param1         361456 non-null object
param2         196112 non-null object
param3         147746 non-null object
region         376687 non-null object
item_views     376687 non-null float64
dtypes: datetime64[ns](1), float64(1), int64(3), object(8)
memory usage: 37.4+ MB


In [34]:
data_train = data_train.fillna('nan')
data_test = data_test.fillna('nan')

In [35]:
np.any(data_train.isna())

False

In [36]:
f = lambda x: (x.hour * 60 + x.minute) / (24 * 60)
data_train['relative_time'] = data_train.start_time.apply(f)
data_test['relative_time'] = data_test.start_time.apply(f)

In [38]:
data_train['time_to_end'] = data_train.start_time.apply(lambda x: 24 - x.hour)
data_test['time_to_end'] = data_test.start_time.apply(lambda x: 24 - x.hour)

In [39]:
data_train['slided_hours'] = data_train.start_time.apply(lambda x: ((x.hour - 3.5) % 24) * 1. / 24.)
data_test['slided_hours'] = data_test.start_time.apply(lambda x: ((x.hour - 3.5) % 24) * 1. / 24.)

In [40]:
def make_harmonic_features(value, period=24):
    value = (value / period) * 2 * np.pi
    return np.cos(value), np.sin(value)

In [41]:
data_train['cos_time'] = data_train.start_time.apply(lambda x: make_harmonic_features(x.hour + x.minute / 60)[0])
data_train['sin_time'] = data_train.start_time.apply(lambda x: make_harmonic_features(x.hour + x.minute / 60)[0])

data_test['cos_time'] = data_test.start_time.apply(lambda x: make_harmonic_features(x.hour + x.minute / 60)[0])
data_test['sin_time'] = data_test.start_time.apply(lambda x: make_harmonic_features(x.hour + x.minute / 60)[0])

In [42]:
data_train['week_day'] = data_train.start_time.apply(lambda x: x.isoweekday())
data_test['week_day'] = data_test.start_time.apply(lambda x: x.isoweekday())

In [44]:
data_train = data_train.drop(['start_time'], axis=1)
data_test = data_test.drop(['start_time'], axis=1)

##### текст

In [45]:
# !pip install pymorphy2

import re
import pymorphy2
from functools import lru_cache

In [46]:
morph = pymorphy2.MorphAnalyzer()

In [47]:
morph.parse('сладенькая')

[Parse(word='сладенькая', tag=OpencorporaTag('ADJF femn,sing,nomn'), normal_form='сладенький', score=1.0, methods_stack=((<DictionaryAnalyzer>, 'сладенькая', 16, 7),))]

In [48]:
morph.get_lexeme(morph.parse('сладенькая')[0])

[Parse(word='сладенький', tag=OpencorporaTag('ADJF masc,sing,nomn'), normal_form='сладенький', score=1.0, methods_stack=((<DictionaryAnalyzer>, 'сладенький', 16, 0),)),
 Parse(word='сладенького', tag=OpencorporaTag('ADJF masc,sing,gent'), normal_form='сладенький', score=1.0, methods_stack=((<DictionaryAnalyzer>, 'сладенького', 16, 1),)),
 Parse(word='сладенькому', tag=OpencorporaTag('ADJF masc,sing,datv'), normal_form='сладенький', score=1.0, methods_stack=((<DictionaryAnalyzer>, 'сладенькому', 16, 2),)),
 Parse(word='сладенького', tag=OpencorporaTag('ADJF anim,masc,sing,accs'), normal_form='сладенький', score=1.0, methods_stack=((<DictionaryAnalyzer>, 'сладенького', 16, 3),)),
 Parse(word='сладенький', tag=OpencorporaTag('ADJF inan,masc,sing,accs'), normal_form='сладенький', score=1.0, methods_stack=((<DictionaryAnalyzer>, 'сладенький', 16, 4),)),
 Parse(word='сладеньким', tag=OpencorporaTag('ADJF masc,sing,ablt'), normal_form='сладенький', score=1.0, methods_stack=((<DictionaryAnalyz

In [49]:
morph.normal_forms('чехлы'), morph.normal_forms('сладкая')

(['чехол'], ['сладкий'])

In [50]:
@lru_cache(maxsize=100000) # с кешом!
def get_normal_form (i):
    return morph.normal_forms(i)[0]

def normalize_text(x):
    return ' '.join([get_normal_form(i) for i in re.findall('\w+', x)])

In [51]:
normalize_text('Детская одежда и обувь')

'детский одежда и обувь'

In [52]:
from multiprocessing import Pool

In [53]:
%%time
with Pool(processes=4) as pool:
    data_train['title'] = pool.map(normalize_text, data_train.title)
    data_test['title'] = pool.map(normalize_text, data_test.title)
    pool.terminate()

CPU times: user 786 ms, sys: 346 ms, total: 1.13 s
Wall time: 36.3 s


##### Куча категориальных признаков

In [54]:
# конкатенация строк
data_train['all_params'] = data_train.param1 + ' ' + data_train.param2 + ' ' + data_train.param3
data_train['params_12'] = data_train.param1 + ' ' + data_train.param2
data_train['params_23'] = data_train.param2 + ' ' + data_train.param3
data_train['category_and_owner_type'] = data_train.owner_type + ' ' + data_train.category
data_train['category_and_region'] = data_train.region + ' ' + data_train.category
data_train['owner_type_and_region'] = data_train.owner_type + ' ' + data_train.category

data_test['all_params'] = data_test.param1 + ' ' + data_test.param2 + ' ' + data_test.param3
data_test['params_12'] = data_test.param1 + ' ' + data_test.param2
data_test['params_23'] = data_test.param2 + ' ' + data_test.param3
data_test['category_and_owner_type'] = data_test.owner_type + ' ' + data_test.category
data_test['category_and_region'] = data_test.region + ' ' + data_test.category
data_test['owner_type_and_region'] = data_test.owner_type + ' ' + data_test.category

str_cols = ['owner_type', 'title',
            'param1', 'param2', 'param3',
            'region', 'category' , 'subcategory', 
            'params_12', 'params_23', 'all_params',
            'category_and_owner_type',
            'category_and_region', 'owner_type_and_region'
           ]

In [55]:
pd.options.display.max_columns = 50

In [57]:
data_train.columns

Index(['id', 'title', 'price', 'item_id', 'owner_type', 'category',
       'subcategory', 'param1', 'param2', 'param3', 'region', 'item_views',
       'relative_time', 'time_to_end', 'slided_hours', 'cos_time', 'sin_time',
       'week_day', 'all_params', 'params_12', 'params_23',
       'category_and_owner_type', 'category_and_region',
       'owner_type_and_region'],
      dtype='object')

In [58]:
X_train, y_train = data_train.drop(['item_views', 'id', 'item_id'], axis=1), data_train['item_views'].values
X_test = data_test.drop(['id', 'item_id'], axis=1)

In [59]:
def _counter_encoder(dt, str_cols, counter_Encoders, is_train=True):
    """
    Counter Encoder.
    """
    if is_train:
        counter_Encoders = {col:dt[col].value_counts().to_dict() for col in str_cols}
        
    for column in str_cols:
        dt[column+'_enc_by_count'] = dt[column].apply(lambda x: counter_Encoders[column].get(x, 0))
        
    return counter_Encoders

In [60]:
counter_Encoders = _counter_encoder(X_train, str_cols=str_cols, counter_Encoders=None, is_train=True)

In [61]:
_ = _counter_encoder(X_test, str_cols=str_cols, counter_Encoders=counter_Encoders, is_train=False)

In [62]:
# выкинем не интересное

X_train = X_train.drop(str_cols, axis=1)
X_test = X_test.drop(str_cols, axis=1)

In [64]:
from sklearn.tree import DecisionTreeRegressor

In [65]:
regr = DecisionTreeRegressor()

In [66]:
regr.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [67]:
y_pred = regr.predict(X_test.values)

> 0.84451

In [71]:
pd.DataFrame({'id': data_test.id, 'item_views':y_pred}).to_csv('my_sample_pred.csv', index=None)