In [1]:
import pandas as pd
import numpy as np
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import string
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from nltk.corpus import stopwords
import scipy
import lightgbm as lgb

In [2]:
gp = pd.read_csv('aggregated_features.csv')

In [3]:
train = pd.read_csv('data/train.csv', parse_dates=['activation_date'])
test = pd.read_csv('data/test.csv', parse_dates=['activation_date'])

train = train.merge(gp, on='user_id', how='left')
test = test.merge(gp, on='user_id', how='left')

agg_cols = list(gp.columns)[1:]

del gp; gc.collect()

train.head()

Unnamed: 0,item_id,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,title,...,activation_date,user_type,image,image_top_1,deal_probability,avg_days_up_user,avg_times_up_user,med_days_up_user,med_times_up_user,n_user_items
0,b912c3c6a6ad,e00f8ff2eaf9,Свердловская область,Екатеринбург,Личные вещи,Товары для детей и игрушки,Постельные принадлежности,,,Кокоби(кокон для сна),...,2017-03-28,Private,d10c7e016e03247a3bf2d13348fe959fe6f436c1caf64c...,1008.0,0.12789,8.0,2.0,8.0,2.0,2.0
1,2dac0150717d,39aeb48f0017,Самарская область,Самара,Для дома и дачи,Мебель и интерьер,Другое,,,Стойка для Одежды,...,2017-03-26,Private,79c9392cc51a9c81c6eb91eceb8e552171db39d7142700...,692.0,0.0,,,,,
2,ba83aefab5dc,91e2f88dd6e3,Ростовская область,Ростов-на-Дону,Бытовая электроника,Аудио и видео,"Видео, DVD и Blu-ray плееры",,,Philips bluray,...,2017-03-20,Private,b7f250ee3f39e1fedd77c141f273703f4a9be59db4b48a...,3032.0,0.43177,4.428571,1.142857,3.0,1.0,9.0
3,02996f1dd2ea,bf5cccea572d,Татарстан,Набережные Челны,Личные вещи,Товары для детей и игрушки,Автомобильные кресла,,,Автокресло,...,2017-03-25,Company,e6ef97e0725637ea84e3d203e82dadb43ed3cc0a1c8413...,796.0,0.80323,16.714286,2.642857,18.0,3.0,32.0
4,7c90be56d2ab,ef50846afc0b,Волгоградская область,Волгоград,Транспорт,Автомобили,С пробегом,ВАЗ (LADA),2110.0,"ВАЗ 2110, 2003",...,2017-03-16,Private,54a687a3a0fc1d68aed99bdaaf551c5c70b761b16fd0a2...,2264.0,0.20797,,,,,


In [4]:
train = train.sort_values(['activation_date']).reset_index(drop=True)

In [5]:
train_index, valid_index = train_test_split(np.arange(len(train)), test_size=0.1, random_state=519)

In [6]:
train.loc[train_index, 'activation_date'].dt.weekday.value_counts()

0    207048
6    205318
1    201131
2    196631
3    191293
4    176527
5    175133
Name: activation_date, dtype: int64

In [7]:
train.loc[valid_index, 'activation_date'].dt.weekday.value_counts()

0    23007
6    22614
1    22289
2    21884
3    21422
4    19599
5    19528
Name: activation_date, dtype: int64

In [8]:
test.loc[:, 'activation_date'].dt.weekday.value_counts()

2    81888
1    81114
0    80191
3    77177
4    70366
6    58909
5    58793
Name: activation_date, dtype: int64

One more thing about the approach that I haven't mentioned yet is that we will have quite some NaN values because not every ID in `train` and `test` occurs in `train_active` and `test_active`. Let's check how big that problem is.

In [9]:
train[agg_cols].isnull().any(axis=1).sum() / len(train) * 100

22.407185198586692

In [10]:
test[agg_cols].isnull().any(axis=1).sum() / len(test) * 100

24.354198545348694

In [11]:
train.isnull().sum(), test.isnull().sum()

(item_id                      0
 user_id                      0
 region                       0
 city                         0
 parent_category_name         0
 category_name                0
 param_1                  61576
 param_2                 654542
 param_3                 862565
 title                        0
 description             116276
 price                    85362
 item_seq_number              0
 activation_date              0
 user_type                    0
 image                   112588
 image_top_1             112588
 deal_probability             0
 avg_days_up_user        336875
 avg_times_up_user       336875
 med_days_up_user        336875
 med_times_up_user       336875
 n_user_items            336875
 dtype: int64, item_id                      0
 user_id                      0
 region                       0
 city                         0
 parent_category_name         0
 category_name                0
 param_1                  22910
 param_2                 2

We have missing features for 22.41% of train and 24.35% of test data. That's not perfect but certainly acceptable. Onto some more basic feature engineering with ideas from [a great kernel](https://www.kaggle.com/tunguz/bow-meta-text-and-dense-features-lb-0-2241?scriptVersionId=3603709).

In [12]:
import string

In [13]:
count = lambda l1,l2: sum([1 for x in l1 if x in l2])


for df in [train, test]:
    df['description'].fillna('unknowndescription', inplace=True)
    df['title'].fillna('unknowntitle', inplace=True)

    df['weekday'] = pd.to_datetime(df['activation_date']).dt.day
    
    for col in ['description', 'title']:
        df['num_words_' + col] = df[col].apply(lambda comment: len(comment.split()))
        df['num_unique_words_' + col] = df[col].apply(lambda comment: len(set(w for w in comment.split())))

    df['words_vs_unique_title'] = df['num_unique_words_title'] / df['num_words_title'] * 100
    df['words_vs_unique_description'] = df['num_unique_words_description'] / df['num_words_description'] * 100
    
    df['city'] = df['region'] + '_' + df['city']
    df['num_desc_punct'] = df['description'].apply(lambda x: count(x, set(string.punctuation)))
    
    for col in agg_cols:
        df[col].fillna(-1, inplace=True)
    
    for col in ['price', 'image_top_1']:
         df[col].fillna(-1, inplace=True)
    df.loc[df.price > 0, 'price'] = np.log1p(df.loc[df.price > 0, 'price'])
    
    for col in ['param_1', 'param_2', 'param_3']:
         df[col].fillna('khwinkaggle', inplace=True)

In [14]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\khyeh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
train.head(3)

Unnamed: 0,item_id,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,title,...,med_times_up_user,n_user_items,weekday,num_words_description,num_unique_words_description,num_words_title,num_unique_words_title,words_vs_unique_title,words_vs_unique_description,num_desc_punct
0,3b896605f03c,9457abfa0ebf,Самарская область,Самарская область_Самара,Личные вещи,Детская одежда и обувь,Для мальчиков,Верхняя одежда,86-92 см (1-2 года),Зимний комбенизон,...,1.5,5.0,15,4,4,2,2,100.0,100.0,0
1,c81c34d3aac5,0b161da2b144,Иркутская область,Иркутская область_Братск,Бытовая электроника,Телефоны,Аксессуары,Чехлы и плёнки,khwinkaggle,Чехол для айфона 6s,...,2.0,2.0,15,2,2,4,4,100.0,100.0,0
2,a8ab6225dd06,93d527a1ca66,Пермский край,Пермский край_Краснокамск,Личные вещи,"Одежда, обувь, аксессуары",Женская одежда,Верхняя одежда,44–46 (M),Продам кожанную куртку,...,-1.0,-1.0,15,1,1,3,3,100.0,100.0,0


In [16]:
target = 'deal_probability'
predictors = [
    'num_desc_punct', 
    'words_vs_unique_description', 'num_unique_words_description', 'num_unique_words_title', 'num_words_description', 'num_words_title',
    'avg_times_up_user', 'avg_days_up_user', 'n_user_items', 
    'price', 'item_seq_number'
]

categorical = [
    'image_top_1', 'param_1', 'param_2', 'param_3', 
    'city', 'region', 'category_name', 'parent_category_name', 'user_type'
]

predictors = predictors + categorical

## User Features

In [17]:
from copy import deepcopy as cp
col = 'user_id'
user_vc = train[col].append(test[col]).value_counts()
old_user_vc = cp(user_vc)
user_vc = user_vc[user_vc>30]
users = user_vc.index
user_filter = users.isin(train[col]) & users.isin(test[col])

print('Original feature types # = {}, new feature types # = {}'.format(
        len(old_user_vc), pd.Series(user_filter).value_counts()[1]))

print(train.loc[train[col].isin(users[user_filter]), col].value_counts().sort_index())
print(test.loc[test[col].isin(users[user_filter]), col].value_counts().sort_index())
print(train.loc[train[col].isin(users[user_filter]), :].shape[0]/train.shape[0]*100.)
print(test.loc[test[col].isin(users[user_filter]), :].shape[0]/test.shape[0]*100.)

Original feature types # = 1009909, new feature types # = 1206
00051c5a5f57     28
00179aa3c386     41
002fb45acb3f     16
004e5c8b2870     19
00a25ff400ad     35
00c5b94b7a0b     25
010f0729f73a     46
0153902752ed     12
017e9b28eda7     24
018ef9eebba0     16
0193cd5b7517     31
01950439a810     36
01adfb9f336f     45
01c73e264ee9     11
01d2db21def6     51
01dd42c9ca6d     40
02019da6857a     27
0201f4ceb3f6     79
03082624a937    396
0333e0767abf     54
03f616357c02     39
047d27fa4473    133
048095052e86     43
0492af69a4d3     28
04d0805871df     39
057526b08f9f     44
0582d1147bd9     17
0601d1e1b0bb     23
0615f14167f0     60
0665f480fd70     79
               ... 
f9cf1dabe1d0     80
fa0d3d9023cb    221
fab381155b7f     67
fbc9ad428501     40
fc0907cb4003     31
fc34d16e5103    130
fc39a0d34c08     55
fc7623fecf00     56
fca7a7a72a65     28
fcab302a2470     12
fcc94ec6517e      6
fcc9aea60220     33
fcda5accf7f5     30
fcdb2e335e4d     31
fcf1a1645f59     42
fd390da435b4     

In [18]:
train.loc[~train[col].isin(users[user_filter]), col] = 'others_users'
test.loc[~test[col].isin(users[user_filter]), col] = 'others_users'

In [19]:
gp = train.groupby('user_id')

In [20]:
for col in ['price', 'item_seq_number', 'num_desc_punct']:
    mean_mapping, std_mapping = gp[col].mean(), gp[col].std()
    
    train.loc[:, col+'_user_mean'] = train['user_id'].map(mean_mapping).fillna(0.)
    test.loc[:, col+'_user_mean'] = test['user_id'].map(mean_mapping).fillna(0.)
    
    train.loc[:, col+'_user_std'] = train['user_id'].map(std_mapping).fillna(0.)
    test.loc[:, col+'_user_std'] = test['user_id'].map(std_mapping).fillna(0.)
    
    predictors.extend([col+'_user_mean', col+'_user_std'])
    print(col+' processed')

price processed
item_seq_number processed
num_desc_punct processed


In [21]:
train.isnull().sum(), '\n', test.isnull().sum()

(item_id                              0
 user_id                              0
 region                               0
 city                                 0
 parent_category_name                 0
 category_name                        0
 param_1                              0
 param_2                              0
 param_3                              0
 title                                0
 description                          0
 price                                0
 item_seq_number                      0
 activation_date                      0
 user_type                            0
 image                           112588
 image_top_1                          0
 deal_probability                     0
 avg_days_up_user                     0
 avg_times_up_user                    0
 med_days_up_user                     0
 med_times_up_user                    0
 n_user_items                         0
 weekday                              0
 num_words_description                0


In [22]:
for col in categorical:
    type_cnt = gp[col].apply(lambda x: len(set(x)))
    train.loc[:, col+'_user_type_cnt'] = train['user_id'].map(type_cnt).fillna(0.)
    test.loc[:, col+'_user_type_cnt'] = test['user_id'].map(type_cnt).fillna(0.)
    predictors.append(col+'_user_type_cnt')
    print(col+' processed')

image_top_1 processed
param_1 processed
param_2 processed
param_3 processed
city processed
region processed
category_name processed
parent_category_name processed
user_type processed


In [23]:
del gp; gc.collect()

418

In [24]:
from sklearn.model_selection import KFold
fold_num = 10
kf = KFold(fold_num, shuffle=True, random_state=613451)

mean_enc_cols = ['user_id']

for col in mean_enc_cols:    
    
    train.loc[:, col+'_dp_mean_enc'] = np.zeros((train.shape[0],))
    train.loc[:, col+'_dp_std_enc'] = np.zeros((train.shape[0],))
    test.loc[:, col+'_dp_mean_enc'] = np.zeros((test.shape[0],))
    test.loc[:, col+'_dp_std_enc'] = np.zeros((test.shape[0],))
    
    for train_ix, val_ix in kf.split(train):
        tr_X = train.loc[train_ix, :]
    
        gp = tr_X.groupby(col)['deal_probability']
        mapping, mapping_std = gp.mean(), gp.std()
        
        train.loc[val_ix, col+'_dp_mean_enc'] = train.loc[val_ix, col].map(mapping).fillna(0.)
        train.loc[val_ix, col+'_dp_std_enc'] = train.loc[val_ix, col].map(mapping_std).fillna(0.)
        
        test.loc[:, col+'_dp_mean_enc'] += test.loc[:, col].map(mapping).fillna(0.)
        test.loc[:, col+'_dp_std_enc'] += test.loc[:, col].map(mapping_std).fillna(0.)
     
        del gp
        
    test.loc[:, col+'_dp_mean_enc'] /= fold_num
    test.loc[:, col+'_dp_std_enc'] /= fold_num
    print(col + ' processed.')
    predictors.extend([col+'_dp_mean_enc', col+'_dp_std_enc'])

user_id processed.


In [25]:
train.isnull().sum(), '\n', test.isnull().sum()

(item_id                                    0
 user_id                                    0
 region                                     0
 city                                       0
 parent_category_name                       0
 category_name                              0
 param_1                                    0
 param_2                                    0
 param_3                                    0
 title                                      0
 description                                0
 price                                      0
 item_seq_number                            0
 activation_date                            0
 user_type                                  0
 image                                 112588
 image_top_1                                0
 deal_probability                           0
 avg_days_up_user                           0
 avg_times_up_user                          0
 med_days_up_user                           0
 med_times_up_user                

## Text Features

In [26]:
count_vectorizer_title = CountVectorizer(stop_words=stopwords.words('russian'), lowercase=True, min_df=25)

title_counts = count_vectorizer_title.fit_transform(train['title'].append(test['title']))

train_title_counts = title_counts[:len(train)]
test_title_counts = title_counts[len(train):]


count_vectorizer_desc = TfidfVectorizer(stop_words=stopwords.words('russian'), 
                                        lowercase=True, ngram_range=(1, 2),
                                        max_features=15000)

desc_counts = count_vectorizer_desc.fit_transform(train['description'].append(test['description']))

train_desc_counts = desc_counts[:len(train)]
test_desc_counts = desc_counts[len(train):]

train_title_counts.shape, train_desc_counts.shape

((1503424, 16447), (1503424, 15000))

## Encode Categorical Features

In [27]:
for feature in categorical:
    print(f'Transforming {feature}...')
    encoder = LabelEncoder()
    encoder.fit(train[feature].append(test[feature]).astype(str))
    
    train[feature] = encoder.transform(train[feature].astype(str))
    test[feature] = encoder.transform(test[feature].astype(str))

Transforming image_top_1...
Transforming param_1...
Transforming param_2...
Transforming param_3...
Transforming city...
Transforming region...
Transforming category_name...
Transforming parent_category_name...
Transforming user_type...


## Mean Encodings

In [28]:
for feature in categorical:
    gp = train.groupby(feature)[feature].count()
    train.loc[:, feature+'_cnt'] = train[feature].map(gp).fillna(0.)
    test.loc[:, feature+'_cnt'] = test[feature].map(gp).fillna(0.)
    predictors.append(feature+'_cnt')

In [29]:
from sklearn.model_selection import KFold
fold_num = 10
kf = KFold(fold_num, shuffle=True, random_state=411519719)

mean_enc_cols = ['item_seq_number']
mean_enc_cols.extend(categorical)

for col in mean_enc_cols:    
    
    train.loc[:, col+'_dp_mean_enc'] = np.zeros((train.shape[0],))
    train.loc[:, col+'_dp_std_enc'] = np.zeros((train.shape[0],))
    test.loc[:, col+'_dp_mean_enc'] = np.zeros((test.shape[0],))
    test.loc[:, col+'_dp_std_enc'] = np.zeros((test.shape[0],))
    
    for train_ix, val_ix in kf.split(train):
        tr_X = train.loc[train_ix, :]
    
        gp = tr_X.groupby(col)['deal_probability']
        mapping, mapping_std = gp.mean(), gp.std()
        
        train.loc[val_ix, col+'_dp_mean_enc'] = train.loc[val_ix, col].map(mapping).fillna(0.)
        train.loc[val_ix, col+'_dp_std_enc'] = train.loc[val_ix, col].map(mapping_std).fillna(0.)
        
        test.loc[:, col+'_dp_mean_enc'] += test.loc[:, col].map(mapping).fillna(0.)
        test.loc[:, col+'_dp_std_enc'] += test.loc[:, col].map(mapping_std).fillna(0.)
     
        del gp
        
    test.loc[:, col+'_dp_mean_enc'] /= fold_num
    test.loc[:, col+'_dp_std_enc'] /= fold_num
    print(col + ' processed.')
    predictors.extend([col+'_dp_mean_enc', col+'_dp_std_enc'])

item_seq_number processed.
image_top_1 processed.
param_1 processed.
param_2 processed.
param_3 processed.
city processed.
region processed.
category_name processed.
parent_category_name processed.
user_type processed.


In [30]:
train.isnull().sum(), '\n', test.isnull().sum()

(item_id                                  0
 user_id                                  0
 region                                   0
 city                                     0
 parent_category_name                     0
 category_name                            0
 param_1                                  0
 param_2                                  0
 param_3                                  0
 title                                    0
 description                              0
 price                                    0
 item_seq_number                          0
 activation_date                          0
 user_type                                0
 image                               112588
 image_top_1                              0
 deal_probability                         0
 avg_days_up_user                         0
 avg_times_up_user                        0
 med_days_up_user                         0
 med_times_up_user                        0
 n_user_items                   

In [31]:
#train.drop(['image_top_1'], axis=1, inplace=True)
#test.drop(['image_top_1'], axis=1, inplace=True)

In [32]:
x_test = scipy.sparse.hstack([
    test_desc_counts,
    test_title_counts,
    test.loc[:, predictors]
], format='csr')

In [33]:
x_train = scipy.sparse.hstack([
        train_desc_counts,
        train_title_counts,
        train.loc[:, predictors]
    ], format='csr')
y_train = train.loc[:, target].values

In [34]:
import pickle

with open('train_features', 'wb') as handle:
    pickle.dump(x_train, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('test_features', 'wb') as handle:
    pickle.dump(x_test, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('train_text_features', 'wb') as handle:
    pickle.dump(train_desc_counts, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('test_text_features', 'wb') as handle:
    pickle.dump(test_desc_counts, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [35]:
norm_cols = [col for col in predictors if col not in categorical]
ohe_cols = categorical

norm_cols, ohe_cols

(['num_desc_punct',
  'words_vs_unique_description',
  'num_unique_words_description',
  'num_unique_words_title',
  'num_words_description',
  'num_words_title',
  'avg_times_up_user',
  'avg_days_up_user',
  'n_user_items',
  'price',
  'item_seq_number',
  'price_user_mean',
  'price_user_std',
  'item_seq_number_user_mean',
  'item_seq_number_user_std',
  'num_desc_punct_user_mean',
  'num_desc_punct_user_std',
  'image_top_1_user_type_cnt',
  'param_1_user_type_cnt',
  'param_2_user_type_cnt',
  'param_3_user_type_cnt',
  'city_user_type_cnt',
  'region_user_type_cnt',
  'category_name_user_type_cnt',
  'parent_category_name_user_type_cnt',
  'user_type_user_type_cnt',
  'user_id_dp_mean_enc',
  'user_id_dp_std_enc',
  'image_top_1_cnt',
  'param_1_cnt',
  'param_2_cnt',
  'param_3_cnt',
  'city_cnt',
  'region_cnt',
  'category_name_cnt',
  'parent_category_name_cnt',
  'user_type_cnt',
  'item_seq_number_dp_mean_enc',
  'item_seq_number_dp_std_enc',
  'image_top_1_dp_mean_enc',


In [36]:
all_df = pd.concat([train.loc[:, predictors], test.loc[:, predictors]]).reset_index(drop=True)

In [37]:
all_df.isnull().sum().max()

0

In [38]:
from scipy.stats import boxcox

for c in norm_cols:
    val = all_df.loc[all_df[c]>=0, c]
    
    val = (val-val.min())/(val.max()-val.min())
    val, ld = boxcox(val+1e-15)
    all_df.loc[all_df[c]>=0, c] = (val-val.min())/(val.max()-val.min())
    all_df.loc[all_df[c]<0, c] = -1
    
    print('Column {} transformed with optimal lambda value={}'.format(c, ld))
    del val; gc.collect()

Column num_desc_punct transformed with optimal lambda value=0.16754973532037323
Column words_vs_unique_description transformed with optimal lambda value=15.273487501374978
Column num_unique_words_description transformed with optimal lambda value=0.21742276934084376
Column num_unique_words_title transformed with optimal lambda value=0.17827997754465894
Column num_words_description transformed with optimal lambda value=0.20976086495103724
Column num_words_title transformed with optimal lambda value=0.17868148305068648
Column avg_times_up_user transformed with optimal lambda value=0.10044560216572643
Column avg_days_up_user transformed with optimal lambda value=1.0193498930825309
Column n_user_items transformed with optimal lambda value=0.14118194110456056
Column price transformed with optimal lambda value=0.5225520525538597
Column item_seq_number transformed with optimal lambda value=0.12441496162322882
Column price_user_mean transformed with optimal lambda value=0.19602598325930568
Colu

In [39]:
column_filt = all_df.loc[:, predictors].columns.isin(ohe_cols)
pd.Series(column_filt).value_counts()

False    57
True      9
dtype: int64

In [40]:
from scipy.sparse import csr_matrix
from sklearn.preprocessing import OneHotEncoder
ohe_features = csr_matrix(OneHotEncoder(categorical_features=column_filt).fit_transform(all_df.loc[:, predictors]))
del all_df; gc.collect()

48

In [41]:
x_train_ohe_norm = scipy.sparse.hstack([
                        train_desc_counts,
                        train_title_counts,
                        ohe_features[:train.shape[0], :]
                    ], format='csr')

In [42]:
x_test_ohe_norm = scipy.sparse.hstack([
                        test_desc_counts,
                        test_title_counts,
                        ohe_features[train.shape[0]:, :]
                    ], format='csr')

In [43]:
with open('train_ohe_norm_features', 'wb') as handle:
    pickle.dump(x_train_ohe_norm, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('test_ohe_norm_features', 'wb') as handle:
    pickle.dump(x_test_ohe_norm, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [44]:
feature_names = np.hstack([
    count_vectorizer_desc.get_feature_names(),
    count_vectorizer_title.get_feature_names(),
    predictors
])

In [45]:
list(feature_names)

['00',
 '00 16',
 '00 17',
 '00 18',
 '00 19',
 '00 20',
 '00 21',
 '00 22',
 '00 23',
 '00 адресу',
 '00 вс',
 '00 выходные',
 '00 выходных',
 '00 обеда',
 '00 перерыва',
 '00 руб',
 '00 сб',
 '00 час',
 '000',
 '000 000',
 '000 км',
 '000 руб',
 '000 рублей',
 '000р',
 '000руб',
 '001',
 '01',
 '02',
 '02 2017',
 '03',
 '03 17',
 '03 2017',
 '04',
 '04 2017',
 '05',
 '06',
 '07',
 '08',
 '08 00',
 '09',
 '09 00',
 '10',
 '10 00',
 '10 000',
 '10 10',
 '10 100',
 '10 11',
 '10 12',
 '10 15',
 '10 18',
 '10 19',
 '10 20',
 '10 дней',
 '10 дюймов',
 '10 кв',
 '10 кг',
 '10 км',
 '10 лет',
 '10 месяцев',
 '10 метров',
 '10 мин',
 '10 минут',
 '10 минутах',
 '10 мм',
 '10 руб',
 '10 рублей',
 '10 см',
 '10 соток',
 '10 ти',
 '10 тыс',
 '10 шт',
 '10 штук',
 '10 этажного',
 '100',
 '100 000',
 '100 150',
 '100 гарантия',
 '100 гр',
 '100 кв',
 '100 кг',
 '100 км',
 '100 мбит',
 '100 метрах',
 '100 метров',
 '100 мл',
 '100 мм',
 '100 мод',
 '100 оригинал',
 '100 полиэстер',
 '100 руб',
 '1

In [46]:
with open('feature_names', 'wb') as handle:
    pickle.dump(feature_names, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('cat_feature_names', 'wb') as handle:
    pickle.dump(categorical, handle, protocol=pickle.HIGHEST_PROTOCOL)