In [1]:
import pandas as pd
import numpy as np
import gc; gc.enable()

In [2]:
# Columns read from every raw CSV (train/test and their *_active variants).
# The order below is the order passed to pd.read_csv(usecols=...).
usecols = [
    # row keys: used for filtering and the ad-period lookup in process_chunk
    'activation_date', 'item_id',
    # location
    'region', 'city',
    # category hierarchy
    'parent_category_name', 'category_name',
    # free-form ad parameters
    'param_1', 'param_2', 'param_3',
    # raw text, reduced to length / word-count statistics in process_chunk
    'title', 'description',
    # numeric fields
    'price', 'item_seq_number',
]

In [3]:
# Load the per-period ad features: one row per (item_id, activation period),
# keeping only the key, the period start date and the duration_3 value.
ad_period_features = pd.read_csv(
    'ad_period_features.csv',
    usecols=['item_id', 'activation_date', 'duration_3'],
    parse_dates=['activation_date'],
)
ad_period_features.head(3)

Unnamed: 0,item_id,activation_date,duration_3
0,0fceba64da0d,2017-01-01,2
1,e06329e1ce56,2017-01-01,2
2,0d3f55aa2786,2017-01-01,6


In [4]:
# Aggregate duration_3 per item once, then broadcast the per-item values back
# onto every row of that item.
duration_by_item = ad_period_features.groupby('item_id')['duration_3']
periods_per_item = duration_by_item.count()   # number of non-null duration_3 rows per item
days_per_item = duration_by_item.sum()        # summed duration_3 per item

ad_period_features.loc[:, 'ad_total_count'] = ad_period_features['item_id'].map(periods_per_item)
ad_period_features.loc[:, 'days_up'] = ad_period_features['item_id'].map(days_per_item)

In [5]:
# Keep a single row (the first occurrence) per item_id; the per-item aggregates
# are identical on every row of an item, so nothing is lost for them.
ad_period_features = ad_period_features.drop_duplicates(subset=['item_id'])

In [6]:
# Turn the table into an item_id-indexed lookup; the period start date is no
# longer needed once the rows have been collapsed to one per item.
ad_period_features = (
    ad_period_features
    .drop(columns=['activation_date'])
    .set_index('item_id')
)
ad_period_features.head(3)

Unnamed: 0_level_0,duration_3,ad_total_count,days_up
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0fceba64da0d,2,2,12
e06329e1ce56,2,1,2
0d3f55aa2786,6,1,6


In [7]:
# Per-item means, used by process_chunk as fill values for items that have no
# entry in ad_period_features.
days_up_mean = ad_period_features['days_up'].mean()
count_mean = ad_period_features['ad_total_count'].mean()
(days_up_mean, count_mean)

(12.333354017369892, 1.7425203000067666)

In [8]:
def process_chunk(df, period_features=None, days_up_default=None, count_default=None):
    """Clean one chunk of raw ads and derive its model features.

    Steps, in order:
      1. drop rows whose ``activation_date`` or ``item_id`` is null;
      2. fill missing categorical/text values with the placeholder ``'unknown'``
         and missing ``price`` / ``item_seq_number`` with ``-1``;
      3. prefix ``city`` with its region (``'<region>_<city>'``);
      4. for ``title`` and ``description`` add ``<col>_len``, ``<col>_word_cnt``,
         ``<col>_unique_word_cnt`` and ``<col>_unqiue_percent`` (column name
         spelled as-is; it is part of the output schema);
      5. look up ``days_up`` and ``ad_total_count`` by ``item_id``;
      6. drop ``title``, ``description``, ``activation_date`` and ``item_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw chunk containing the columns listed in ``usecols``. Not modified.
    period_features : pandas.DataFrame, optional
        Lookup indexed by item_id with ``days_up`` and ``ad_total_count``
        columns. Defaults to the notebook-level ``ad_period_features``.
    days_up_default, count_default : float, optional
        Fill values for items missing from ``period_features``. Default to the
        notebook-level ``days_up_mean`` and ``count_mean``.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index. ``<col>_unqiue_percent`` is NaN
        where the word count is zero (0/0); callers are expected to fill it.
    """
    # Fall back to the notebook globals so existing one-argument calls still work.
    if period_features is None:
        period_features = ad_period_features
    if days_up_default is None:
        days_up_default = days_up_mean
    if count_default is None:
        count_default = count_mean

    print('Original data shape', df.shape)
    has_keys = df['activation_date'].notnull() & df['item_id'].notnull()
    # reset_index returns a new frame, so the caller's chunk is left untouched.
    df = df.loc[has_keys, :].reset_index(drop=True)
    print('New data shape', df.shape)

    for c in ['region', 'city', 'parent_category_name', 'category_name',
              'title', 'description', 'param_1', 'param_2', 'param_3']:
        df[c] = df[c].fillna('unknown')

    df['city'] = df['region'] + '_' + df['city']

    for c in ['price', 'item_seq_number']:
        df[c] = df[c].fillna(-1)

    n_rows = len(df)
    for col in ['title', 'description']:
        df[col + '_len'] = df[col].apply(len)

        # Tokenise each text once and derive both counts from the same split
        # (the text was previously split twice per row). The 'unknown'
        # placeholder counts as zero words.
        word_cnt = np.zeros(n_rows, dtype=np.int64)
        unique_cnt = np.zeros(n_rows, dtype=np.int64)
        for i, text in enumerate(df[col]):
            if text != 'unknown':
                words = text.split()
                word_cnt[i] = len(words)
                unique_cnt[i] = len(set(words))

        df[col + '_word_cnt'] = word_cnt
        df[col + '_unique_word_cnt'] = unique_cnt
        # 0 / 0 yields NaN here for texts without words.
        df[col + '_unqiue_percent'] = df[col + '_unique_word_cnt'] / df[col + '_word_cnt'] * 100.

    df['days_up'] = df['item_id'].map(period_features['days_up']).fillna(days_up_default)
    df['ad_total_count'] = df['item_id'].map(period_features['ad_total_count']).fillna(count_default)

    return df.drop(columns=['title', 'description', 'activation_date', 'item_id'])

In [9]:
# Build all_df by streaming every input file in 2M-row chunks through
# process_chunk. Row order of the result is: train, test, train_active,
# test_active (the order of `files`).
files = [
    'data/train.csv', 'data/test.csv', 'data/train_active.csv', 'data/test_active.csv'
]

# Processed chunks are collected and concatenated once at the end; calling
# pd.concat inside the loop re-copies the whole accumulated frame per chunk.
processed_chunks = []

for f in files:
    print('Processing file:', f)
    for chunk in pd.read_csv(f, usecols=usecols, chunksize=2000000, parse_dates=['activation_date']):
        if f == 'data/train.csv':
            # The sort is applied per chunk, so it is a full sort of train.csv
            # only while the file fits in a single chunk (true for the
            # 1,503,424-row file in the captured run).
            # NOTE(review): the saved train split is therefore in
            # activation_date order, not file order; anything paired with it
            # row-by-row presumably has to apply the same sort - confirm.
            chunk = chunk.sort_values('activation_date').reset_index(drop=True)
            print('Index reset!')

        processed_chunks.append(process_chunk(chunk))
        del chunk
        gc.collect()

# Single concat with a fresh 0..n-1 index; stays None when nothing was read,
# matching the previous behavior.
all_df = pd.concat(processed_chunks, ignore_index=True) if processed_chunks else None
del processed_chunks
gc.collect()

Processing file: data/train.csv
Index reset!
Original data shape (1503424, 13)
New data shape (1503424, 13)
Processing file: data/test.csv
Original data shape (508438, 13)
New data shape (508438, 13)
Processing file: data/train_active.csv
Original data shape (2000000, 13)
New data shape (1982514, 13)
Original data shape (2000000, 13)
New data shape (1982505, 13)
Original data shape (2000000, 13)
New data shape (1982159, 13)
Original data shape (2000000, 13)
New data shape (1982337, 13)
Original data shape (2000000, 13)
New data shape (1982509, 13)
Original data shape (2000000, 13)
New data shape (1982352, 13)
Original data shape (2000000, 13)
New data shape (1982461, 13)
Original data shape (129821, 13)
New data shape (128661, 13)
Processing file: data/test_active.csv
Original data shape (2000000, 13)
New data shape (1982305, 13)
Original data shape (2000000, 13)
New data shape (1981914, 13)
Original data shape (2000000, 13)
New data shape (1982109, 13)
Original data shape (2000000, 13

In [10]:
# Sanity check: total processed rows across all four files and the feature column count.
all_df.shape

(28726910, 19)

In [11]:
# Count missing values per column; only the *_unqiue_percent ratios can be
# NaN at this point (0 words / 0 words in process_chunk).
all_df.isna().sum()

region                               0
city                                 0
parent_category_name                 0
category_name                        0
param_1                              0
param_2                              0
param_3                              0
price                                0
item_seq_number                      0
title_len                            0
title_word_cnt                       0
title_unique_word_cnt                0
title_unqiue_percent              1020
description_len                      0
description_word_cnt                 0
description_unique_word_cnt          0
description_unqiue_percent     1141141
days_up                              0
ad_total_count                       0
dtype: int64

In [12]:
# Texts with zero words produce a 0/0 ratio; treat their unique-word share as 0.
for ratio_col in ('title_unqiue_percent', 'description_unqiue_percent'):
    all_df[ratio_col] = all_df[ratio_col].fillna(0.)

In [13]:
# Preview only the first rows instead of rendering the full ~28.7M-row frame.
all_df.head(10)

Unnamed: 0,region,city,parent_category_name,category_name,param_1,param_2,param_3,price,item_seq_number,title_len,title_word_cnt,title_unique_word_cnt,title_unqiue_percent,description_len,description_word_cnt,description_unique_word_cnt,description_unqiue_percent,days_up,ad_total_count
0,Самарская область,Самарская область_Самара,Личные вещи,Детская одежда и обувь,Для мальчиков,Верхняя одежда,86-92 см (1-2 года),500.0,12.0,17,2,2,100.0,28,4,4,100.000000,12.333354,1.74252
1,Иркутская область,Иркутская область_Братск,Бытовая электроника,Телефоны,Аксессуары,Чехлы и плёнки,unknown,100.0,5.0,19,4,4,100.0,15,2,2,100.000000,12.333354,1.74252
2,Пермский край,Пермский край_Краснокамск,Личные вещи,"Одежда, обувь, аксессуары",Женская одежда,Верхняя одежда,44–46 (M),1500.0,32.0,22,3,3,100.0,7,0,0,0.000000,12.333354,1.74252
3,Оренбургская область,Оренбургская область_Оренбург,Животные,Товары для животных,unknown,unknown,unknown,350.0,17.0,24,4,4,100.0,112,18,17,94.444444,12.333354,1.74252
4,Ярославская область,Ярославская область_Ярославль,Личные вещи,"Одежда, обувь, аксессуары",Мужская одежда,Другое,unknown,1500.0,7.0,24,3,3,100.0,60,10,10,100.000000,12.333354,1.74252
5,Иркутская область,Иркутская область_Иркутск,Личные вещи,"Одежда, обувь, аксессуары",Женская одежда,Верхняя одежда,46–48 (L),800.0,4.0,4,1,1,100.0,7,0,0,0.000000,12.333354,1.74252
6,Иркутская область,Иркутская область_Усть-Ордынский,Бытовая электроника,Телефоны,Samsung,unknown,unknown,399.0,71051.0,20,2,2,100.0,153,29,22,75.862069,12.333354,1.74252
7,Тульская область,Тульская область_Теплое,Недвижимость,Квартиры,Продам,1,Вторичка,950000.0,7.0,28,6,6,100.0,33,4,4,100.000000,12.333354,1.74252
8,Оренбургская область,Оренбургская область_Бузулук,Животные,Кошки,Другая,unknown,unknown,-1.0,19.0,13,2,2,100.0,48,8,8,100.000000,12.333354,1.74252
9,Башкортостан,Башкортостан_Нижнетроицкий,Для дома и дачи,Продукты питания,unknown,unknown,unknown,1.0,5.0,8,1,1,100.0,27,4,4,100.000000,12.333354,1.74252


In [14]:
from sklearn.preprocessing import LabelEncoder

# Replace each categorical string column with integer codes. A fresh encoder is
# fitted per column on the combined train/test/active rows, so the same string
# gets the same code in every saved split.
for feature in ['region', 'city', 'parent_category_name', 'category_name', 'param_1', 'param_2', 'param_3']:
    print(f'Transforming {feature}...')
    encoder = LabelEncoder()
    # Assign the column directly rather than through .loc[:, feature]: since
    # pandas 2.0 a .loc[:, col] assignment writes into the existing object
    # column in place, leaving the integer codes stored with dtype=object.
    # Plain column assignment replaces the column and keeps the integer dtype.
    all_df[feature] = encoder.fit_transform(all_df[feature].astype(str))

Transforming region...
Transforming city...
Transforming parent_category_name...
Transforming category_name...
Transforming param_1...
Transforming param_2...
Transforming param_3...


In [16]:
# Distribution of days_up; the non-integer value is the mean used as the fill
# for items without ad-period data.
all_df['days_up'].value_counts()

19.000000    6569380
18.000000    3793028
6.000000     3104271
13.000000    2988521
12.333354    2011950
15.000000    1743718
17.000000    1642904
14.000000    1149528
16.000000     938920
12.000000     639050
11.000000     489844
7.000000      458120
8.000000      445642
1.000000      440074
10.000000     425869
9.000000      418766
2.000000      376037
5.000000      373865
3.000000      358097
4.000000      353788
0.000000        5538
Name: days_up, dtype: int64

In [17]:
# Distribution of ad_total_count; the non-integer value is the mean used as the
# fill for items without ad-period data.
all_df['ad_total_count'].value_counts()

2.00000     11652555
1.00000      8304201
3.00000      5972047
1.74252      2011950
4.00000       781265
5.00000         3516
6.00000          465
13.00000         329
14.00000         182
9.00000          128
7.00000          104
8.00000           58
12.00000          33
15.00000          30
10.00000          25
11.00000          16
19.00000           4
16.00000           2
Name: ad_total_count, dtype: int64

In [18]:
# Number of train / test rows that are actually present in all_df.
# process_chunk drops rows with a null activation_date or item_id, so the same
# filter is applied here instead of counting raw CSV lines; otherwise a single
# dropped row would shift the split boundaries used when pickling. For files
# without such rows this equals the raw row count.
train_len, test_len = (
    int(
        pd.read_csv(path, usecols=['item_id', 'activation_date'])
        .notnull()
        .all(axis=1)
        .sum()
    )
    for path in ('data/train.csv', 'data/test.csv')
)
train_len, test_len

(1503424, 508438)

In [19]:
import pickle

In [20]:
# all_df is laid out as [train | test | train_active + test_active]; cut it at
# the two boundaries and pickle each part separately.
# NOTE(review): the file names do not obviously match their contents
# ('active_train.pickle' receives all *_active rows) - confirm against readers.
test_start = train_len
active_start = train_len + test_len

# Train rows: labels 0 .. train_len-1 (.loc slicing is end-inclusive).
with open('data/active_test_train.pickle', 'wb') as out_file:
    train_part = all_df.loc[:test_start - 1, :]
    pickle.dump(train_part, out_file)

# Test rows, re-indexed from 0.
with open('data/active_test_test.pickle', 'wb') as out_file:
    test_part = all_df.loc[test_start:active_start - 1, :].reset_index(drop=True)
    pickle.dump(test_part, out_file)

# Everything after train and test (the *_active files), re-indexed from 0.
with open('data/active_train.pickle', 'wb') as out_file:
    active_part = all_df.loc[active_start:, :].reset_index(drop=True)
    pickle.dump(active_part, out_file)