In [1]:
import pandas as pd
import time
import gc
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

from sklearn.preprocessing import LabelEncoder
import itertools
import pickle
from copy import deepcopy
from collections import Counter
from tqdm import tqdm

from myutils import timer, reduce_mem_usage

categorical = ["user_id","region","city","parent_category_name","category_name","user_type","param_1","param_2","param_3"]
categorical_ex = categorical + ['param_123', 'weekofday']
aggfeats = ['region', 'city', 'parent_category_name', 'category_name', 'user_type', 'weekofday', 'param_1', 'param_123']
nonaggfeats = list(set(categorical_ex) - set(aggfeats))

lentrain = 1503424
lentest = 508438
lentrainactive = 14129821
lentestactive = 12824068

In [2]:
def tcount(x):
    return x.value_counts().index[0]    
    
def count(df, group_cols, suffix='numcount', agg_type='uint32'):
    aggname = '_'.join(group_cols) + '_' + suffix
    gp = df[group_cols].groupby(group_cols).size().rename(aggname).to_frame().reset_index()
    df = df.merge(gp, on=group_cols, how='left')
    df[aggname] = df[aggname].astype(agg_type)
    del gp; gc.collect()
    return df

def uniquecount(df, group_cols, target, suffix='numunique', agg_type='uint32'):
    aggname = '_'.join(group_cols) + '_'+ target + '_' + suffix
    gp = df[group_cols+[target]].groupby(group_cols).agg({target:'nunique'}).reset_index().rename(columns={target:aggname})
    #gp = df[group_cols+[target]].groupby(group_cols)[target].nunique().reset_index().rename(columns={target:aggname})
    df = df.merge(gp, on=group_cols, how='left')
    df[aggname] = df[aggname].astype(agg_type)
    del gp; gc.collect()
    return df

def cumuratecount( df, group_cols, target, suffix='cumcount', agg_type='uint32'):
    aggname = '_'.join(group_cols) + '_'+ target + '_' + suffix
    gp = df[group_cols+[target]].groupby(group_cols).agg({target: 'cumcount'})
    #gp = df[group_cols+[target]].groupby(group_cols)[target].cumcount()
    df[aggname] = gp.values
    df[aggname] = df[aggname].astype(agg_type)
    del gp; gc.collect()
    return df

def topcount(df, group_cols, target, suffix='topcount'):
    aggname = '_'.join(group_cols) + '_'+ target + '_' + suffix
    gp = df[group_cols+[target]].groupby(group_cols).agg({target: tcount}).reset_index().rename(columns={target:aggname})
    #gp = df[group_cols+[target]].groupby(group_cols).agg(lambda x: x.value_counts().index[0]).reset_index().rename(columns={target:aggname})
    df = df.merge(gp, on=group_cols, how='left')
    del gp; gc.collect()
    return df

def mean(df, group_cols, target, suffix='mean', agg_type='float32'):
    aggname = '_'.join(group_cols) + '_'+ target + '_' + suffix
    gp = df[group_cols+[target]].groupby(group_cols).agg({target: 'mean'}).reset_index().rename(columns={target:aggname})
    #gp = df[group_cols+[target]].groupby(group_cols)[target].mean().reset_index().rename(columns={target:aggname})
    df = df.merge(gp, on=group_cols, how='left')
    df[aggname] = df[aggname].astype(agg_type)
    del gp; gc.collect()
    return df

def var(df, group_cols, target, suffix='var', agg_type='float32'):
    aggname = '_'.join(group_cols) + '_'+ target + '_' + suffix
    gp = df[group_cols+[target]].groupby(group_cols).agg({target: 'var'}).reset_index().rename(columns={target:aggname})
    #gp = df[group_cols+[target]].groupby(group_cols)[target].var().reset_index().rename(columns={target:aggname})
    df = df.merge(gp, on=group_cols, how='left')
    df[aggname] = df[aggname].astype(agg_type)
    del gp; gc.collect()
    return df

def aggtransform(df, group_cols, targets):
    suffix = ['numunique', 'topcount', 'mean', 'var']
    #aggtypes = ['uint32', '', 'float32', 'float32']
    aggfunc = ['nunique', tcount, 'mean', 'var']
    aggfuncstr = ['nunique', 'tcount', 'mean', 'var']
    
    aggarg = {tar: aggfunc for tar in targets}
    aggnamekeys = []
    for tar in targets:
        for agf in aggfuncstr:
            aggnamekeys.append((tar, agf))
    aggnames = {(tar, agf): '_'.join(group_cols) + '_'+ tar + '_' + agf for tar, agf in aggnamekeys}

    gp = df[group_cols+targets].groupby(group_cols).agg(aggarg).reset_index()
    df = df.merge(gp, on=group_cols, how='left')
    df = df.rename(columns=aggnames)
    del gp; gc.collect()
    return df


In [2]:
train = pd.read_csv('../input/train.csv', usecols=categorical+['activation_date'], parse_dates=['activation_date'])
test = pd.read_csv('../input/test.csv', usecols=categorical+['activation_date'], parse_dates=['activation_date'])
train_active = pd.read_csv('../input/train_active.csv', usecols=categorical+['activation_date'], parse_dates=['activation_date'])
test_active = pd.read_csv('../input/test_active.csv', usecols=categorical+['activation_date'], parse_dates=['activation_date'])

print(train.shape)
print(test.shape)
print(train_active.shape)
print(test_active.shape)

df = pd.concat([train, test, train_active, test_active])
print(df.shape)
del train, test, train_active, test_active; gc.collect()
df.head()

(1503424, 10)
(508438, 10)
(14129821, 10)
(12824068, 10)
(28965751, 10)


Unnamed: 0,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,activation_date,user_type
0,e00f8ff2eaf9,Свердловская область,Екатеринбург,Личные вещи,Товары для детей и игрушки,Постельные принадлежности,,,2017-03-28,Private
1,39aeb48f0017,Самарская область,Самара,Для дома и дачи,Мебель и интерьер,Другое,,,2017-03-26,Private
2,91e2f88dd6e3,Ростовская область,Ростов-на-Дону,Бытовая электроника,Аудио и видео,"Видео, DVD и Blu-ray плееры",,,2017-03-20,Private
3,bf5cccea572d,Татарстан,Набережные Челны,Личные вещи,Товары для детей и игрушки,Автомобильные кресла,,,2017-03-25,Company
4,ef50846afc0b,Волгоградская область,Волгоград,Транспорт,Автомобили,С пробегом,ВАЗ (LADA),2110.0,2017-03-16,Private


In [3]:
for col in categorical:
    df[col] = df[col].fillna('missing')
    
df['param_123'] = (df['param_1']+' '+df['param_2']+' '+df['param_3']).astype(str)
df.head()

Unnamed: 0,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,activation_date,user_type,param_123
0,e00f8ff2eaf9,Свердловская область,Екатеринбург,Личные вещи,Товары для детей и игрушки,Постельные принадлежности,missing,missing,2017-03-28,Private,Постельные принадлежности missing missing
1,39aeb48f0017,Самарская область,Самара,Для дома и дачи,Мебель и интерьер,Другое,missing,missing,2017-03-26,Private,Другое missing missing
2,91e2f88dd6e3,Ростовская область,Ростов-на-Дону,Бытовая электроника,Аудио и видео,"Видео, DVD и Blu-ray плееры",missing,missing,2017-03-20,Private,"Видео, DVD и Blu-ray плееры missing missing"
3,bf5cccea572d,Татарстан,Набережные Челны,Личные вещи,Товары для детей и игрушки,Автомобильные кресла,missing,missing,2017-03-25,Company,Автомобильные кресла missing missing
4,ef50846afc0b,Волгоградская область,Волгоград,Транспорт,Автомобили,С пробегом,ВАЗ (LADA),2110,2017-03-16,Private,С пробегом ВАЗ (LADA) 2110


In [4]:
for col in categorical+['param_123']:
    df[col] = LabelEncoder().fit_transform(df[col])
df.head()

Unnamed: 0,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,activation_date,user_type,param_123
0,3817460,21,472,6,43,252,134,1654,2017-03-28,1,1013
1,983203,19,1340,4,23,123,134,1654,2017-03-26,1,309
2,2487782,18,1315,2,3,85,134,1654,2017-03-20,1,104
3,3261852,23,973,6,43,38,134,1654,2017-03-25,0,42
4,4077855,6,326,8,1,281,146,63,2017-03-16,1,2893


In [5]:
df['weekofday'] = df['activation_date'].dt.weekday
#print(Counter(df['weekofday']))

df.drop(['activation_date'], axis=1, inplace=True)

In [6]:
df.head()

Unnamed: 0,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,user_type,param_123,weekofday
0,3817460,21,472,6,43,252,134,1654,1,1013,1.0
1,983203,19,1340,4,23,123,134,1654,1,309,6.0
2,2487782,18,1315,2,3,85,134,1654,1,104,0.0
3,3261852,23,973,6,43,38,134,1654,0,42,5.0
4,4077855,6,326,8,1,281,146,63,1,2893,3.0


In [7]:
df = reduce_mem_usage(df)
#train = df[:lentrain]
#test = df[lentrain:lentrain+lentest]
train_active = df[lentrain+lentest: lentrain+lentest+lentrainactive]
test_active = df[lentrain+lentest+lentrainactive:lentrain+lentest+lentrainactive+lentestactive]
#print(train.shape)
#print(test.shape)
print(train_active.shape)
print(test_active.shape)



Memory usage of dataframe is 2651.89 MB
Memory usage after optimization is: 828.72 MB
Decreased by 68.8%
(14129821, 11)
(12824068, 11)


In [8]:
#train.to_feather('../features/train/categorical_features_train.feather')
#test.to_feather('../features/test/categorical_features_test.feather')
train_active.to_feather('../features/extra/categorical_features_train_active.feather')
test_active.to_feather('../features/extra/categorical_features_test_active.feather')

print('done')

done


In [37]:
with timer('Agg by Count'):
    for i in range(1, 5):
        train = pd.read_feather('../features/train/categorical_features_train.feather')
        test = pd.read_feather('../features/test/categorical_features_test.feather')
        print(train.shape)
        print(test.shape)
        df = pd.concat([train, test])
        df.drop(nonaggfeats, axis=1, inplace=True)
        del train, test; gc.collect()
        print(df.shape)
        
        for comb in tqdm(list(itertools.combinations(aggfeats, i))):
            df = count(df, list(comb))
            gc.collect()
        df.drop(aggfeats, axis=1, inplace=True)
        df = reduce_mem_usage(df)
        train = df[:lentrain]
        test = df[lentrain:]
        train.reset_index(drop=True, inplace=True)
        test.reset_index(drop=True, inplace=True)
        print(train.shape)
        print(test.shape)
        train.to_feather('../features/train/Single_agg_'+str(i)+'_train.feather')
        test.to_feather('../features/test/Single_agg_'+str(i)+'_test.feather')
        print('save done ',str(i))
        del train, test, df; gc.collect()


(1503424, 11)
(508438, 11)
(2011862, 8)


100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:03<00:00,  2.32it/s]


Memory usage of dataframe is 76.75 MB
Memory usage after optimization is: 76.75 MB
Decreased by 0.0%
(1503424, 8)
(508438, 8)
save done  1
(1503424, 11)
(508438, 11)
(2011862, 8)


100%|██████████████████████████████████████████████████████████████████████████████████| 28/28 [00:19<00:00,  1.45it/s]


Memory usage of dataframe is 230.24 MB
Memory usage after optimization is: 168.84 MB
Decreased by 26.7%
(1503424, 28)
(508438, 28)
save done  2
(1503424, 11)
(508438, 11)
(2011862, 8)


100%|██████████████████████████████████████████████████████████████████████████████████| 56/56 [00:55<00:00,  1.01it/s]


Memory usage of dataframe is 445.13 MB
Memory usage after optimization is: 249.43 MB
Decreased by 44.0%
(1503424, 56)
(508438, 56)
save done  3
(1503424, 11)
(508438, 11)
(2011862, 8)


100%|██████████████████████████████████████████████████████████████████████████████████| 70/70 [01:25<00:00,  1.22s/it]


Memory usage of dataframe is 552.57 MB
Memory usage after optimization is: 287.80 MB
Decreased by 47.9%
(1503424, 70)
(508438, 70)
save done  4
[Agg by Count] done in 183 s


In [3]:
with open('./Golden_Count_aggs.pickle', 'rb') as f:
    count_agg_list = pickle.load(f)
    
print(count_agg_list)

train = pd.read_feather('../features/train/categorical_features_train.feather')
test = pd.read_feather('../features/test/categorical_features_test.feather')
train_active = pd.read_feather('../features/train_active/categorical_features_train_active.feather')
test_active = pd.read_feather('../features/test_active/categorical_features_test_active.feather')
print(train.shape)
print(test.shape)
print(train_active.shape)
print(test_active.shape)

df = pd.concat([train, test, train_active, test_active])
df.drop(nonaggfeats, axis=1, inplace=True)
print(df.shape)

for group_cols in tqdm(count_agg_list):
    aggname = '-'.join(group_cols) + '_numcount'
    gp = df[group_cols].groupby(group_cols).size().rename(aggname).to_frame().reset_index()
    df = df.merge(gp, on=group_cols, how='left')
    df[aggname] = df[aggname].fillna(df[aggname].max())
    df[aggname] = df[aggname].astype('uint32')
    del gp; gc.collect()

print(df.shape)
df.head()

df = reduce_mem_usage(df)
df.drop(aggfeats, axis=1, inplace=True)
train = df[:lentrain]
test = df[lentrain:lentrain+lentest]
train_active = df[lentrain+lentest: lentrain+lentest+lentrainactive]
test_active = df[lentrain+lentest+lentrainactive: lentrain+lentest+lentrainactive+lentestactive]

train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)
train_active.reset_index(drop=True, inplace=True)
test_active.reset_index(drop=True, inplace=True)

print(train.shape)
print(test.shape)
print(train_active.shape)
print(test_active.shape)

train.to_feather('../features/train/Agg_count_Golden_features_train.feather')
test.to_feather('../features/test/Agg_count_Golden_features_test.feather')
train_active.to_feather('../features/train_active/Agg_count_Golden_features_train_active.feather')
test_active.to_feather('../features/test_active/Agg_count_Golden_features_test_active.feather')

print('done')

[['region', 'category_name', 'user_type', 'weekofday'], ['parent_category_name', 'param_123'], ['region', 'category_name'], ['param_123'], ['category_name'], ['category_name', 'param_1'], ['parent_category_name', 'user_type', 'weekofday'], ['parent_category_name', 'weekofday'], ['city'], ['city', 'parent_category_name', 'user_type', 'weekofday'], ['parent_category_name', 'category_name', 'weekofday'], ['user_type', 'param_1'], ['user_type'], ['user_type', 'param_123'], ['category_name', 'user_type'], ['parent_category_name'], ['region', 'parent_category_name'], ['city', 'user_type', 'weekofday', 'param_123'], ['parent_category_name', 'category_name'], ['category_name', 'weekofday'], ['weekofday', 'param_1'], ['parent_category_name', 'category_name', 'user_type'], ['parent_category_name', 'param_1'], ['param_1'], ['parent_category_name', 'user_type'], ['city', 'weekofday'], ['region', 'parent_category_name', 'weekofday'], ['region', 'parent_category_name', 'user_type', 'weekofday'], ['c

100%|██████████████████████████████████████████████████████████████████████████████████| 35/35 [09:08<00:00, 15.66s/it]


(28965751, 43)
Memory usage of dataframe is 4475.07 MB
Memory usage after optimization is: 4419.82 MB
Decreased by 1.2%
(1503424, 35)
(508438, 35)
(14129821, 35)
(12824068, 35)
done


In [4]:
with open('./Golden_Uniqueount_aggs.pickle', 'rb') as f:
    count_agg_list = pickle.load(f)
    
print(count_agg_list)

train = pd.read_feather('../features/train/categorical_features_train.feather')
test = pd.read_feather('../features/test/categorical_features_test.feather')
train_active = pd.read_feather('../features/train_active/categorical_features_train_active.feather')
test_active = pd.read_feather('../features/test_active/categorical_features_test_active.feather')
print(train.shape)
print(test.shape)
print(train_active.shape)
print(test_active.shape)

df = pd.concat([train, test, train_active, test_active])
df.drop(nonaggfeats, axis=1, inplace=True)
print(df.shape)

for cols in tqdm(count_agg_list):
    group_cols = cols[: -1]
    target = cols[-1]
    print(group_cols, target, cols)
    aggname = '-'.join(cols) + '_numunique'
    gp = df[cols].groupby(cols)[target].nunique().rename(aggname).to_frame().reset_index()
    df = df.merge(gp, on=group_cols, how='left')
    df[aggname] = df[aggname].fillna(df[aggname].max())
    df[aggname] = df[aggname].astype('uint32')
    del gp; gc.collect()

print(df.shape)
df.head()

df = reduce_mem_usage(df)
df.drop(aggfeats, axis=1, inplace=True)
train = df[:lentrain]
test = df[lentrain:lentrain+lentest]
train_active = df[lentrain+lentest: lentrain+lentest+lentrainactive]
train_active = df[lentrain+lentest+lentrainactive: lentrain+lentest+lentrainactive+lentestactive]

train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)
train_active.reset_index(drop=True, inplace=True)
test_active.reset_index(drop=True, inplace=True)

print(train.shape)
print(test.shape)
print(train_active.shape)
print(test_active.shape)

train.to_feather('../features/train/Agg_numunique_Golden_features_train.feather')
test.to_feather('../features/test/Agg_numunique_Golden_features_test.feather')
train_active.to_feather('../features/train_active/Agg_numunique_Golden_features_train_active.feather')
test_active.to_feather('../features/test_active/Agg_numunique_Golden_features_test_active.feather')

print('done')

[['region', 'category_name', 'param_123', 'city'], ['region', 'category_name', 'param_123'], ['parent_category_name', 'category_name', 'user_type', 'city'], ['parent_category_name', 'user_type', 'city'], ['param_123', 'city'], ['city', 'parent_category_name', 'user_type', 'param_123'], ['parent_category_name', 'user_type', 'category_name'], ['city', 'category_name', 'user_type', 'param_123'], ['region', 'category_name', 'user_type', 'param_123'], ['parent_category_name', 'category_name', 'param_123', 'city'], ['user_type', 'param_123', 'city'], ['region', 'city', 'user_type', 'param_123'], ['category_name', 'param_123'], ['region', 'category_name', 'city'], ['city', 'param_123'], ['region', 'user_type', 'param_123', 'city'], ['parent_category_name', 'user_type', 'param_123', 'city'], ['parent_category_name', 'category_name', 'user_type', 'param_123'], ['region', 'parent_category_name', 'user_type', 'category_name'], ['category_name', 'user_type', 'param_123', 'city'], ['region', 'paren

  0%|                                                                                           | 0/33 [00:00<?, ?it/s]

['region', 'category_name', 'param_123'] city ['region', 'category_name', 'param_123', 'city']


  3%|██▍                                                                             | 1/33 [15:02<8:01:29, 902.80s/it]

['region', 'category_name'] param_123 ['region', 'category_name', 'param_123']


MemoryError: 

In [5]:
target = 'price'
for i in range(1, 5):
    train = pd.read_feather('../features/train/categorical_features_train.feather')
    test = pd.read_feather('../features/test/categorical_features_test.feather')

    trainp = pd.read_csv('../input/train.csv', usecols=[target])
    testp = pd.read_csv('../input/test.csv', usecols=[target])
    trainp.fillna(trainp.mean(), inplace=True)
    testp.fillna(testp.mean(), inplace=True)
    train = pd.concat([train, trainp], axis=1)
    test = pd.concat([test, testp], axis=1)
    del trainp, testp; gc.collect()
    print(train.shape)
    print(test.shape)

    df = pd.concat([train, test])
    df.drop(nonaggfeats, axis=1, inplace=True)
    del train, test; gc.collect()
    print(df.shape)

    for comb in tqdm(list(itertools.combinations(aggfeats, i))):
        group_cols = list(comb)
        aggname = '-'.join(group_cols+[target]) + '-mean'
        gp = df[group_cols+[target]].groupby(group_cols)[target].mean().rename(aggname).to_frame().reset_index()
        df = df.merge(gp, on=group_cols, how='left')
        df[aggname] = df[aggname].fillna(df[aggname].mean())
        df[aggname] = df[aggname].astype('float32')
        gc.collect()
    df.drop(aggfeats+[target], axis=1, inplace=True)
    df = reduce_mem_usage(df)
    train = df[:lentrain]
    test = df[lentrain:]
    train.reset_index(drop=True, inplace=True)
    test.reset_index(drop=True, inplace=True)
    print(train.shape)
    print(test.shape)
    train.to_feather('../features/train/Agg_by_price_mean_'+str(i)+'_train.feather')
    test.to_feather('../features/test/Agg_by_price_mean_'+str(i)+'_test.feather')
    print('save done ',str(i))
    del train, test, df; gc.collect()



(1503424, 12)
(508438, 12)
(2011862, 9)


100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:03<00:00,  2.16it/s]


Memory usage of dataframe is 76.75 MB
Memory usage after optimization is: 76.75 MB
Decreased by 0.0%
(1503424, 8)
(508438, 8)
save done  1
(1503424, 12)
(508438, 12)
(2011862, 9)


100%|██████████████████████████████████████████████████████████████████████████████████| 28/28 [00:21<00:00,  1.30it/s]


Memory usage of dataframe is 230.24 MB
Memory usage after optimization is: 230.24 MB
Decreased by 0.0%
(1503424, 28)
(508438, 28)
save done  2
(1503424, 12)
(508438, 12)
(2011862, 9)


100%|██████████████████████████████████████████████████████████████████████████████████| 56/56 [01:03<00:00,  1.14s/it]


Memory usage of dataframe is 445.13 MB
Memory usage after optimization is: 445.13 MB
Decreased by 0.0%
(1503424, 56)
(508438, 56)
save done  3
(1503424, 12)
(508438, 12)
(2011862, 9)


100%|██████████████████████████████████████████████████████████████████████████████████| 70/70 [01:39<00:00,  1.42s/it]


Memory usage of dataframe is 552.57 MB
Memory usage after optimization is: 552.57 MB
Decreased by 0.0%
(1503424, 70)
(508438, 70)
save done  4


In [None]:
with open('./Golden_Price_mean_agg.pickle', 'rb') as f:
    agg_list = pickle.load(f)
    
print(agg_list)
target = 'price'

train = pd.read_feather('../features/train/categorical_features_train.feather')
test = pd.read_feather('../features/test/categorical_features_test.feather')
trainp = pd.read_csv('../input/train.csv', usecols=[target])
testp = pd.read_csv('../input/test.csv', usecols=[target])
trainp.fillna(trainp.mean(), inplace=True)
testp.fillna(testp.mean(), inplace=True)
train = pd.concat([train, trainp], axis=1)
test = pd.concat([test, testp], axis=1)
del trainp, testp; gc.collect()

train_active = pd.read_feather('../features/train_active/categorical_features_train_active.feather')
test_active = pd.read_feather('../features/test_active/categorical_features_test_active.feather')
trainap = pd.read_csv('../input/train_active.csv', usecols=[target])
testap = pd.read_csv('../input/test_active.csv', usecols=[target])
trainap.fillna(trainap.mean(), inplace=True)
testap.fillna(testap.mean(), inplace=True)
train_active = pd.concat([train_active, trainap], axis=1)
test_active = pd.concat([test_active, testap], axis=1)
del trainap, testap; gc.collect()

print(train.shape)
print(test.shape)
print(train_active.shape)
print(test_active.shape)

df = pd.concat([train, test, train_active, test_active])
df.drop(nonaggfeats, axis=1, inplace=True)
print(df.shape)

for cols in tqdm(agg_list):
    group_cols = cols[:-1]
    assert target == cols[-1]
    print(group_cols)
    aggname = '-'.join(group_cols+[target]) + '-mean'
    gp = df[group_cols+[target]].groupby(group_cols)[target].mean().rename(aggname).to_frame().reset_index()
    df = df.merge(gp, on=group_cols, how='left')
    df[aggname] = df[aggname].fillna(df[aggname].mean())
    df[aggname] = df[aggname].astype('float32')
    del gp; gc.collect()

print(df.shape)
df.head()

df = reduce_mem_usage(df)
df.drop(aggfeats+['price'], axis=1, inplace=True)
train = df[:lentrain]
test = df[lentrain:lentrain+lentest]
train_active = df[lentrain+lentest: lentrain+lentest+lentrainactive]
test_active = df[lentrain+lentest+lentrainactive: lentrain+lentest+lentrainactive+lentestactive]

train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)
train_active.reset_index(drop=True, inplace=True)
test_active.reset_index(drop=True, inplace=True)

print(train.shape)
print(test.shape)
print(train_active.shape)
print(test_active.shape)

train.to_feather('../features/train/Agg_Price_mean_Golden_features_train.feather')
test.to_feather('../features/test/Agg_Price_mean_Golden_features_test.feather')
train_active.to_feather('../features/train_active/Agg_Price_mean_Golden_features_train_active.feather')
test_active.to_feather('../features/test_active/Agg_Price_mean_Golden_features_test_active.feather')

print('done')
train.nunique()

[['region', 'user_type', 'weekofday', 'param_123', 'price'], ['city', 'user_type', 'weekofday', 'param_1', 'price'], ['city', 'category_name', 'user_type', 'weekofday', 'price'], ['user_type', 'price'], ['parent_category_name', 'user_type', 'param_1', 'price'], ['parent_category_name', 'param_1', 'price'], ['region', 'parent_category_name', 'price'], ['category_name', 'price'], ['parent_category_name', 'price'], ['category_name', 'user_type', 'price'], ['city', 'user_type', 'weekofday', 'price'], ['parent_category_name', 'category_name', 'user_type', 'price'], ['parent_category_name', 'category_name', 'price'], ['param_123', 'price'], ['region', 'weekofday', 'price'], ['region', 'user_type', 'weekofday', 'price'], ['city', 'user_type', 'weekofday', 'param_123', 'price'], ['city', 'price'], ['parent_category_name', 'user_type', 'price'], ['city', 'user_type', 'price'], ['param_1', 'price'], ['parent_category_name', 'param_123', 'price'], ['region', 'price'], ['region', 'city', 'user_typ

In [3]:
target = 'price'
for i in range(1, 5):
    train = pd.read_feather('../features/train/categorical_features_train.feather')
    test = pd.read_feather('../features/test/categorical_features_test.feather')

    trainp = pd.read_csv('../input/train.csv', usecols=[target])
    testp = pd.read_csv('../input/test.csv', usecols=[target])
    trainp.fillna(trainp.mean(), inplace=True)
    testp.fillna(testp.mean(), inplace=True)
    train = pd.concat([train, trainp], axis=1)
    test = pd.concat([test, testp], axis=1)
    del trainp, testp; gc.collect()
    print(train.shape)
    print(test.shape)

    df = pd.concat([train, test])
    df.drop(nonaggfeats, axis=1, inplace=True)
    del train, test; gc.collect()
    print(df.shape)

    for comb in tqdm(list(itertools.combinations(aggfeats, i))):
        group_cols = list(comb)
        aggname = '-'.join(group_cols+[target]) + '-var'
        gp = df[group_cols+[target]].groupby(group_cols)[target].var().rename(aggname).to_frame().reset_index()
        df = df.merge(gp, on=group_cols, how='left')
        df[aggname] = df[aggname].fillna(df[aggname].mean())
        df[aggname] = df[aggname].astype('float32')
        gc.collect()
    df.drop(aggfeats+[target], axis=1, inplace=True)
    df = reduce_mem_usage(df)
    train = df[:lentrain]
    test = df[lentrain:]
    train.reset_index(drop=True, inplace=True)
    test.reset_index(drop=True, inplace=True)
    print(train.shape)
    print(test.shape)
    train.to_feather('../features/train/Agg_by_price_var_'+str(i)+'_train.feather')
    test.to_feather('../features/test/Agg_by_price_var_'+str(i)+'_test.feather')
    print('save done ',str(i))
    del train, test, df; gc.collect()



(1503424, 12)
(508438, 12)
(2011862, 9)


100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:03<00:00,  2.11it/s]


Memory usage of dataframe is 76.75 MB
Memory usage after optimization is: 76.75 MB
Decreased by 0.0%
(1503424, 8)
(508438, 8)
save done  1
(1503424, 12)
(508438, 12)
(2011862, 9)


100%|██████████████████████████████████████████████████████████████████████████████████| 28/28 [00:22<00:00,  1.26it/s]


Memory usage of dataframe is 230.24 MB
Memory usage after optimization is: 230.24 MB
Decreased by 0.0%
(1503424, 28)
(508438, 28)
save done  2
(1503424, 12)
(508438, 12)
(2011862, 9)


100%|██████████████████████████████████████████████████████████████████████████████████| 56/56 [01:04<00:00,  1.14s/it]


Memory usage of dataframe is 445.13 MB
Memory usage after optimization is: 445.13 MB
Decreased by 0.0%
(1503424, 56)
(508438, 56)
save done  3
(1503424, 12)
(508438, 12)
(2011862, 9)


100%|██████████████████████████████████████████████████████████████████████████████████| 70/70 [01:38<00:00,  1.41s/it]


Memory usage of dataframe is 552.57 MB
Memory usage after optimization is: 552.57 MB
Decreased by 0.0%
(1503424, 70)
(508438, 70)
save done  4


In [None]:
with open('./Golden_Price_var_agg.pickle', 'rb') as f:
    agg_list = pickle.load(f)
    
print(agg_list)
target = 'price'

train = pd.read_feather('../features/train/categorical_features_train.feather')
test = pd.read_feather('../features/test/categorical_features_test.feather')
trainp = pd.read_csv('../input/train.csv', usecols=[target])
testp = pd.read_csv('../input/test.csv', usecols=[target])
trainp.fillna(trainp.mean(), inplace=True)
testp.fillna(testp.mean(), inplace=True)
train = pd.concat([train, trainp], axis=1)
test = pd.concat([test, testp], axis=1)
del trainp, testp; gc.collect()

train_active = pd.read_feather('../features/train_active/categorical_features_train_active.feather')
test_active = pd.read_feather('../features/test_active/categorical_features_test_active.feather')
trainap = pd.read_csv('../input/train_active.csv', usecols=[target])
testap = pd.read_csv('../input/test_active.csv', usecols=[target])
trainap.fillna(trainap.mean(), inplace=True)
testap.fillna(testap.mean(), inplace=True)
train_active = pd.concat([train_active, trainap], axis=1)
test_active = pd.concat([test_active, testap], axis=1)
del trainap, testap; gc.collect()

print(train.shape)
print(test.shape)
print(train_active.shape)
print(test_active.shape)

df = pd.concat([train, test, train_active, test_active])
df.drop(nonaggfeats, axis=1, inplace=True)
print(df.shape)

for cols in tqdm(agg_list):
    group_cols = cols[:-1]
    assert target == cols[-1]
    print(group_cols)
    aggname = '-'.join(group_cols+[target]) + '-var'
    gp = df[group_cols+[target]].groupby(group_cols)[target].var().rename(aggname).to_frame().reset_index()
    df = df.merge(gp, on=group_cols, how='left')
    df[aggname] = df[aggname].fillna(df[aggname].mean())
    df[aggname] = df[aggname].astype('float32')
    del gp; gc.collect()

print(df.shape)
df.head()

df = reduce_mem_usage(df)
df.drop(aggfeats+['price'], axis=1, inplace=True)
train = df[:lentrain]
test = df[lentrain:lentrain+lentest]
train_active = df[lentrain+lentest: lentrain+lentest+lentrainactive]
test_active = df[lentrain+lentest+lentrainactive: lentrain+lentest+lentrainactive+lentestactive]

train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)
train_active.reset_index(drop=True, inplace=True)
test_active.reset_index(drop=True, inplace=True)

print(train.shape)
print(test.shape)
print(train_active.shape)
print(test_active.shape)

train.to_feather('../features/train/Agg_Price_var_Golden_features_train.feather')
test.to_feather('../features/test/Agg_Price_var_Golden_features_test.feather')
train_active.to_feather('../features/train_active/Agg_Price_var_Golden_features_train_active.feather')
test_active.to_feather('../features/test_active/Agg_Price_var_Golden_features_test_active.feather')

print('done')
train.nunique()

In [5]:
target = 'price'
for i in range(1, 5):
    train = pd.read_feather('../features/train/categorical_features_train.feather')
    test = pd.read_feather('../features/test/categorical_features_test.feather')

    trainp = pd.read_csv('../input/train.csv', usecols=[target])
    testp = pd.read_csv('../input/test.csv', usecols=[target])
    trainp.fillna(trainp.mean(), inplace=True)
    testp.fillna(testp.mean(), inplace=True)
    train = pd.concat([train, trainp], axis=1)
    test = pd.concat([test, testp], axis=1)
    del trainp, testp; gc.collect()
    print(train.shape)
    print(test.shape)

    df = pd.concat([train, test])
    df.drop(nonaggfeats, axis=1, inplace=True)
    del train, test; gc.collect()
    print(df.shape)

    for comb in tqdm(list(itertools.combinations(aggfeats, i))):
        group_cols = list(comb)
        aggname = '-'.join(group_cols+[target]) + '-median'
        gp = df[group_cols+[target]].groupby(group_cols)[target].median().rename(aggname).to_frame().reset_index()
        df = df.merge(gp, on=group_cols, how='left')
        df[aggname] = df[aggname].fillna(df[aggname].mean())
        df[aggname] = df[aggname].astype('float32')
        gc.collect()
    df.drop(aggfeats+[target], axis=1, inplace=True)
    df = reduce_mem_usage(df)
    train = df[:lentrain]
    test = df[lentrain:]
    train.reset_index(drop=True, inplace=True)
    test.reset_index(drop=True, inplace=True)
    print(train.shape)
    print(test.shape)
    train.to_feather('../features/train/Agg_by_price_median_'+str(i)+'_train.feather')
    test.to_feather('../features/test/Agg_by_price_median_'+str(i)+'_test.feather')
    print('save done ',str(i))
    del train, test, df; gc.collect()



(1503424, 12)
(508438, 12)
(2011862, 9)


100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:03<00:00,  2.00it/s]


Memory usage of dataframe is 76.75 MB
Memory usage after optimization is: 76.75 MB
Decreased by 0.0%
(1503424, 8)
(508438, 8)
save done  1
(1503424, 12)
(508438, 12)
(2011862, 9)


100%|██████████████████████████████████████████████████████████████████████████████████| 28/28 [00:22<00:00,  1.25it/s]


Memory usage of dataframe is 230.24 MB
Memory usage after optimization is: 230.24 MB
Decreased by 0.0%
(1503424, 28)
(508438, 28)
save done  2
(1503424, 12)
(508438, 12)
(2011862, 9)


100%|██████████████████████████████████████████████████████████████████████████████████| 56/56 [01:05<00:00,  1.17s/it]


Memory usage of dataframe is 445.13 MB
Memory usage after optimization is: 445.13 MB
Decreased by 0.0%
(1503424, 56)
(508438, 56)
save done  3
(1503424, 12)
(508438, 12)
(2011862, 9)


100%|██████████████████████████████████████████████████████████████████████████████████| 70/70 [01:41<00:00,  1.45s/it]


Memory usage of dataframe is 552.57 MB
Memory usage after optimization is: 552.57 MB
Decreased by 0.0%
(1503424, 70)
(508438, 70)
save done  4


In [None]:
with open('./Golden_Price_median_agg.pickle', 'rb') as f:
    agg_list = pickle.load(f)
    
print(agg_list)
target = 'price'

train = pd.read_feather('../features/train/categorical_features_train.feather')
test = pd.read_feather('../features/test/categorical_features_test.feather')
trainp = pd.read_csv('../input/train.csv', usecols=[target])
testp = pd.read_csv('../input/test.csv', usecols=[target])
trainp.fillna(trainp.mean(), inplace=True)
testp.fillna(testp.mean(), inplace=True)
train = pd.concat([train, trainp], axis=1)
test = pd.concat([test, testp], axis=1)
del trainp, testp; gc.collect()

train_active = pd.read_feather('../features/train_active/categorical_features_train_active.feather')
test_active = pd.read_feather('../features/test_active/categorical_features_test_active.feather')
trainap = pd.read_csv('../input/train_active.csv', usecols=[target])
testap = pd.read_csv('../input/test_active.csv', usecols=[target])
trainap.fillna(trainap.mean(), inplace=True)
testap.fillna(testap.mean(), inplace=True)
train_active = pd.concat([train_active, trainap], axis=1)
test_active = pd.concat([test_active, testap], axis=1)
del trainap, testap; gc.collect()

print(train.shape)
print(test.shape)
print(train_active.shape)
print(test_active.shape)

df = pd.concat([train, test, train_active, test_active])
df.drop(nonaggfeats, axis=1, inplace=True)
print(df.shape)

for cols in tqdm(agg_list):
    group_cols = cols[:-1]
    assert target == cols[-1]
    print(group_cols)
    aggname = '-'.join(group_cols+[target]) + '-median'
    gp = df[group_cols+[target]].groupby(group_cols)[target].median().rename(aggname).to_frame().reset_index()
    df = df.merge(gp, on=group_cols, how='left')
    df[aggname] = df[aggname].fillna(df[aggname].mean())
    df[aggname] = df[aggname].astype('float32')
    del gp; gc.collect()

print(df.shape)
df.head()

df = reduce_mem_usage(df)
df.drop(aggfeats+['price'], axis=1, inplace=True)
train = df[:lentrain]
test = df[lentrain:lentrain+lentest]
train_active = df[lentrain+lentest: lentrain+lentest+lentrainactive]
test_active = df[lentrain+lentest+lentrainactive: lentrain+lentest+lentrainactive+lentestactive]

train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)
train_active.reset_index(drop=True, inplace=True)
test_active.reset_index(drop=True, inplace=True)

print(train.shape)
print(test.shape)
print(train_active.shape)
print(test_active.shape)

train.to_feather('../features/train/Agg_Price_median_Golden_features_train.feather')
test.to_feather('../features/test/Agg_Price_median_Golden_features_test.feather')
train_active.to_feather('../features/train_active/Agg_Price_median_Golden_features_train_active.feather')
test_active.to_feather('../features/test_active/Agg_Price_median_Golden_features_test_active.feather')

print('done')
train.nunique()

In [5]:
target = 'image_top_1'
for i in range(1, 5):
    train = pd.read_feather('../features/train/categorical_features_train.feather')
    test = pd.read_feather('../features/test/categorical_features_test.feather')

    trainp = pd.read_csv('../input/train.csv', usecols=[target])
    testp = pd.read_csv('../input/test.csv', usecols=[target])
    trainp.fillna(0, inplace=True)
    testp.fillna(0, inplace=True)
    train = pd.concat([train, trainp], axis=1)
    test = pd.concat([test, testp], axis=1)
    del trainp, testp; gc.collect()
    print(train.shape)
    print(test.shape)

    df = pd.concat([train, test])
    df.drop(nonaggfeats, axis=1, inplace=True)
    del train, test; gc.collect()
    print(df.shape)

    for comb in tqdm(list(itertools.combinations(aggfeats, i))):
        group_cols = list(comb)
        aggname = '-'.join(group_cols+[target]) + '-mean'
        gp = df[group_cols+[target]].groupby(group_cols)[target].mean().rename(aggname).to_frame().reset_index()
        df = df.merge(gp, on=group_cols, how='left')
        df[aggname] = df[aggname].fillna(0)
        df[aggname] = df[aggname].astype('float32')
        gc.collect()
    df.drop(aggfeats+[target], axis=1, inplace=True)
    df = reduce_mem_usage(df)
    train = df[:lentrain]
    test = df[lentrain:]
    train.reset_index(drop=True, inplace=True)
    test.reset_index(drop=True, inplace=True)
    print(train.shape)
    print(test.shape)
    train.to_feather('../features/train/Agg_by_imagetop1_mean_'+str(i)+'_train.feather')
    test.to_feather('../features/test/Agg_by_imagetop1_mean_'+str(i)+'_test.feather')
    print('save done ',str(i))
    del train, test, df; gc.collect()


(1503424, 12)
(508438, 12)
(2011862, 9)


100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:03<00:00,  2.09it/s]


Memory usage of dataframe is 76.75 MB
Memory usage after optimization is: 76.75 MB
Decreased by 0.0%
(1503424, 8)
(508438, 8)
save done  1
(1503424, 12)
(508438, 12)
(2011862, 9)


100%|██████████████████████████████████████████████████████████████████████████████████| 28/28 [00:21<00:00,  1.30it/s]


Memory usage of dataframe is 230.24 MB
Memory usage after optimization is: 230.24 MB
Decreased by 0.0%
(1503424, 28)
(508438, 28)
save done  2
(1503424, 12)
(508438, 12)
(2011862, 9)


100%|██████████████████████████████████████████████████████████████████████████████████| 56/56 [01:04<00:00,  1.15s/it]


Memory usage of dataframe is 445.13 MB
Memory usage after optimization is: 445.13 MB
Decreased by 0.0%
(1503424, 56)
(508438, 56)
save done  3
(1503424, 12)
(508438, 12)
(2011862, 9)


100%|██████████████████████████████████████████████████████████████████████████████████| 70/70 [01:40<00:00,  1.44s/it]


Memory usage of dataframe is 552.57 MB
Memory usage after optimization is: 552.57 MB
Decreased by 0.0%
(1503424, 70)
(508438, 70)
save done  4


In [3]:
with open('./Golden_Imagetop1_mean_agg.pickle', 'rb') as f:
    agg_list = pickle.load(f)
    
print(agg_list)
target = 'image_top_1'

train = pd.read_feather('../features/train/categorical_features_train.feather')
test = pd.read_feather('../features/test/categorical_features_test.feather')
trainp = pd.read_csv('../input/train.csv', usecols=[target])
testp = pd.read_csv('../input/test.csv', usecols=[target])
trainp.fillna(-999, inplace=True)
testp.fillna(-999, inplace=True)
train = pd.concat([train, trainp], axis=1)
test = pd.concat([test, testp], axis=1)
del trainp, testp; gc.collect()

print(train.shape)
print(test.shape)

df = pd.concat([train, test])
df.drop(nonaggfeats, axis=1, inplace=True)
print(df.shape)

for cols in tqdm(agg_list):
    group_cols = cols[:-1]
    assert target == cols[-1]
    print(group_cols)
    aggname = '-'.join(group_cols+[target]) + '-mean'
    gp = df[group_cols+[target]].groupby(group_cols)[target].mean().rename(aggname).to_frame().reset_index()
    df = df.merge(gp, on=group_cols, how='left')
    df[aggname] = df[aggname].fillna(-999)
    df[aggname] = df[aggname].astype('float32')
    del gp; gc.collect()

print(df.shape)
df.head()

df = reduce_mem_usage(df)
df.drop(aggfeats+[target], axis=1, inplace=True)
train = df[:lentrain]
test = df[lentrain:lentrain+lentest]

train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

print(train.shape)
print(test.shape)

train.to_feather('../features/train/Agg_Imagetop1_mean_Golden_features_train.feather')
test.to_feather('../features/test/Agg_Imagetop1_mean_Golden_features_test.feather')

print('done')
train.nunique()

[['city', 'parent_category_name', 'user_type', 'weekofday', 'image_top_1'], ['parent_category_name', 'user_type', 'image_top_1'], ['region', 'weekofday', 'image_top_1'], ['region', 'user_type', 'weekofday', 'image_top_1'], ['city', 'user_type', 'weekofday', 'param_1', 'image_top_1'], ['parent_category_name', 'user_type', 'weekofday', 'image_top_1'], ['region', 'user_type', 'weekofday', 'param_123', 'image_top_1'], ['category_name', 'weekofday', 'image_top_1'], ['city', 'weekofday', 'image_top_1'], ['city', 'user_type', 'weekofday', 'image_top_1'], ['city', 'user_type', 'weekofday', 'param_123', 'image_top_1'], ['parent_category_name', 'category_name', 'image_top_1'], ['user_type', 'param_1', 'image_top_1'], ['category_name', 'image_top_1'], ['city', 'image_top_1'], ['user_type', 'param_123', 'image_top_1'], ['user_type', 'image_top_1'], ['city', 'user_type', 'image_top_1'], ['user_type', 'weekofday', 'image_top_1'], ['param_123', 'image_top_1'], ['region', 'parent_category_name', 'imag

  0%|                                                                                           | 0/26 [00:00<?, ?it/s]

['city', 'parent_category_name', 'user_type', 'weekofday']


  4%|███▏                                                                               | 1/26 [00:00<00:21,  1.15it/s]

['parent_category_name', 'user_type']


  8%|██████▍                                                                            | 2/26 [00:01<00:16,  1.48it/s]

['region', 'weekofday']


 12%|█████████▌                                                                         | 3/26 [00:02<00:15,  1.46it/s]

['region', 'user_type', 'weekofday']


 15%|████████████▊                                                                      | 4/26 [00:02<00:15,  1.43it/s]

['city', 'user_type', 'weekofday', 'param_1']


 19%|███████████████▉                                                                   | 5/26 [00:03<00:16,  1.27it/s]

['parent_category_name', 'user_type', 'weekofday']


 23%|███████████████████▏                                                               | 6/26 [00:04<00:15,  1.27it/s]

['region', 'user_type', 'weekofday', 'param_123']


 27%|██████████████████████▎                                                            | 7/26 [00:05<00:15,  1.19it/s]

['category_name', 'weekofday']


 31%|█████████████████████████▌                                                         | 8/26 [00:06<00:14,  1.20it/s]

['city', 'weekofday']


 35%|████████████████████████████▋                                                      | 9/26 [00:07<00:14,  1.20it/s]

['city', 'user_type', 'weekofday']


 38%|███████████████████████████████▌                                                  | 10/26 [00:08<00:13,  1.19it/s]

['city', 'user_type', 'weekofday', 'param_123']


 42%|██████████████████████████████████▋                                               | 11/26 [00:09<00:13,  1.11it/s]

['parent_category_name', 'category_name']


 46%|█████████████████████████████████████▊                                            | 12/26 [00:10<00:12,  1.13it/s]

['user_type', 'param_1']


 50%|█████████████████████████████████████████                                         | 13/26 [00:11<00:11,  1.15it/s]

['category_name']


 54%|████████████████████████████████████████████▏                                     | 14/26 [00:11<00:10,  1.18it/s]

['city']


 58%|███████████████████████████████████████████████▎                                  | 15/26 [00:12<00:09,  1.20it/s]

['user_type', 'param_123']


 62%|██████████████████████████████████████████████████▍                               | 16/26 [00:13<00:08,  1.20it/s]

['user_type']


 65%|█████████████████████████████████████████████████████▌                            | 17/26 [00:13<00:07,  1.22it/s]

['city', 'user_type']


 69%|████████████████████████████████████████████████████████▊                         | 18/26 [00:14<00:06,  1.22it/s]

['user_type', 'weekofday']


 73%|███████████████████████████████████████████████████████████▉                      | 19/26 [00:15<00:05,  1.22it/s]

['param_123']


 77%|███████████████████████████████████████████████████████████████                   | 20/26 [00:16<00:04,  1.22it/s]

['region', 'parent_category_name']


 81%|██████████████████████████████████████████████████████████████████▏               | 21/26 [00:17<00:04,  1.22it/s]

['category_name', 'param_1']


 85%|█████████████████████████████████████████████████████████████████████▍            | 22/26 [00:17<00:03,  1.22it/s]

['param_1']


 88%|████████████████████████████████████████████████████████████████████████▌         | 23/26 [00:18<00:02,  1.23it/s]

['parent_category_name']


 92%|███████████████████████████████████████████████████████████████████████████▋      | 24/26 [00:19<00:01,  1.23it/s]

['region', 'category_name']


 96%|██████████████████████████████████████████████████████████████████████████████▊   | 25/26 [00:20<00:00,  1.22it/s]

['parent_category_name', 'user_type', 'param_1']


100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [00:21<00:00,  1.22it/s]


(2011862, 35)
Memory usage of dataframe is 257.10 MB
Memory usage after optimization is: 249.43 MB
Decreased by 3.0%
(1503424, 26)
(508438, 26)
done


city-parent_category_name-user_type-weekofday-image_top_1-mean    31912
parent_category_name-user_type-image_top_1-mean                      27
region-weekofday-image_top_1-mean                                   196
region-user_type-weekofday-image_top_1-mean                         588
city-user_type-weekofday-param_1-image_top_1-mean                 55552
parent_category_name-user_type-weekofday-image_top_1-mean           189
region-user_type-weekofday-param_123-image_top_1-mean             70172
category_name-weekofday-image_top_1-mean                            329
city-weekofday-image_top_1-mean                                    8202
city-user_type-weekofday-image_top_1-mean                         12649
city-user_type-weekofday-param_123-image_top_1-mean               65897
parent_category_name-category_name-image_top_1-mean                  47
user_type-param_1-image_top_1-mean                                  977
category_name-image_top_1-mean                                  

In [8]:
target = 'image_top_1'
for i in range(1, 5):
    train = pd.read_feather('../features/train/categorical_features_train.feather')
    test = pd.read_feather('../features/test/categorical_features_test.feather')

    trainp = pd.read_csv('../input/train.csv', usecols=[target])
    testp = pd.read_csv('../input/test.csv', usecols=[target])
    trainp.fillna(-999, inplace=True)
    testp.fillna(-999, inplace=True)
    train = pd.concat([train, trainp], axis=1)
    test = pd.concat([test, testp], axis=1)
    del trainp, testp; gc.collect()
    print(train.shape)
    print(test.shape)

    df = pd.concat([train, test])
    df.drop(nonaggfeats, axis=1, inplace=True)
    del train, test; gc.collect()
    print(df.shape)

    for comb in tqdm(list(itertools.combinations(aggfeats, i))):
        group_cols = list(comb)
        aggname = '-'.join(group_cols+[target]) + '-var'
        gp = df[group_cols+[target]].groupby(group_cols)[target].var().rename(aggname).to_frame().reset_index()
        df = df.merge(gp, on=group_cols, how='left')
        df[aggname] = df[aggname].fillna(-999)
        df[aggname] = df[aggname].astype('float32')
        gc.collect()
    df.drop(aggfeats+[target], axis=1, inplace=True)
    df = reduce_mem_usage(df)
    train = df[:lentrain]
    test = df[lentrain:]
    train.reset_index(drop=True, inplace=True)
    test.reset_index(drop=True, inplace=True)
    print(train.shape)
    print(test.shape)
    train.to_feather('../features/train/Agg_by_imagetop1_var_'+str(i)+'_train.feather')
    test.to_feather('../features/test/Agg_by_imagetop1_var_'+str(i)+'_test.feather')
    print('save done ',str(i))
    del train, test, df; gc.collect()


(1503424, 12)
(508438, 12)
(2011862, 9)


100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:03<00:00,  2.21it/s]


Memory usage of dataframe is 76.75 MB
Memory usage after optimization is: 76.75 MB
Decreased by 0.0%
(1503424, 8)
(508438, 8)
save done  1
(1503424, 12)
(508438, 12)
(2011862, 9)


100%|██████████████████████████████████████████████████████████████████████████████████| 28/28 [00:21<00:00,  1.28it/s]


Memory usage of dataframe is 230.24 MB
Memory usage after optimization is: 230.24 MB
Decreased by 0.0%
(1503424, 28)
(508438, 28)
save done  2
(1503424, 12)
(508438, 12)
(2011862, 9)


100%|██████████████████████████████████████████████████████████████████████████████████| 56/56 [01:04<00:00,  1.15s/it]


Memory usage of dataframe is 445.13 MB
Memory usage after optimization is: 445.13 MB
Decreased by 0.0%
(1503424, 56)
(508438, 56)
save done  3
(1503424, 12)
(508438, 12)
(2011862, 9)


100%|██████████████████████████████████████████████████████████████████████████████████| 70/70 [01:39<00:00,  1.42s/it]


Memory usage of dataframe is 552.57 MB
Memory usage after optimization is: 552.57 MB
Decreased by 0.0%
(1503424, 70)
(508438, 70)
save done  4


In [4]:
with open('./Golden_Imagetop1_var_agg.pickle', 'rb') as f:
    agg_list = pickle.load(f)
    
print(agg_list)
target = 'image_top_1'

train = pd.read_feather('../features/train/categorical_features_train.feather')
test = pd.read_feather('../features/test/categorical_features_test.feather')
trainp = pd.read_csv('../input/train.csv', usecols=[target])
testp = pd.read_csv('../input/test.csv', usecols=[target])
trainp.fillna(0, inplace=True)
testp.fillna(0, inplace=True)
train = pd.concat([train, trainp], axis=1)
test = pd.concat([test, testp], axis=1)
del trainp, testp; gc.collect()

print(train.shape)
print(test.shape)

df = pd.concat([train, test])
df.drop(nonaggfeats, axis=1, inplace=True)
print(df.shape)

for cols in tqdm(agg_list):
    group_cols = cols[:-1]
    assert target == cols[-1]
    print(group_cols)
    aggname = '-'.join(group_cols+[target]) + '-var'
    gp = df[group_cols+[target]].groupby(group_cols)[target].var().rename(aggname).to_frame().reset_index()
    df = df.merge(gp, on=group_cols, how='left')
    df[aggname] = df[aggname].fillna(0)
    df[aggname] = df[aggname].astype('float32')
    del gp; gc.collect()

print(df.shape)
df.head()

df = reduce_mem_usage(df)
df.drop(aggfeats+[target], axis=1, inplace=True)
train = df[:lentrain]
test = df[lentrain:lentrain+lentest]

train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

print(train.shape)
print(test.shape)

train.to_feather('../features/train/Agg_Imagetop1_var_Golden_features_train.feather')
test.to_feather('../features/test/Agg_Imagetop1_var_Golden_features_test.feather')

print('done')
train.nunique()

[['param_123', 'image_top_1'], ['parent_category_name', 'image_top_1'], ['region', 'parent_category_name', 'image_top_1'], ['region', 'user_type', 'weekofday', 'image_top_1'], ['city', 'parent_category_name', 'user_type', 'image_top_1'], ['category_name', 'user_type', 'image_top_1'], ['parent_category_name', 'category_name', 'image_top_1'], ['region', 'user_type', 'image_top_1'], ['city', 'image_top_1'], ['user_type', 'image_top_1'], ['parent_category_name', 'param_1', 'image_top_1'], ['param_1', 'image_top_1'], ['category_name', 'weekofday', 'image_top_1'], ['user_type', 'weekofday', 'image_top_1'], ['city', 'user_type', 'image_top_1'], ['city', 'parent_category_name', 'user_type', 'weekofday', 'image_top_1'], ['category_name', 'param_1', 'image_top_1'], ['parent_category_name', 'category_name', 'user_type', 'image_top_1'], ['parent_category_name', 'param_123', 'image_top_1'], ['city', 'user_type', 'weekofday', 'image_top_1'], ['city', 'category_name', 'user_type', 'weekofday', 'image

  0%|                                                                                           | 0/26 [00:00<?, ?it/s]

['param_123']


  4%|███▏                                                                               | 1/26 [00:00<00:10,  2.37it/s]

['parent_category_name']


  8%|██████▍                                                                            | 2/26 [00:00<00:09,  2.44it/s]

['region', 'parent_category_name']


 12%|█████████▌                                                                         | 3/26 [00:01<00:10,  2.20it/s]

['region', 'user_type', 'weekofday']


 15%|████████████▊                                                                      | 4/26 [00:02<00:11,  1.84it/s]

['city', 'parent_category_name', 'user_type']


 19%|███████████████▉                                                                   | 5/26 [00:02<00:12,  1.72it/s]

['category_name', 'user_type']


 23%|███████████████████▏                                                               | 6/26 [00:03<00:11,  1.71it/s]

['parent_category_name', 'category_name']


 27%|██████████████████████▎                                                            | 7/26 [00:04<00:11,  1.71it/s]

['region', 'user_type']


 31%|█████████████████████████▌                                                         | 8/26 [00:04<00:10,  1.70it/s]

['city']


 35%|████████████████████████████▋                                                      | 9/26 [00:05<00:09,  1.72it/s]

['user_type']


 38%|███████████████████████████████▌                                                  | 10/26 [00:05<00:09,  1.73it/s]

['parent_category_name', 'param_1']


 42%|██████████████████████████████████▋                                               | 11/26 [00:06<00:08,  1.71it/s]

['param_1']


 46%|█████████████████████████████████████▊                                            | 12/26 [00:07<00:08,  1.71it/s]

['category_name', 'weekofday']


 50%|█████████████████████████████████████████                                         | 13/26 [00:07<00:07,  1.64it/s]

['user_type', 'weekofday']


 54%|████████████████████████████████████████████▏                                     | 14/26 [00:08<00:07,  1.60it/s]

['city', 'user_type']


 58%|███████████████████████████████████████████████▎                                  | 15/26 [00:09<00:06,  1.58it/s]

['city', 'parent_category_name', 'user_type', 'weekofday']


 62%|██████████████████████████████████████████████████▍                               | 16/26 [00:10<00:06,  1.51it/s]

['category_name', 'param_1']


 65%|█████████████████████████████████████████████████████▌                            | 17/26 [00:11<00:06,  1.49it/s]

['parent_category_name', 'category_name', 'user_type']


 69%|████████████████████████████████████████████████████████▊                         | 18/26 [00:12<00:05,  1.47it/s]

['parent_category_name', 'param_123']


 73%|███████████████████████████████████████████████████████████▉                      | 19/26 [00:13<00:04,  1.46it/s]

['city', 'user_type', 'weekofday']


 77%|███████████████████████████████████████████████████████████████                   | 20/26 [00:14<00:04,  1.41it/s]

['city', 'category_name', 'user_type', 'weekofday']


 81%|██████████████████████████████████████████████████████████████████▏               | 21/26 [00:15<00:03,  1.36it/s]

['region', 'city', 'user_type']


 85%|█████████████████████████████████████████████████████████████████████▍            | 22/26 [00:16<00:02,  1.34it/s]

['category_name']


 88%|████████████████████████████████████████████████████████████████████████▌         | 23/26 [00:17<00:02,  1.34it/s]

['region', 'weekofday']


 92%|███████████████████████████████████████████████████████████████████████████▋      | 24/26 [00:18<00:01,  1.32it/s]

['city', 'weekofday']


 96%|██████████████████████████████████████████████████████████████████████████████▊   | 25/26 [00:19<00:00,  1.30it/s]

['parent_category_name', 'user_type']


100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [00:20<00:00,  1.29it/s]


(2011862, 35)
Memory usage of dataframe is 257.10 MB
Memory usage after optimization is: 249.43 MB
Decreased by 3.0%
(1503424, 26)
(508438, 26)
done


param_123-image_top_1-var                                         1860
parent_category_name-image_top_1-var                                 9
region-parent_category_name-image_top_1-var                        252
region-user_type-weekofday-image_top_1-var                         588
city-parent_category_name-user_type-image_top_1-var              13130
category_name-user_type-image_top_1-var                            141
parent_category_name-category_name-image_top_1-var                  47
region-user_type-image_top_1-var                                    84
city-image_top_1-var                                              1699
user_type-image_top_1-var                                            3
parent_category_name-param_1-image_top_1-var                       381
param_1-image_top_1-var                                            367
category_name-weekofday-image_top_1-var                            329
user_type-weekofday-image_top_1-var                                 21
city-u

In [10]:
target = 'image_top_1'
for i in range(1, 5):
    train = pd.read_feather('../features/train/categorical_features_train.feather')
    test = pd.read_feather('../features/test/categorical_features_test.feather')

    trainp = pd.read_csv('../input/train.csv', usecols=[target])
    testp = pd.read_csv('../input/test.csv', usecols=[target])
    trainp.fillna(-999, inplace=True)
    testp.fillna(-999, inplace=True)
    train = pd.concat([train, trainp], axis=1)
    test = pd.concat([test, testp], axis=1)
    del trainp, testp; gc.collect()
    print(train.shape)
    print(test.shape)

    df = pd.concat([train, test])
    df.drop(nonaggfeats, axis=1, inplace=True)
    del train, test; gc.collect()
    print(df.shape)

    for comb in tqdm(list(itertools.combinations(aggfeats, i))):
        group_cols = list(comb)
        aggname = '-'.join(group_cols+[target]) + '-median'
        gp = df[group_cols+[target]].groupby(group_cols)[target].median().rename(aggname).to_frame().reset_index()
        df = df.merge(gp, on=group_cols, how='left')
        df[aggname] = df[aggname].fillna(-999)
        df[aggname] = df[aggname].astype('float32')
        gc.collect()
    df.drop(aggfeats+[target], axis=1, inplace=True)
    df = reduce_mem_usage(df)
    train = df[:lentrain]
    test = df[lentrain:]
    train.reset_index(drop=True, inplace=True)
    test.reset_index(drop=True, inplace=True)
    print(train.shape)
    print(test.shape)
    train.to_feather('../features/train/Agg_by_imagetop1_median_'+str(i)+'_train.feather')
    test.to_feather('../features/test/Agg_by_imagetop1_median_'+str(i)+'_test.feather')
    print('save done ',str(i))
    del train, test, df; gc.collect()


(1503424, 12)
(508438, 12)
(2011862, 9)


100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:03<00:00,  2.02it/s]


Memory usage of dataframe is 76.75 MB
Memory usage after optimization is: 76.75 MB
Decreased by 0.0%
(1503424, 8)
(508438, 8)
save done  1
(1503424, 12)
(508438, 12)
(2011862, 9)


100%|██████████████████████████████████████████████████████████████████████████████████| 28/28 [00:22<00:00,  1.22it/s]


Memory usage of dataframe is 230.24 MB
Memory usage after optimization is: 230.24 MB
Decreased by 0.0%
(1503424, 28)
(508438, 28)
save done  2
(1503424, 12)
(508438, 12)
(2011862, 9)


100%|██████████████████████████████████████████████████████████████████████████████████| 56/56 [01:06<00:00,  1.19s/it]


Memory usage of dataframe is 445.13 MB
Memory usage after optimization is: 445.13 MB
Decreased by 0.0%
(1503424, 56)
(508438, 56)
save done  3
(1503424, 12)
(508438, 12)
(2011862, 9)


100%|██████████████████████████████████████████████████████████████████████████████████| 70/70 [01:43<00:00,  1.48s/it]


Memory usage of dataframe is 552.57 MB
Memory usage after optimization is: 552.57 MB
Decreased by 0.0%
(1503424, 70)
(508438, 70)
save done  4


In [5]:
with open('./Golden_Imagetop1_median_agg.pickle', 'rb') as f:
    agg_list = pickle.load(f)
    
print(agg_list)
target = 'image_top_1'

train = pd.read_feather('../features/train/categorical_features_train.feather')
test = pd.read_feather('../features/test/categorical_features_test.feather')
trainp = pd.read_csv('../input/train.csv', usecols=[target])
testp = pd.read_csv('../input/test.csv', usecols=[target])
trainp.fillna(0, inplace=True)
testp.fillna(0, inplace=True)
train = pd.concat([train, trainp], axis=1)
test = pd.concat([test, testp], axis=1)
del trainp, testp; gc.collect()

print(train.shape)
print(test.shape)

df = pd.concat([train, test])
df.drop(nonaggfeats, axis=1, inplace=True)
print(df.shape)

for cols in tqdm(agg_list):
    group_cols = cols[:-1]
    assert target == cols[-1]
    print(group_cols)
    aggname = '-'.join(group_cols+[target]) + '-median'
    gp = df[group_cols+[target]].groupby(group_cols)[target].median().rename(aggname).to_frame().reset_index()
    df = df.merge(gp, on=group_cols, how='left')
    df[aggname] = df[aggname].fillna(0)
    df[aggname] = df[aggname].astype('float32')
    del gp; gc.collect()

print(df.shape)
df.head()

df = reduce_mem_usage(df)
df.drop(aggfeats+[target], axis=1, inplace=True)
train = df[:lentrain]
test = df[lentrain:lentrain+lentest]

train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

print(train.shape)
print(test.shape)

train.to_feather('../features/train/Agg_Imagetop1_median_Golden_features_train.feather')
test.to_feather('../features/test/Agg_Imagetop1_median_Golden_features_test.feather')

print('done')
train.nunique()

[['region', 'category_name', 'image_top_1'], ['parent_category_name', 'category_name', 'weekofday', 'image_top_1'], ['category_name', 'weekofday', 'image_top_1'], ['user_type', 'param_1', 'image_top_1'], ['param_1', 'image_top_1'], ['city', 'parent_category_name', 'user_type', 'image_top_1'], ['city', 'user_type', 'weekofday', 'image_top_1'], ['city', 'image_top_1'], ['city', 'weekofday', 'image_top_1'], ['user_type', 'image_top_1'], ['parent_category_name', 'weekofday', 'image_top_1'], ['region', 'city', 'user_type', 'image_top_1'], ['parent_category_name', 'image_top_1'], ['param_123', 'image_top_1'], ['region', 'image_top_1'], ['city', 'user_type', 'image_top_1'], ['region', 'user_type', 'weekofday', 'image_top_1'], ['region', 'user_type', 'image_top_1'], ['city', 'user_type', 'weekofday', 'param_123', 'image_top_1'], ['region', 'city', 'weekofday', 'image_top_1'], ['region', 'parent_category_name', 'image_top_1'], ['category_name', 'image_top_1'], ['city', 'parent_category_name', '

  0%|                                                                                           | 0/28 [00:00<?, ?it/s]

['region', 'category_name']


  4%|██▉                                                                                | 1/28 [00:00<00:13,  1.96it/s]

['parent_category_name', 'category_name', 'weekofday']


  7%|█████▉                                                                             | 2/28 [00:01<00:16,  1.58it/s]

['category_name', 'weekofday']


 11%|████████▉                                                                          | 3/28 [00:01<00:16,  1.52it/s]

['user_type', 'param_1']


 14%|███████████▊                                                                       | 4/28 [00:02<00:15,  1.55it/s]

['param_1']


 18%|██████████████▊                                                                    | 5/28 [00:03<00:14,  1.63it/s]

['city', 'parent_category_name', 'user_type']


 21%|█████████████████▊                                                                 | 6/28 [00:03<00:14,  1.57it/s]

['city', 'user_type', 'weekofday']


 25%|████████████████████▊                                                              | 7/28 [00:04<00:14,  1.48it/s]

['city']


 29%|███████████████████████▋                                                           | 8/28 [00:05<00:13,  1.51it/s]

['city', 'weekofday']


 32%|██████████████████████████▋                                                        | 9/28 [00:06<00:13,  1.46it/s]

['user_type']


 36%|█████████████████████████████▎                                                    | 10/28 [00:06<00:12,  1.50it/s]

['parent_category_name', 'weekofday']


 39%|████████████████████████████████▏                                                 | 11/28 [00:07<00:11,  1.46it/s]

['region', 'city', 'user_type']


 43%|███████████████████████████████████▏                                              | 12/28 [00:08<00:11,  1.44it/s]

['parent_category_name']


 46%|██████████████████████████████████████                                            | 13/28 [00:08<00:10,  1.46it/s]

['param_123']


 50%|█████████████████████████████████████████                                         | 14/28 [00:09<00:09,  1.46it/s]

['region']


 54%|███████████████████████████████████████████▉                                      | 15/28 [00:10<00:08,  1.47it/s]

['city', 'user_type']


 57%|██████████████████████████████████████████████▊                                   | 16/28 [00:11<00:08,  1.45it/s]

['region', 'user_type', 'weekofday']


 61%|█████████████████████████████████████████████████▊                                | 17/28 [00:12<00:07,  1.41it/s]

['region', 'user_type']


 64%|████████████████████████████████████████████████████▋                             | 18/28 [00:12<00:07,  1.40it/s]

['city', 'user_type', 'weekofday', 'param_123']


 68%|███████████████████████████████████████████████████████▋                          | 19/28 [00:14<00:06,  1.30it/s]

['region', 'city', 'weekofday']


 71%|██████████████████████████████████████████████████████████▌                       | 20/28 [00:15<00:06,  1.27it/s]

['region', 'parent_category_name']


 75%|█████████████████████████████████████████████████████████████▌                    | 21/28 [00:16<00:05,  1.27it/s]

['category_name']


 79%|████████████████████████████████████████████████████████████████▍                 | 22/28 [00:17<00:04,  1.28it/s]

['city', 'parent_category_name', 'user_type', 'weekofday']


 82%|███████████████████████████████████████████████████████████████████▎              | 23/28 [00:18<00:04,  1.24it/s]

['region', 'city', 'user_type', 'weekofday']


 86%|██████████████████████████████████████████████████████████████████████▎           | 24/28 [00:19<00:03,  1.21it/s]

['region', 'weekofday']


 89%|█████████████████████████████████████████████████████████████████████████▏        | 25/28 [00:20<00:02,  1.20it/s]

['parent_category_name', 'category_name']


 93%|████████████████████████████████████████████████████████████████████████████▏     | 26/28 [00:21<00:01,  1.19it/s]

['user_type', 'weekofday']


 96%|███████████████████████████████████████████████████████████████████████████████   | 27/28 [00:22<00:00,  1.18it/s]

['parent_category_name', 'user_type', 'param_1']


100%|██████████████████████████████████████████████████████████████████████████████████| 28/28 [00:23<00:00,  1.17it/s]


(2011862, 37)
Memory usage of dataframe is 272.45 MB
Memory usage after optimization is: 264.78 MB
Decreased by 2.8%
(1503424, 28)
(508438, 28)
done


region-category_name-image_top_1-median                              422
parent_category_name-category_name-weekofday-image_top_1-median      128
category_name-weekofday-image_top_1-median                           128
user_type-param_1-image_top_1-median                                 534
param_1-image_top_1-median                                           330
city-parent_category_name-user_type-image_top_1-median              3565
city-user_type-weekofday-image_top_1-median                         3507
city-image_top_1-median                                              801
city-weekofday-image_top_1-median                                   2575
user_type-image_top_1-median                                           3
parent_category_name-weekofday-image_top_1-median                     36
region-city-user_type-image_top_1-median                            1491
parent_category_name-image_top_1-median                                9
param_123-image_top_1-median                       

In [12]:
target = 'item_seq_number'
for i in range(1, 5):
    train = pd.read_feather('../features/train/categorical_features_train.feather')
    test = pd.read_feather('../features/test/categorical_features_test.feather')

    trainp = pd.read_csv('../input/train.csv', usecols=[target])
    testp = pd.read_csv('../input/test.csv', usecols=[target])
    trainp.fillna(trainp.mean(), inplace=True)
    testp.fillna(testp.mean(), inplace=True)
    train = pd.concat([train, trainp], axis=1)
    test = pd.concat([test, testp], axis=1)
    del trainp, testp; gc.collect()
    print(train.shape)
    print(test.shape)

    df = pd.concat([train, test])
    df.drop(nonaggfeats, axis=1, inplace=True)
    del train, test; gc.collect()
    print(df.shape)

    for comb in tqdm(list(itertools.combinations(aggfeats, i))):
        group_cols = list(comb)
        aggname = '-'.join(group_cols+[target]) + '-mean'
        gp = df[group_cols+[target]].groupby(group_cols)[target].mean().rename(aggname).to_frame().reset_index()
        df = df.merge(gp, on=group_cols, how='left')
        df[aggname] = df[aggname].fillna(df[aggname].mean())
        df[aggname] = df[aggname].astype('float32')
        gc.collect()
    df.drop(aggfeats+[target], axis=1, inplace=True)
    df = reduce_mem_usage(df)
    train = df[:lentrain]
    test = df[lentrain:]
    train.reset_index(drop=True, inplace=True)
    test.reset_index(drop=True, inplace=True)
    print(train.shape)
    print(test.shape)
    train.to_feather('../features/train/Agg_by_Itemseq_mean_'+str(i)+'_train.feather')
    test.to_feather('../features/test/Agg_by_Itemseq_mean_'+str(i)+'_test.feather')
    print('save done ',str(i))
    del train, test, df; gc.collect()
    

(1503424, 12)
(508438, 12)
(2011862, 9)


100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:03<00:00,  2.01it/s]


Memory usage of dataframe is 76.75 MB
Memory usage after optimization is: 76.75 MB
Decreased by 0.0%
(1503424, 8)
(508438, 8)
save done  1
(1503424, 12)
(508438, 12)
(2011862, 9)


100%|██████████████████████████████████████████████████████████████████████████████████| 28/28 [00:22<00:00,  1.26it/s]


Memory usage of dataframe is 230.24 MB
Memory usage after optimization is: 230.24 MB
Decreased by 0.0%
(1503424, 28)
(508438, 28)
save done  2
(1503424, 12)
(508438, 12)
(2011862, 9)


100%|██████████████████████████████████████████████████████████████████████████████████| 56/56 [01:04<00:00,  1.15s/it]


Memory usage of dataframe is 445.13 MB
Memory usage after optimization is: 445.13 MB
Decreased by 0.0%
(1503424, 56)
(508438, 56)
save done  3
(1503424, 12)
(508438, 12)
(2011862, 9)


100%|██████████████████████████████████████████████████████████████████████████████████| 70/70 [01:38<00:00,  1.41s/it]


Memory usage of dataframe is 552.57 MB
Memory usage after optimization is: 552.57 MB
Decreased by 0.0%
(1503424, 70)
(508438, 70)
save done  4


In [None]:
with open('./Golden_Itemseq_mean_agg.pickle', 'rb') as f:
    agg_list = pickle.load(f)
    
print(agg_list)
target = 'item_seq_number'

train = pd.read_feather('../features/train/categorical_features_train.feather')
test = pd.read_feather('../features/test/categorical_features_test.feather')
trainp = pd.read_csv('../input/train.csv', usecols=[target])
testp = pd.read_csv('../input/test.csv', usecols=[target])
trainp.fillna(trainp.mean(), inplace=True)
testp.fillna(testp.mean(), inplace=True)
train = pd.concat([train, trainp], axis=1)
test = pd.concat([test, testp], axis=1)
del trainp, testp; gc.collect()

train_active = pd.read_feather('../features/train_active/categorical_features_train_active.feather')
test_active = pd.read_feather('../features/test_active/categorical_features_test_active.feather')
trainap = pd.read_csv('../input/train_active.csv', usecols=[target])
testap = pd.read_csv('../input/test_active.csv', usecols=[target])
trainap.fillna(trainap.mean(), inplace=True)
testap.fillna(testap.mean(), inplace=True)
train_active = pd.concat([train_active, trainap], axis=1)
test_active = pd.concat([test_active, testap], axis=1)
del trainap, testap; gc.collect()

print(train.shape)
print(test.shape)
print(train_active.shape)
print(test_active.shape)

df = pd.concat([train, test, train_active, test_active])
df.drop(nonaggfeats, axis=1, inplace=True)
print(df.shape)

for cols in tqdm(agg_list):
    group_cols = cols[:-1]
    assert target == cols[-1]
    print(group_cols)
    aggname = '-'.join(group_cols+[target]) + '-mean'
    gp = df[group_cols+[target]].groupby(group_cols)[target].mean().rename(aggname).to_frame().reset_index()
    df = df.merge(gp, on=group_cols, how='left')
    df[aggname] = df[aggname].fillna(df[aggname].mean())
    df[aggname] = df[aggname].astype('float32')
    del gp; gc.collect()

print(df.shape)
df.head()

df = reduce_mem_usage(df)
df.drop(aggfeats+[target], axis=1, inplace=True)
train = df[:lentrain]
test = df[lentrain:lentrain+lentest]
train_active = df[lentrain+lentest: lentrain+lentest+lentrainactive]
test_active = df[lentrain+lentest+lentrainactive: lentrain+lentest+lentrainactive+lentestactive]

train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)
train_active.reset_index(drop=True, inplace=True)
test_active.reset_index(drop=True, inplace=True)

print(train.shape)
print(test.shape)
print(train_active.shape)
print(test_active.shape)

train.to_feather('../features/train/Agg_Itemseq_mean_Golden_features_train.feather')
test.to_feather('../features/test/Agg_Itemseq_mean_Golden_features_test.feather')
train_active.to_feather('../features/train_active/Agg_Itemseq_mean_Golden_features_train_active.feather')
test_active.to_feather('../features/test_active/Agg_Itemseq_mean_Golden_features_test_active.feather')

print('done')
train.nunique()

In [14]:
target = 'item_seq_number'
for i in range(1, 5):
    train = pd.read_feather('../features/train/categorical_features_train.feather')
    test = pd.read_feather('../features/test/categorical_features_test.feather')

    trainp = pd.read_csv('../input/train.csv', usecols=[target])
    testp = pd.read_csv('../input/test.csv', usecols=[target])
    trainp.fillna(trainp.mean(), inplace=True)
    testp.fillna(testp.mean(), inplace=True)
    train = pd.concat([train, trainp], axis=1)
    test = pd.concat([test, testp], axis=1)
    del trainp, testp; gc.collect()
    print(train.shape)
    print(test.shape)

    df = pd.concat([train, test])
    df.drop(nonaggfeats, axis=1, inplace=True)
    del train, test; gc.collect()
    print(df.shape)

    for comb in tqdm(list(itertools.combinations(aggfeats, i))):
        group_cols = list(comb)
        aggname = '-'.join(group_cols+[target]) + '-var'
        gp = df[group_cols+[target]].groupby(group_cols)[target].var().rename(aggname).to_frame().reset_index()
        df = df.merge(gp, on=group_cols, how='left')
        df[aggname] = df[aggname].fillna(df[aggname].mean())
        df[aggname] = df[aggname].astype('float32')
        gc.collect()
    df.drop(aggfeats+[target], axis=1, inplace=True)
    df = reduce_mem_usage(df)
    train = df[:lentrain]
    test = df[lentrain:]
    train.reset_index(drop=True, inplace=True)
    test.reset_index(drop=True, inplace=True)
    print(train.shape)
    print(test.shape)
    train.to_feather('../features/train/Agg_by_Itemseq_var_'+str(i)+'_train.feather')
    test.to_feather('../features/test/Agg_by_Itemseq_var_'+str(i)+'_test.feather')
    print('save done ',str(i))
    del train, test, df; gc.collect()
    

(1503424, 12)
(508438, 12)
(2011862, 9)


100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:03<00:00,  2.09it/s]


Memory usage of dataframe is 76.75 MB
Memory usage after optimization is: 76.75 MB
Decreased by 0.0%
(1503424, 8)
(508438, 8)
save done  1
(1503424, 12)
(508438, 12)
(2011862, 9)


100%|██████████████████████████████████████████████████████████████████████████████████| 28/28 [00:21<00:00,  1.29it/s]


Memory usage of dataframe is 230.24 MB
Memory usage after optimization is: 230.24 MB
Decreased by 0.0%
(1503424, 28)
(508438, 28)
save done  2
(1503424, 12)
(508438, 12)
(2011862, 9)


100%|██████████████████████████████████████████████████████████████████████████████████| 56/56 [01:04<00:00,  1.14s/it]


Memory usage of dataframe is 445.13 MB
Memory usage after optimization is: 445.13 MB
Decreased by 0.0%
(1503424, 56)
(508438, 56)
save done  3
(1503424, 12)
(508438, 12)
(2011862, 9)


100%|██████████████████████████████████████████████████████████████████████████████████| 70/70 [01:39<00:00,  1.42s/it]


Memory usage of dataframe is 552.57 MB
Memory usage after optimization is: 552.57 MB
Decreased by 0.0%
(1503424, 70)
(508438, 70)
save done  4


In [None]:
with open('./Golden_Itemseq_var_agg.pickle', 'rb') as f:
    agg_list = pickle.load(f)
    
print(agg_list)
target = 'item_seq_number'

train = pd.read_feather('../features/train/categorical_features_train.feather')
test = pd.read_feather('../features/test/categorical_features_test.feather')
trainp = pd.read_csv('../input/train.csv', usecols=[target])
testp = pd.read_csv('../input/test.csv', usecols=[target])
trainp.fillna(trainp.mean(), inplace=True)
testp.fillna(testp.mean(), inplace=True)
train = pd.concat([train, trainp], axis=1)
test = pd.concat([test, testp], axis=1)
del trainp, testp; gc.collect()

train_active = pd.read_feather('../features/train_active/categorical_features_train_active.feather')
test_active = pd.read_feather('../features/test_active/categorical_features_test_active.feather')
trainap = pd.read_csv('../input/train_active.csv', usecols=[target])
testap = pd.read_csv('../input/test_active.csv', usecols=[target])
trainap.fillna(trainap.mean(), inplace=True)
testap.fillna(testap.mean(), inplace=True)
train_active = pd.concat([train_active, trainap], axis=1)
test_active = pd.concat([test_active, testap], axis=1)
del trainap, testap; gc.collect()

print(train.shape)
print(test.shape)
print(train_active.shape)
print(test_active.shape)

df = pd.concat([train, test, train_active, test_active])
df.drop(nonaggfeats, axis=1, inplace=True)
print(df.shape)

for cols in tqdm(agg_list):
    group_cols = cols[:-1]
    assert target == cols[-1]
    print(group_cols)
    aggname = '-'.join(group_cols+[target]) + '-var'
    gp = df[group_cols+[target]].groupby(group_cols)[target].var().rename(aggname).to_frame().reset_index()
    df = df.merge(gp, on=group_cols, how='left')
    df[aggname] = df[aggname].fillna(df[aggname].mean())
    df[aggname] = df[aggname].astype('float32')
    del gp; gc.collect()

print(df.shape)
df.head()

df = reduce_mem_usage(df)
df.drop(aggfeats+[target], axis=1, inplace=True)
train = df[:lentrain]
test = df[lentrain:lentrain+lentest]
train_active = df[lentrain+lentest: lentrain+lentest+lentrainactive]
test_active = df[lentrain+lentest+lentrainactive: lentrain+lentest+lentrainactive+lentestactive]

train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)
train_active.reset_index(drop=True, inplace=True)
test_active.reset_index(drop=True, inplace=True)

print(train.shape)
print(test.shape)
print(train_active.shape)
print(test_active.shape)

train.to_feather('../features/train/Agg_Itemseq_var_Golden_features_train.feather')
test.to_feather('../features/test/Agg_Itemseq_var_Golden_features_test.feather')
train_active.to_feather('../features/train_active/Agg_Itemseq_var_Golden_features_train_active.feather')
test_active.to_feather('../features/test_active/Agg_Itemseq_var_Golden_features_test_active.feather')

print('done')
train.nunique()

In [16]:
target = 'item_seq_number'
for i in range(1, 5):
    train = pd.read_feather('../features/train/categorical_features_train.feather')
    test = pd.read_feather('../features/test/categorical_features_test.feather')

    trainp = pd.read_csv('../input/train.csv', usecols=[target])
    testp = pd.read_csv('../input/test.csv', usecols=[target])
    trainp.fillna(trainp.mean(), inplace=True)
    testp.fillna(testp.mean(), inplace=True)
    train = pd.concat([train, trainp], axis=1)
    test = pd.concat([test, testp], axis=1)
    del trainp, testp; gc.collect()
    print(train.shape)
    print(test.shape)

    df = pd.concat([train, test])
    df.drop(nonaggfeats, axis=1, inplace=True)
    del train, test; gc.collect()
    print(df.shape)

    for comb in tqdm(list(itertools.combinations(aggfeats, i))):
        group_cols = list(comb)
        aggname = '-'.join(group_cols+[target]) + '-median'
        gp = df[group_cols+[target]].groupby(group_cols)[target].median().rename(aggname).to_frame().reset_index()
        df = df.merge(gp, on=group_cols, how='left')
        df[aggname] = df[aggname].fillna(df[aggname].mean())
        df[aggname] = df[aggname].astype('float32')
        gc.collect()
    df.drop(aggfeats+[target], axis=1, inplace=True)
    df = reduce_mem_usage(df)
    train = df[:lentrain]
    test = df[lentrain:]
    train.reset_index(drop=True, inplace=True)
    test.reset_index(drop=True, inplace=True)
    print(train.shape)
    print(test.shape)
    train.to_feather('../features/train/Agg_by_Itemseq_median_'+str(i)+'_train.feather')
    test.to_feather('../features/test/Agg_by_Itemseq_median_'+str(i)+'_test.feather')
    print('save done ',str(i))
    del train, test, df; gc.collect()
    

(1503424, 12)
(508438, 12)
(2011862, 9)


100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:04<00:00,  1.84it/s]


Memory usage of dataframe is 76.75 MB
Memory usage after optimization is: 76.75 MB
Decreased by 0.0%
(1503424, 8)
(508438, 8)
save done  1
(1503424, 12)
(508438, 12)
(2011862, 9)


100%|██████████████████████████████████████████████████████████████████████████████████| 28/28 [00:23<00:00,  1.17it/s]


Memory usage of dataframe is 230.24 MB
Memory usage after optimization is: 230.24 MB
Decreased by 0.0%
(1503424, 28)
(508438, 28)
save done  2
(1503424, 12)
(508438, 12)
(2011862, 9)


100%|██████████████████████████████████████████████████████████████████████████████████| 56/56 [01:05<00:00,  1.17s/it]


Memory usage of dataframe is 445.13 MB
Memory usage after optimization is: 445.13 MB
Decreased by 0.0%
(1503424, 56)
(508438, 56)
save done  3
(1503424, 12)
(508438, 12)
(2011862, 9)


100%|██████████████████████████████████████████████████████████████████████████████████| 70/70 [01:40<00:00,  1.43s/it]


Memory usage of dataframe is 552.57 MB
Memory usage after optimization is: 552.57 MB
Decreased by 0.0%
(1503424, 70)
(508438, 70)
save done  4


In [None]:
with open('./Golden_Itemseq_median_agg.pickle', 'rb') as f:
    agg_list = pickle.load(f)
    
print(agg_list)
target = 'item_seq_number'

train = pd.read_feather('../features/train/categorical_features_train.feather')
test = pd.read_feather('../features/test/categorical_features_test.feather')
trainp = pd.read_csv('../input/train.csv', usecols=[target])
testp = pd.read_csv('../input/test.csv', usecols=[target])
trainp.fillna(trainp.mean(), inplace=True)
testp.fillna(testp.mean(), inplace=True)
train = pd.concat([train, trainp], axis=1)
test = pd.concat([test, testp], axis=1)
del trainp, testp; gc.collect()

train_active = pd.read_feather('../features/train_active/categorical_features_train_active.feather')
test_active = pd.read_feather('../features/test_active/categorical_features_test_active.feather')
trainap = pd.read_csv('../input/train_active.csv', usecols=[target])
testap = pd.read_csv('../input/test_active.csv', usecols=[target])
trainap.fillna(trainap.mean(), inplace=True)
testap.fillna(testap.mean(), inplace=True)
train_active = pd.concat([train_active, trainap], axis=1)
test_active = pd.concat([test_active, testap], axis=1)
del trainap, testap; gc.collect()

print(train.shape)
print(test.shape)
print(train_active.shape)
print(test_active.shape)

df = pd.concat([train, test, train_active, test_active])
df.drop(nonaggfeats, axis=1, inplace=True)
print(df.shape)

for cols in tqdm(agg_list):
    group_cols = cols[:-1]
    assert target == cols[-1]
    print(group_cols)
    aggname = '-'.join(group_cols+[target]) + '-median'
    gp = df[group_cols+[target]].groupby(group_cols)[target].median().rename(aggname).to_frame().reset_index()
    df = df.merge(gp, on=group_cols, how='left')
    df[aggname] = df[aggname].fillna(df[aggname].mean())
    df[aggname] = df[aggname].astype('float32')
    del gp; gc.collect()

print(df.shape)
df.head()

df = reduce_mem_usage(df)
df.drop(aggfeats+[target], axis=1, inplace=True)
train = df[:lentrain]
test = df[lentrain:lentrain+lentest]
train_active = df[lentrain+lentest: lentrain+lentest+lentrainactive]
test_active = df[lentrain+lentest+lentrainactive: lentrain+lentest+lentrainactive+lentestactive]

train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)
train_active.reset_index(drop=True, inplace=True)
test_active.reset_index(drop=True, inplace=True)

print(train.shape)
print(test.shape)
print(train_active.shape)
print(test_active.shape)

train.to_feather('../features/train/Agg_Itemseq_median_Golden_features_train.feather')
test.to_feather('../features/test/Agg_Itemseq_median_Golden_features_test.feather')
train_active.to_feather('../features/train_active/Agg_Itemseq_median_Golden_features_train_active.feather')
test_active.to_feather('../features/test_active/Agg_Itemseq_median_Golden_features_test_active.feather')

print('done')
train.nunique()

In [11]:
# User ID 
used_cols = ['item_id', 'user_id', 'parent_category_name', 'category_name', 'region', 'city', 'param_1', 'price']

train = pd.read_csv('../input/train.csv', usecols=used_cols)
train_active = pd.read_csv('../input/train_active.csv', usecols=used_cols)
test = pd.read_csv('../input/test.csv', usecols=used_cols)
test_active = pd.read_csv('../input/test_active.csv', usecols=used_cols)

train_periods = pd.read_csv('../input/periods_train.csv', parse_dates=['activation_date', 'date_from', 'date_to'])
test_periods = pd.read_csv('../input/periods_test.csv', parse_dates=['activation_date', 'date_from', 'date_to'])

print(train.shape)
print(test.shape)
print(train_active.shape)
print(test_active.shape)
print(train_periods.shape)
print(test_periods.shape)

train.head()

(1503424, 8)
(508438, 8)
(14129821, 8)
(12824068, 8)
(16687412, 4)
(13724922, 4)


Unnamed: 0,item_id,user_id,region,city,parent_category_name,category_name,param_1,price
0,b912c3c6a6ad,e00f8ff2eaf9,Свердловская область,Екатеринбург,Личные вещи,Товары для детей и игрушки,Постельные принадлежности,400.0
1,2dac0150717d,39aeb48f0017,Самарская область,Самара,Для дома и дачи,Мебель и интерьер,Другое,3000.0
2,ba83aefab5dc,91e2f88dd6e3,Ростовская область,Ростов-на-Дону,Бытовая электроника,Аудио и видео,"Видео, DVD и Blu-ray плееры",4000.0
3,02996f1dd2ea,bf5cccea572d,Татарстан,Набережные Челны,Личные вещи,Товары для детей и игрушки,Автомобильные кресла,2200.0
4,7c90be56d2ab,ef50846afc0b,Волгоградская область,Волгоград,Транспорт,Автомобили,С пробегом,40000.0


In [12]:
all_samples = pd.concat([
    train,
    train_active,
    test,
    test_active
]).reset_index(drop=True)
all_samples.drop_duplicates(['item_id'], inplace=True)

del train_active, test_active; gc.collect()
print(all_samples.shape)

all_periods = pd.concat([
    train_periods,
    test_periods
])

del train_periods, test_periods; gc.collect()

print(all_periods.shape)

(19465016, 8)
(30412334, 4)


In [17]:
all_periods['days_up'] = all_periods['date_to'].dt.dayofyear - all_periods['date_from'].dt.dayofyear
all_periods['days_headdiff'] = all_periods['date_from'].dt.dayofyear - all_periods['activation_date'].dt.dayofyear
#all_periods['days_taildiff'] = all_periods['date_to'].dt.dayofyear - all_periods['activation_date'].dt.dayofyear

gp = all_periods.groupby(['item_id'])[['days_up']]

gp_df = pd.DataFrame()
gp_df['days_up_sum'] = gp.sum()['days_up']
gp_df['times_put_up'] = gp.count()['days_up']
gp_df.reset_index(inplace=True)
gp_df.rename(index=str, columns={'index': 'item_id'})

print(gp_df.shape)
gp_df.head()

(17453073, 3)


Unnamed: 0,item_id,days_up_sum,times_put_up
0,00000077ff21,13,1
1,000002c54018,6,1
2,000005570503,1,1
3,0000060018e6,6,1
4,000006497719,19,2


In [18]:
gp_1 = all_periods.groupby(['item_id'])[['days_headdiff']]

gp_df_1 = pd.DataFrame()
gp_df_1['days_headdiff_sum'] = gp_1.sum()['days_headdiff']
gp_df_1.reset_index(inplace=True)
gp_df_1.rename(index=str, columns={'index': 'item_id'})

print(gp_df_1.shape)
gp_df_1.head()

(17453073, 2)


Unnamed: 0,item_id,days_headdiff_sum
0,00000077ff21,9.0
1,000002c54018,12.0
2,000005570503,0.0
3,0000060018e6,6.0
4,000006497719,18.0


In [19]:
del gp, gp_1; gc.collect()

all_periods.drop_duplicates(['item_id'], inplace=True)
all_periods = all_periods.merge(gp_df, on='item_id', how='left')
del gp_df; gc.collect()
all_periods = all_periods.merge(gp_df_1, on='item_id', how='left')
del gp_df_1; gc.collect()
#all_periods = all_periods.merge(gp_df_2, on='item_id', how='left')
#del gp_df_2; gc.collect()

print(all_periods.shape)
all_periods.head()

(17453073, 10)


Unnamed: 0,item_id,activation_date,date_from,date_to,days_up,days_headdiff,days_taildiff,days_up_sum,times_put_up,days_headdiff_sum
0,8f5caef7afb0,2017-02-14,2017-03-15,2017-03-16,1,29.0,30.0,17,4,55.0
1,66218ff526d1,2017-02-16,2017-03-15,2017-03-18,3,27.0,30.0,18,3,51.0
2,b237d9539b21,2017-03-01,2017-03-15,2017-03-28,13,14.0,27.0,19,2,26.0
3,80bf58082ad3,2017-03-19,2017-03-19,2017-03-28,9,0.0,9.0,17,4,51.0
4,67a9944a7373,2017-03-14,2017-03-15,2017-03-28,13,1.0,14.0,18,3,30.0


In [22]:
all_periods = all_periods.merge(all_samples, on='item_id', how='left')
print(all_periods.shape)
all_periods.head()

(17453073, 16)


Unnamed: 0,item_id,activation_date,date_from,date_to,days_up,days_headdiff,days_up_sum,times_put_up,days_headdiff_sum,user_id,region,city,parent_category_name,category_name,param_1,price
0,8f5caef7afb0,2017-02-14,2017-03-15,2017-03-16,1,29.0,17,4,55.0,e292cce69842,Нижегородская область,Нижний Новгород,Услуги,Предложение услуг,"Транспорт, перевозки",0.0
1,66218ff526d1,2017-02-16,2017-03-15,2017-03-18,3,27.0,18,3,51.0,a326c04a24ec,Новосибирская область,Новосибирск,Личные вещи,Детская одежда и обувь,Для мальчиков,200.0
2,b237d9539b21,2017-03-01,2017-03-15,2017-03-28,13,14.0,19,2,26.0,06d275498a56,Свердловская область,Екатеринбург,Личные вещи,"Одежда, обувь, аксессуары",Женская одежда,4000.0
3,80bf58082ad3,2017-03-19,2017-03-19,2017-03-28,9,0.0,17,4,51.0,831c8c4a622c,Краснодарский край,Краснодар,Для дома и дачи,Бытовая техника,Для кухни,0.0
4,67a9944a7373,2017-03-14,2017-03-15,2017-03-28,13,1.0,18,3,30.0,248102e50d79,Пермский край,Пермь,Личные вещи,Товары для детей и игрушки,Игрушки,5489.0


In [24]:
all_periods.drop(['activation_date', 'date_from', 'date_to'], axis=1, inplace=True)
gc.collect()

282

In [25]:
gp = all_periods.groupby(['user_id'])[['days_up_sum', 'times_put_up', 'days_headdiff_sum']].mean().reset_index() \
    .rename(index=str, columns={
        'days_up_sum': 'avg_days_up_user',
        'times_put_up': 'avg_times_up_user',
        'days_headdiff_sum': 'avg_headdays_up_user'
    })
gp.head()

Unnamed: 0,user_id,avg_days_up_user,avg_times_up_user,avg_headdays_up_user
0,00000077ff21,12.5,2.0,23.5
1,000006497719,19.0,2.0,13.0
2,00000b4d72f6,3.0,1.0,0.0
3,00000d642d7e,13.0,1.0,14.0
4,0000126b80a4,12.0,1.75,11.625


In [26]:
n_user_items = all_periods.groupby(['user_id'])[['item_id']].count().reset_index() \
    .rename(index=str, columns={
        'item_id': 'n_user_items'
    })
gp = gp.merge(n_user_items, on='user_id', how='outer')

gp.head()

Unnamed: 0,user_id,avg_days_up_user,avg_times_up_user,avg_headdays_up_user,n_user_items
0,00000077ff21,12.5,2.0,23.5,2
1,000006497719,19.0,2.0,13.0,1
2,00000b4d72f6,3.0,1.0,0.0,1
3,00000d642d7e,13.0,1.0,14.0,2
4,0000126b80a4,12.0,1.75,11.625,8


In [29]:
all_periods.head()

Unnamed: 0,item_id,days_up,days_headdiff,days_up_sum,times_put_up,days_headdiff_sum,user_id,region,city,parent_category_name,category_name,param_1,price
0,8f5caef7afb0,1,29.0,17,4,55.0,e292cce69842,Нижегородская область,Нижний Новгород,Услуги,Предложение услуг,"Транспорт, перевозки",0.0
1,66218ff526d1,3,27.0,18,3,51.0,a326c04a24ec,Новосибирская область,Новосибирск,Личные вещи,Детская одежда и обувь,Для мальчиков,200.0
2,b237d9539b21,13,14.0,19,2,26.0,06d275498a56,Свердловская область,Екатеринбург,Личные вещи,"Одежда, обувь, аксессуары",Женская одежда,4000.0
3,80bf58082ad3,9,0.0,17,4,51.0,831c8c4a622c,Краснодарский край,Краснодар,Для дома и дачи,Бытовая техника,Для кухни,0.0
4,67a9944a7373,13,1.0,18,3,30.0,248102e50d79,Пермский край,Пермь,Личные вещи,Товары для детей и игрушки,Игрушки,5489.0


In [30]:
n_user_items = all_periods.groupby(['user_id'])[['price']].mean().reset_index() \
    .rename(index=str, columns={
        'price': 'avg_price_up_user'
    })
gp = gp.merge(n_user_items, on='user_id', how='outer')

gp.head()

Unnamed: 0,user_id,avg_days_up_user,avg_times_up_user,avg_headdays_up_user,n_user_items,avg_price_up_user
0,00000077ff21,12.5,2.0,23.5,2,2750.0
1,000006497719,19.0,2.0,13.0,1,2950000.0
2,00000b4d72f6,3.0,1.0,0.0,1,100000.0
3,00000d642d7e,13.0,1.0,14.0,2,25500.0
4,0000126b80a4,12.0,1.75,11.625,8,2043.75


In [31]:
n_user_items = all_periods.groupby(['user_id'])[['price']].var().reset_index() \
    .rename(index=str, columns={
        'price': 'var_price_up_user'
    })
gp = gp.merge(n_user_items, on='user_id', how='outer')

gp.head()

Unnamed: 0,user_id,avg_days_up_user,avg_times_up_user,avg_headdays_up_user,n_user_items,avg_price_up_user,var_price_up_user
0,00000077ff21,12.5,2.0,23.5,2,2750.0,15125000.0
1,000006497719,19.0,2.0,13.0,1,2950000.0,
2,00000b4d72f6,3.0,1.0,0.0,1,100000.0,
3,00000d642d7e,13.0,1.0,14.0,2,25500.0,1200500000.0
4,0000126b80a4,12.0,1.75,11.625,8,2043.75,11947430.0


In [32]:
gp.isnull().sum()

user_id                       0
avg_days_up_user              0
avg_times_up_user             0
avg_headdays_up_user          0
n_user_items                  0
avg_price_up_user        119115
var_price_up_user       1933071
dtype: int64

In [33]:
gp['avg_price_up_user'] = gp['avg_price_up_user'].fillna(gp['avg_price_up_user'].mean())
gp['var_price_up_user'] = gp['var_price_up_user'].fillna(0)

gp.head()

Unnamed: 0,user_id,avg_days_up_user,avg_times_up_user,avg_headdays_up_user,n_user_items,avg_price_up_user,var_price_up_user
0,00000077ff21,12.5,2.0,23.5,2,2750.0,15125000.0
1,000006497719,19.0,2.0,13.0,1,2950000.0,0.0
2,00000b4d72f6,3.0,1.0,0.0,1,100000.0,0.0
3,00000d642d7e,13.0,1.0,14.0,2,25500.0,1200500000.0
4,0000126b80a4,12.0,1.75,11.625,8,2043.75,11947430.0


In [35]:
gp.to_csv('../input/agg_user.csv', index=False)

In [12]:
# To feather user info
used_cols = ['user_id']

train = pd.read_csv('../input/train.csv', usecols=used_cols)
train_active = pd.read_csv('../input/train_active.csv', usecols=used_cols)
test = pd.read_csv('../input/test.csv', usecols=used_cols)
test_active = pd.read_csv('../input/test_active.csv', usecols=used_cols)

agguser = pd.read_csv('../input/agg_user.csv')

df = pd.concat([train, test , train_active, test_active])
df.shape


(28965751, 1)

In [13]:
df = df.merge(agguser, on='user_id', how='left')
print(df.shape)
df.head()

(28965751, 7)


Unnamed: 0,user_id,avg_days_up_user,avg_times_up_user,avg_headdays_up_user,n_user_items,avg_price_up_user,var_price_up_user
0,e00f8ff2eaf9,8.0,2.0,17.0,1.0,4000.0,0.0
1,39aeb48f0017,,,,,,
2,91e2f88dd6e3,4.428571,1.142857,6.714286,7.0,1035.714286,383928.6
3,bf5cccea572d,16.714286,2.642857,30.392857,28.0,829.642857,1923263.0
4,ef50846afc0b,,,,,,


In [14]:
df.drop('user_id', axis=1, inplace=True)
print(df.shape)
df.head()

(28965751, 6)


Unnamed: 0,avg_days_up_user,avg_times_up_user,avg_headdays_up_user,n_user_items,avg_price_up_user,var_price_up_user
0,8.0,2.0,17.0,1.0,4000.0,0.0
1,,,,,,
2,4.428571,1.142857,6.714286,7.0,1035.714286,383928.6
3,16.714286,2.642857,30.392857,28.0,829.642857,1923263.0
4,,,,,,


In [15]:
df.mean().shape

(6,)

In [16]:
df = df.fillna(df.mean())
df.isnull().sum()

avg_days_up_user        0
avg_times_up_user       0
avg_headdays_up_user    0
n_user_items            0
avg_price_up_user       0
var_price_up_user       0
dtype: int64

In [19]:
df = reduce_mem_usage(df)
df.reset_index(drop=True, inplace=True)

train = df[:lentrain]
test = df[lentrain:lentrain+lentest]
train_active = df[lentrain+lentest: lentrain+lentest+lentrainactive]
test_active = df[lentrain+lentest+lentrainactive: lentrain+lentest+lentrainactive+lentestactive]


print(train.shape)
print(test.shape)
print(train_active.shape)
print(test_active.shape)


Memory usage of dataframe is 1546.94 MB
Memory usage after optimization is: 883.96 MB
Decreased by 42.9%
(1503424, 6)
(508438, 6)
(14129821, 6)
(12824068, 6)


In [21]:
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)
train_active.reset_index(drop=True, inplace=True)
test_active.reset_index(drop=True, inplace=True)

train.to_feather('../features/train/Agg_User_train.feather')
test.to_feather('../features/test/Agg_User_test.feather')
train_active.to_feather('../features/train_active/Agg_User_train_active.feather')
test_active.to_feather('../features/test_active/Agg_User_test_active.feather')
