In [1]:
import numpy as np 
import pandas as pd 

import datetime

from tsfresh import (extract_features, extract_relevant_features, select_features)
from tsfresh.feature_extraction import settings
from tsfresh.utilities.dataframe_functions import impute

In [2]:
# Read the train and test tables; both use their 'id' column as the row index.
train, test = (
    pd.read_csv(csv_name, index_col='id')
    for csv_name in ('train.csv', 'test.csv')
)

In [3]:
# Display the training frame (the notebook renders a truncated head/tail view).
train

Unnamed: 0_level_0,date,store_nbr,family,sales,onpromotion
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2013-01-01,1,AUTOMOTIVE,0.000,0
1,2013-01-01,1,BABY CARE,0.000,0
2,2013-01-01,1,BEAUTY,0.000,0
3,2013-01-01,1,BEVERAGES,0.000,0
4,2013-01-01,1,BOOKS,0.000,0
...,...,...,...,...,...
3000883,2017-08-15,9,POULTRY,438.133,0
3000884,2017-08-15,9,PREPARED FOODS,154.553,1
3000885,2017-08-15,9,PRODUCE,2419.729,148
3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8


In [4]:
# Keep only the rows dated 2017-07-31 or later; 'date' is compared as given in the CSV.
train_16_days = train.loc[train['date'] >= '2017-07-31']
def exp_mean_ln(df):
    """Average ``df['sales']`` in log1p space and map the result back with expm1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a numeric ``'sales'`` column.

    Returns
    -------
    float
        ``expm1(mean(log1p(sales)))``.
    """
    log_sales = np.log1p(df['sales'])
    return np.expm1(log_sales.mean())
# For every (store_nbr, family) pair, apply exp_mean_ln to the recent window and
# collect the results in a dict keyed by (store_nbr, family) tuples.
# Only the 'sales' column is handed to apply(): exp_mean_ln reads nothing else,
# and this avoids pandas' deprecated behaviour of passing the grouping columns
# into the applied function. The resulting values are unchanged.
train_average = (
    train_16_days
    .groupby(['store_nbr', 'family'])[['sales']]
    .apply(exp_mean_ln)
    .to_dict()
)

In [11]:
# Display the subset of training rows dated 2017-07-31 or later.
train_16_days

Unnamed: 0_level_0,date,store_nbr,family,sales,onpromotion
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2972376,2017-07-31,1,AUTOMOTIVE,8.000,0
2972377,2017-07-31,1,BABY CARE,0.000,0
2972378,2017-07-31,1,BEAUTY,3.000,0
2972379,2017-07-31,1,BEVERAGES,2414.000,24
2972380,2017-07-31,1,BOOKS,1.000,0
...,...,...,...,...,...
3000883,2017-08-15,9,POULTRY,438.133,0
3000884,2017-08-15,9,PREPARED FOODS,154.553,1
3000885,2017-08-15,9,PRODUCE,2419.729,148
3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8


In [10]:
# Display the {(store_nbr, family): value} dict computed with exp_mean_ln.
train_average

{(1, 'AUTOMOTIVE'): 4.280821316459616,
 (1, 'BABY CARE'): 0.0,
 (1, 'BEAUTY'): 3.3918733701608357,
 (1, 'BEVERAGES'): 1893.2864301263335,
 (1, 'BOOKS'): 0.04427378242741384,
 (1, 'BREAD/BAKERY'): 291.8630123910542,
 (1, 'CELEBRATION'): 7.641382799839562,
 (1, 'CLEANING'): 541.2789201201812,
 (1, 'DAIRY'): 581.9465174822476,
 (1, 'DELI'): 113.4986066863138,
 (1, 'EGGS'): 120.20387650148035,
 (1, 'FROZEN FOODS'): 114.21836140236431,
 (1, 'GROCERY I'): 2162.3437442964223,
 (1, 'GROCERY II'): 15.66027532770029,
 (1, 'HARDWARE'): 1.2279870057861473,
 (1, 'HOME AND KITCHEN I'): 21.51760994010792,
 (1, 'HOME AND KITCHEN II'): 22.331206761929582,
 (1, 'HOME APPLIANCES'): 0.07107548307291447,
 (1, 'HOME CARE'): 150.12149714547004,
 (1, 'LADIESWEAR'): 11.899024266310901,
 (1, 'LAWN AND GARDEN'): 15.671250937479225,
 (1, 'LINGERIE'): 5.756134606997616,
 (1, 'LIQUOR,WINE,BEER'): 81.92102009965444,
 (1, 'MAGAZINES'): 5.582924969261206,
 (1, 'MEATS'): 259.50500545444794,
 (1, 'PERSONAL CARE'): 146.6

In [8]:
# Look up each test row's (store_nbr, family) pair in train_average and store
# the result as that row's 'sales' value (assignment is positional).
store_family_keys = pd.MultiIndex.from_frame(test[['store_nbr', 'family']])
test['sales'] = store_family_keys.map(train_average.get)

In [9]:
# Display the test frame, now including the 'sales' column filled from train_average.
test

Unnamed: 0_level_0,date,store_nbr,family,onpromotion,sales
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3000888,2017-08-16,1,AUTOMOTIVE,0,4.280821
3000889,2017-08-16,1,BABY CARE,0,0.000000
3000890,2017-08-16,1,BEAUTY,2,3.391873
3000891,2017-08-16,1,BEVERAGES,20,1893.286430
3000892,2017-08-16,1,BOOKS,0,0.044274
...,...,...,...,...,...
3029395,2017-08-31,9,POULTRY,1,436.545569
3029396,2017-08-31,9,PREPARED FOODS,0,108.170800
3029397,2017-08-31,9,PRODUCE,1,1607.282218
3029398,2017-08-31,9,SCHOOL AND OFFICE SUPPLIES,9,141.254425


In [36]:
# Stack train on top of test into one frame with a fresh 0..n-1 row index.
# NOTE(review): ignore_index=True discards the 'id' index of both frames; a later
# cell relabels the positional index as 'id', which only matches the original
# ids if they are contiguous and start at 0 — confirm against the data.
train_test = pd.concat([train, test], ignore_index=True)

In [37]:
# Convert the 'date' column to datetime values.
train_test['date'] = pd.to_datetime(train_test['date'])

In [38]:
# Load the oil price table with 'date' parsed and used as the index, then turn
# the index into a daily PeriodIndex.
# `infer_datetime_format` is no longer passed: it is deprecated since pandas 2.0,
# where the consistent-format inference it requested is already the default,
# so the parsed dates are the same without the warning.
oil = pd.read_csv('oil.csv', parse_dates=['date'], index_col='date').to_period('D')

In [39]:
# Fill missing prices between existing rows by linear interpolation.
oil = oil.interpolate(method='linear')
# The first row has no earlier neighbour to interpolate from, so copy the second row into it.
oil.iloc[0] = oil.iloc[1]

# Some calendar days are absent from the oil table. Build the complete daily
# calendar from the first to the last date in train_test (both ends included).
# The span is derived from the data instead of a hard-coded day count, so it
# stays correct if the train/test date range changes.
start_date = train_test.date.min()
end_date = train_test.date.max()
date = pd.DataFrame({'date': pd.date_range(start_date, end_date, freq='D')})
number_of_days = len(date)

# Join the calendar and the oil table on the string form of the day.
date['date_str'] = date.date.astype(str)
oil['date_str'] = oil.index.astype(str)

oil = pd.merge(date, oil, how='left', on='date_str')
# Second interpolation pass fills the calendar days that the merge introduced.
oil = oil.set_index('date').dcoilwtico.interpolate(method='linear').to_frame()
oil['date_str'] = oil.index.astype(str)

In [40]:
# Display the gap-filled daily oil frame (columns: dcoilwtico, date_str).
oil

Unnamed: 0_level_0,dcoilwtico,date_str
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-01-01,93.140000,2013-01-01
2013-01-02,93.140000,2013-01-02
2013-01-03,92.970000,2013-01-03
2013-01-04,93.120000,2013-01-04
2013-01-05,93.146667,2013-01-05
...,...,...
2017-08-27,46.816667,2017-08-27
2017-08-28,46.400000,2017-08-28
2017-08-29,46.460000,2017-08-29
2017-08-30,45.960000,2017-08-30


In [41]:
# Attach the oil price to every row via a left join on the string form of the date.
train_test['date_str'] = train_test['date'].astype(str)
train_test = train_test.merge(oil, how='left', on='date_str')

In [42]:
# Display the combined frame after the oil price merge.
train_test

Unnamed: 0,date,store_nbr,family,sales,onpromotion,date_str,dcoilwtico
0,2013-01-01,1,AUTOMOTIVE,0.000000,0,2013-01-01,93.14
1,2013-01-01,1,BABY CARE,0.000000,0,2013-01-01,93.14
2,2013-01-01,1,BEAUTY,0.000000,0,2013-01-01,93.14
3,2013-01-01,1,BEVERAGES,0.000000,0,2013-01-01,93.14
4,2013-01-01,1,BOOKS,0.000000,0,2013-01-01,93.14
...,...,...,...,...,...,...,...
3029395,2017-08-31,9,POULTRY,436.545569,1,2017-08-31,47.26
3029396,2017-08-31,9,PREPARED FOODS,108.170800,0,2017-08-31,47.26
3029397,2017-08-31,9,PRODUCE,1607.282218,1,2017-08-31,47.26
3029398,2017-08-31,9,SCHOOL AND OFFICE SUPPLIES,141.254425,9,2017-08-31,47.26


In [43]:
# Keep rows dated 2017-08-01 or later.
# (Alternative cutoff left by the author for reference: '2016-01-01'.)
train_test = train_test.loc[train_test['date'] >= '2017-08-01']

In [44]:
# Feature input: drop the target, expose the row index as an 'id' column, and
# keep only the id, the date and the two numeric columns.
X = (
    train_test
    .drop(columns='sales')
    .rename_axis('id')
    .reset_index()
    .loc[:, ['id', 'date', 'onpromotion', 'dcoilwtico']]
)

In [45]:
# Display the feature-extraction input (id, date, onpromotion, dcoilwtico).
X

Unnamed: 0,id,date,onpromotion,dcoilwtico
0,2974158,2017-08-01,0,49.19
1,2974159,2017-08-01,0,49.19
2,2974160,2017-08-01,0,49.19
3,2974161,2017-08-01,26,49.19
4,2974162,2017-08-01,0,49.19
...,...,...,...,...
55237,3029395,2017-08-31,1,47.26
55238,3029396,2017-08-31,0,47.26
55239,3029397,2017-08-31,1,47.26
55240,3029398,2017-08-31,9,47.26


In [46]:
#test_y = train_test[(train_test['date'] >= '2017-05-01')]
# Target vector: the 'sales' column of the filtered frame, keeping its row index.
# NOTE(review): rows that originate from `test` hold the per-(store_nbr, family)
# averages assigned earlier, not observed sales — confirm this is intended.
y = train_test['sales']

In [47]:
# Display the target series.
y

2974158       5.000000
2974159       0.000000
2974160       4.000000
2974161    2627.000000
2974162       0.000000
              ...     
3029395     436.545569
3029396     108.170800
3029397    1607.282218
3029398     141.254425
3029399      18.164282
Name: sales, Length: 55242, dtype: float64

In [50]:
# Run tsfresh feature extraction with relevance filtering against y, using 'id'
# as the series identifier and 'date' as the sort column.
# NOTE(review): 'id' was built from the row index, so each id appears to cover a
# single row; the displayed features (sum == mean == max per id) are consistent
# with length-1 series — confirm whether a per-store/family id was intended.
relevant_features = extract_relevant_features(X, y, column_id='id', column_sort='date')

Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 20/20 [18:03<00:00, 54.19s/it]


(55242, 27)

In [51]:
# Display the frame of features returned by extract_relevant_features.
relevant_features

Unnamed: 0,onpromotion__sum_values,onpromotion__value_count__value_0,onpromotion__range_count__max_1__min_-1,"onpromotion__fft_coefficient__attr_""abs""__coeff_0","onpromotion__fft_coefficient__attr_""real""__coeff_0","onpromotion__cwt_coefficients__coeff_0__w_20__widths_(2, 5, 10, 20)","onpromotion__cwt_coefficients__coeff_0__w_10__widths_(2, 5, 10, 20)","onpromotion__cwt_coefficients__coeff_0__w_5__widths_(2, 5, 10, 20)","onpromotion__cwt_coefficients__coeff_0__w_2__widths_(2, 5, 10, 20)",onpromotion__quantile__q_0.9,...,onpromotion__mean,onpromotion__root_mean_square,onpromotion__maximum,onpromotion__median,onpromotion__minimum,onpromotion__quantile__q_0.1,onpromotion__quantile__q_0.2,onpromotion__absolute_maximum,onpromotion__value_count__value_1,onpromotion__benford_correlation
2974158,0.0,1.0,1.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.295657
2974159,0.0,1.0,1.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.295657
2974160,0.0,1.0,1.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.295657
2974161,26.0,0.0,0.0,26.0,26.0,5.042434,7.131079,10.084869,15.945577,26.0,...,26.0,26.0,26.0,26.0,26.0,26.0,26.0,26.0,0.0,0.295657
2974162,0.0,1.0,1.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.295657
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3029395,1.0,0.0,0.0,1.0,1.0,0.193940,0.274272,0.387880,0.613291,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.864123
3029396,0.0,1.0,1.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.295657
3029397,1.0,0.0,0.0,1.0,1.0,0.193940,0.274272,0.387880,0.613291,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.864123
3029398,9.0,0.0,0.0,9.0,9.0,1.745458,2.468450,3.490916,5.519623,9.0,...,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,0.0,-0.297356


In [53]:
# Write the extracted features (including the index) to 'X-more-features.csv'.
relevant_features.to_csv('X-more-features.csv')

In [54]:
# Print the names of all extracted feature columns as a plain list.
print(relevant_features.columns.tolist())

['onpromotion__sum_values', 'onpromotion__value_count__value_0', 'onpromotion__range_count__max_1__min_-1', 'onpromotion__fft_coefficient__attr_"abs"__coeff_0', 'onpromotion__fft_coefficient__attr_"real"__coeff_0', 'onpromotion__cwt_coefficients__coeff_0__w_20__widths_(2, 5, 10, 20)', 'onpromotion__cwt_coefficients__coeff_0__w_10__widths_(2, 5, 10, 20)', 'onpromotion__cwt_coefficients__coeff_0__w_5__widths_(2, 5, 10, 20)', 'onpromotion__cwt_coefficients__coeff_0__w_2__widths_(2, 5, 10, 20)', 'onpromotion__quantile__q_0.9', 'onpromotion__quantile__q_0.8', 'onpromotion__quantile__q_0.7', 'onpromotion__quantile__q_0.6', 'onpromotion__count_below__t_0', 'onpromotion__quantile__q_0.3', 'onpromotion__quantile__q_0.4', 'onpromotion__abs_energy', 'onpromotion__mean', 'onpromotion__root_mean_square', 'onpromotion__maximum', 'onpromotion__median', 'onpromotion__minimum', 'onpromotion__quantile__q_0.1', 'onpromotion__quantile__q_0.2', 'onpromotion__absolute_maximum', 'onpromotion__value_count__va