In [1]:
import itertools
import pandas as pd
import numpy as np

In [2]:
# https://www.kaggle.com/ceshine/mean-baseline-lb-59/code
types = {'id': 'int32',
         'date': 'str',
         'item_nbr': 'int32',
         'store_nbr': 'int16',
         'unit_sales': 'float32',
         'onpromotion': bool }

df_train = pd.read_csv(
    './data/train.csv', usecols=[1, 2, 3, 4, 5], dtype=types,
    converters={'unit_sales': lambda u: float(u) if float(u) > 0 else 0},
    skiprows=range(1, 124035460)
)

In [3]:
df_train["unit_sales"] = np.log1p(df_train["unit_sales"])

In [4]:
u_dates = df_train.date.unique()
u_stores = df_train.store_nbr.unique()
u_items = df_train.item_nbr.unique()
df_train.set_index(["date", "store_nbr", "item_nbr"], inplace=True)
df_train = df_train.reindex(
    pd.MultiIndex.from_product(
        (u_dates, u_stores, u_items),
        names=["date", "store_nbr", "item_nbr"]
    )
)
# Fill NAs
df_train.loc[:, "unit_sales"].fillna(0, inplace=True)
# Assume missing entris imply no promotion
df_train.loc[:, "onpromotion"].fillna(False, inplace=True)

In [5]:
df_train[df_train.unit_sales == 0].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,unit_sales,onpromotion
date,store_nbr,item_nbr,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-08-02,1,105577,0.0,False
2017-08-02,1,105693,0.0,False
2017-08-02,1,105737,0.0,False
2017-08-02,1,108079,0.0,False
2017-08-02,1,108831,0.0,False


In [6]:
df_train.reset_index(inplace=True)

In [7]:
df_train = df_train.groupby(['item_nbr', 'store_nbr', 'onpromotion'])['unit_sales'].mean().to_frame('unit_sales')

In [8]:
df_train["unit_sales"] = df_train["unit_sales"].apply(np.expm1)
df_train.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,unit_sales
item_nbr,store_nbr,onpromotion,Unnamed: 3_level_1
96995,1,False,0.397155
96995,2,False,0.0
96995,3,False,0.455771
96995,4,False,0.10409
96995,5,False,0.10409


In [9]:
# Create submission
test = pd.read_csv(
    "./data/test.csv", usecols=[0, 2, 3, 4], dtype={'id': 'int32',
         'date': 'str',
         'item_nbr': 'int32',
         'store_nbr': 'int16',
         'unit_sales': 'float32',
         'onpromotion': bool }
).set_index(
    ['item_nbr', 'store_nbr', 'onpromotion']
).join(
    df_train, how='left'
).fillna(0.0)

In [11]:
test.reset_index(inplace=True)

In [22]:
items_cols = pd.read_csv('./data/items_encoded.csv', nrows=1).columns
bit_cols = list(set(items_cols)-{'item_nbr'})
item_types = dict(map(lambda x: (x, bool), bit_cols))
items = pd.read_csv('./data/items_encoded.csv', dtype=item_types)

stores_cols = pd.read_csv('./data/stores_encoded.csv', nrows=1).columns
bit_cols = list(set(stores_cols)-{'store_nbr'})
stores_types = dict(map(lambda x: (x, bool), bit_cols))
stores = pd.read_csv('./data/stores_encoded.csv', dtype=stores_types)

test_ext = test.merge(items, on='item_nbr')
test_ext = test_ext.merge(stores, on='store_nbr')

In [23]:
test_ext['store_item_tuple'] = list(test_ext[['item_nbr', 'store_nbr']].itertuples(index=False))
test_ext.head()

Unnamed: 0,item_nbr,store_nbr,onpromotion,id,unit_sales,family_AUTOMOTIVE,family_BABY CARE,family_BEAUTY,family_BEVERAGES,family_BOOKS,...,cluster_17,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6,cluster_7,cluster_8,cluster_9,store_item_tuple
0,96995,1,False,125497040,0.397155,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,"(96995, 1)"
1,96995,1,False,125707694,0.397155,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,"(96995, 1)"
2,96995,1,False,125918348,0.397155,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,"(96995, 1)"
3,96995,1,False,126129002,0.397155,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,"(96995, 1)"
4,96995,1,False,126339656,0.397155,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,"(96995, 1)"


In [24]:
n_train = pd.read_csv('./data/n_train.csv')
n_train_tuples = list(n_train.itertuples(index=False))
test_known = test_ext[~test_ext.store_item_tuple.isin(n_train_tuples)]
test_unknown = test_ext[test_ext.store_item_tuple.isin(n_train_tuples)]
del test_ext

In [36]:
from sklearn.neighbors import KNeighborsRegressor
from scipy import sparse
cols_X = list(set(test_known.columns)-{'item_nbr', 'store_nbr', 'unit_sales', 'id', 'store_item_tuple'})
train_X = sparse.csr_matrix(test_known[cols_X])
train_y = test_known['unit_sales'].fillna(0.0)
test_X = sparse.csr_matrix(test_unknown[cols_X])

In [40]:
import lightgbm as lgb

In [38]:
model = KNeighborsRegressor(n_neighbors=3)
model.fit(train_X, train_y)
test_y = model.predict(test_X)

MemoryError: 

In [None]:
.fillna(0).to_csv(
    './submissions/kernel_mean_baseline.csv.gz', float_format='%.2f', index=None, compression="gzip"
)