In [1]:
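"""
This is an upgraded version of Ceshine's LGBM starter script that simply adds
more average features and weekly-average features to it.

NOTE(review): the cells visible in this notebook only build a per-(item, store)
median baseline submission; no LGBM model or average features appear here, so
this header may be stale -- confirm.
"""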
from datetime import date, timedelta

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

In [2]:
import boto3
import pandas as pd
import io
import gzip
# S3 resource handle and bucket name referenced by the loader/saver helpers.
s3 = boto3.resource("s3")
bucket = "twde-datalab"

# Object keys of the raw input CSVs inside the bucket.
train_key, test_key, items_key = "raw/train.csv", "raw/test.csv", "raw/items.csv"

In [40]:
# Compact column dtypes passed to pd.read_csv via `dtype=` in get_traindf.
dtypes = dict(
    id='uint32',
    item_nbr='int32',
    store_nbr='int8',
    unit_sales='float32',
)

def get_testdf(key):
    """Download the CSV stored under `key` in `bucket` and load it as the test frame.

    The first five columns are read, `onpromotion` is typed as bool and `date`
    is parsed as a datetime. The returned DataFrame is indexed by
    (item_nbr, store_nbr).
    """
    raw_bytes = s3.Object(bucket, key).get()['Body'].read()
    frame = pd.read_csv(
        io.BytesIO(raw_bytes),
        usecols=[0, 1, 2, 3, 4],
        dtype={'onpromotion': bool},
        parse_dates=["date"],
    )
    return frame.set_index(['item_nbr', 'store_nbr'])
def get_traindf(key, skiprows=range(1, 124035460)):
    """Download the training CSV stored under `key` in `bucket` and load a slice of it.

    Parameters
    ----------
    key : str
        Object key of the CSV inside `bucket`.
    skiprows : list-like, int or callable, optional
        Forwarded unchanged to `pd.read_csv`. The default skips file rows
        1..124035459 while keeping the header row (row 0); per the original
        author's note this leaves only the August 2017 data.

    Returns
    -------
    pandas.DataFrame
        The columns at positions 1-4 of the file, typed according to the
        module-level `dtypes` mapping, with `date` parsed as a datetime.
    """
    payload = s3.Object(bucket, key).get()['Body'].read()
    traindf = pd.read_csv(
        io.BytesIO(payload),
        usecols=[1, 2, 3, 4],
        dtype=dtypes,
        parse_dates=["date"],
        # Previously hard-coded here; now overridable by the caller.
        skiprows=skiprows,
    )
    return traindf

def get_itemsdf(key):
    """Download the CSV stored under `key` in `bucket` and return it indexed by item_nbr."""
    payload = s3.Object(bucket, key).get()['Body'].read()
    table = pd.read_csv(io.BytesIO(payload))
    return table.set_index("item_nbr")

def save_s3(df, key):
    """Write `df` as gzip-compressed CSV (without the index) to `key` in `bucket`."""
    # Render the frame to CSV text in memory.
    text_buffer = io.StringIO()
    df.to_csv(text_buffer, index=False)
    csv_bytes = text_buffer.getvalue().encode('utf-8')

    # Gzip the UTF-8 bytes into a second in-memory buffer.
    compressed = io.BytesIO()
    with gzip.GzipFile(mode='w', fileobj=compressed) as gz_writer:
        gz_writer.write(csv_bytes)

    # Upload the finished archive in a single put.
    s3.Object(bucket, key).put(Body=compressed.getvalue())

In [4]:
# Load the training slice and replace negative unit_sales values with 0.
train = get_traindf(train_key).assign(
    unit_sales=lambda frame: frame['unit_sales'].mask(frame['unit_sales'] < 0, 0)
)

In [41]:
# Load the items table and the test frame (two independent S3 reads).
items = get_itemsdf(items_key)
test = get_testdf(test_key)

In [28]:
# NOTE(review): the saved output below has no (item_nbr, store_nbr) index, so it
# appears to predate the set_index call in get_testdf -- re-run to refresh.
test.head(n=5)

Unnamed: 0,id,date,store_nbr,item_nbr,onpromotion
0,125497040,2017-08-16,1,96995,False
1,125497041,2017-08-16,1,99197,False
2,125497042,2017-08-16,1,103501,False
3,125497043,2017-08-16,1,103520,False
4,125497044,2017-08-16,1,103665,False


In [37]:
# Median unit_sales per (item_nbr, store_nbr) pair over the loaded training rows,
# kept as a one-column DataFrame so it can be joined onto the test index.
train_median = (
    train
    .groupby(['item_nbr', 'store_nbr'])['unit_sales']
    .median()
    .to_frame('unit_sales')
)

In [43]:
# Preview the per-(item_nbr, store_nbr) medians.
train_median.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,unit_sales
item_nbr,store_nbr,Unnamed: 2_level_1
96995,1,2.0
96995,3,1.0
96995,4,1.0
96995,5,1.0
96995,6,1.0


In [44]:
# Preview the test frame, indexed by (item_nbr, store_nbr).
test.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,id,date,onpromotion
item_nbr,store_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
96995,1,125497040,2017-08-16,False
99197,1,125497041,2017-08-16,False
103501,1,125497042,2017-08-16,False
103520,1,125497043,2017-08-16,False
103665,1,125497044,2017-08-16,False


In [39]:
# NOTE(review): this repeats the earlier train_median preview; candidate for removal.
train_median.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,unit_sales
item_nbr,store_nbr,Unnamed: 2_level_1
96995,1,2.0
96995,3,1.0
96995,4,1.0
96995,5,1.0
96995,6,1.0


In [47]:
# Attach each (item_nbr, store_nbr) median to the matching test rows. Pairs that
# never appear in the loaded training rows get a forecast of 0; only unit_sales
# is filled so that id/date/onpromotion can never be silently overwritten.
median_recent = test.join(train_median, how='left').fillna({'unit_sales': 0})
# train_median holds one row per pair, so a left join must keep the row count.
assert len(median_recent) == len(test), "join changed the number of test rows"

In [51]:
# Keep only the two columns that are written to the submission file.
median_recent_submission = median_recent.loc[:, ['id', 'unit_sales']]

In [52]:
# Preview the submission columns before uploading.
median_recent_submission.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,id,unit_sales
item_nbr,store_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1
96995,1,125497040,2.0
96995,1,125707694,2.0
96995,1,125918348,2.0
96995,1,126129002,2.0
96995,1,126339656,2.0


In [54]:
# Upload the submission as gzip-compressed CSV to the bucket.
save_s3(df=median_recent_submission, key='submission/median_recent.csv.gz')