## Step 0:  Set up <a class="anchor" id="setup"></a>

In [4]:
# Import standard open libraries
import pandas as pd

# AWS libraries and initialization
import boto3

# Restore variables persisted with `%store` in an earlier notebook/kernel.
# NOTE: keep comments on their own lines; text after a magic is parsed as its arguments.
# Raw input table; used below as a pandas DataFrame.
%store -r df_raw
# Destination S3 bucket name and key prefix for the saved metadata file.
%store -r s3_bucket_name
%store -r s3_prefix
# Not referenced in the cells shown here - presumably the time range of the data; TODO confirm.
%store -r start_time
%store -r end_time
# Column names in df_raw: item identifier, value to forecast, and timestamp.
%store -r item_id
%store -r target_value
%store -r timestamp
# Iterable of forecast dimension column names (filtered against `timestamp` below).
%store -r forecast_dims
# Not referenced in the cells shown here - presumably the forecast frequency; TODO confirm.
%store -r FORECAST_FREQ

## Step 23. Assemble and save metadata (if any) <a class="anchor" id="IM"></a>

In [6]:
# Forecast dimensions without the timestamp: these are the grouping keys that
# identify one metadata row.
dims_except_timestamp = [i for i in forecast_dims if i != timestamp]
print(f"dims_except_timestamp = {dims_except_timestamp}")

# Identify metadata columns
# Select every grouping key (plus item_id, which later cells merge on) together
# with the metadata column. Selecting only `item_id` here raised a KeyError in
# the groupby as soon as forecast_dims contained any other dimension.
# dict.fromkeys de-duplicates while preserving order.
metadata_cols = list(dict.fromkeys([item_id, *dims_except_timestamp, 'pickup_borough']))

# One row per key combination; first() keeps the first non-null value per column.
# as_index=False returns the keys as regular columns (no in-place reset needed).
im = df_raw[metadata_cols].groupby(dims_except_timestamp, as_index=False).first()

# check nulls
display(im.isnull().sum())
im.sample(2)

dims_except_timestamp = ['pulocationid']


pulocationid      0
pickup_borough    0
dtype: int64

Unnamed: 0,pulocationid,pickup_borough
145,232,Manhattan
72,167,Bronx


In [7]:
# Additional metadata created by binning just item target_value is sometimes useful.

# Peak target_value per item (location).
# Selecting the single column before aggregating yields a plain Series, so no
# MultiIndex columns are created (the old dict/list agg needed a drop + flatten
# afterwards, which emitted a pandas warning) and df_raw is not copied.
item_max = df_raw.groupby(item_id)[target_value].max()

# bin data into 4 equal-width categories over the range of per-item maxima
cat_scales = ["Cat_{}".format(i) for i in range(1, 5)]
synthetic = item_max.index.to_frame(index=False)
# list(...) stores plain labels (object dtype) rather than a Categorical
synthetic['item_cat_by_max'] = list(
    pd.cut(item_max.values, len(cat_scales), labels=cat_scales)
)

print(synthetic.shape)
print(synthetic.dtypes)
print(synthetic.columns)
display(synthetic.sample(5))
print(synthetic.item_cat_by_max.value_counts(dropna=False))

# merge synthetic features
# Drop any previous copy of the column first so re-running this cell does not
# produce item_cat_by_max_x / item_cat_by_max_y duplicates.
im = (
    im.drop(columns='item_cat_by_max', errors='ignore')
      .merge(synthetic, how="left", on=[item_id])
)
print(im.shape, synthetic.shape)
im.head()

(260, 2)
pulocationid       object
item_cat_by_max    object
dtype: object
Index(['pulocationid', 'item_cat_by_max'], dtype='object')


  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


Unnamed: 0,pulocationid,item_cat_by_max
254,94,Cat_1
141,229,Cat_1
192,38,Cat_1
87,180,Cat_1
167,252,Cat_1


Cat_1    234
Cat_2     20
Cat_3      4
Cat_4      2
Name: item_cat_by_max, dtype: int64
(260, 3) (260, 2)


Unnamed: 0,pulocationid,pickup_borough,item_cat_by_max
0,1,EWR,Cat_1
1,10,Queens,Cat_1
2,100,Manhattan,Cat_1
3,101,Queens,Cat_1
4,102,Queens,Cat_1


In [8]:
# Inspect the metadata table assembled so far.

print(im.shape)

# Tables under 50 rows are shown in full; anything larger is previewed with head().
preview = im if len(im) < 50 else im.head()
display(preview)

# Summary statistics reveal the cardinality of each metadata column.
im.describe()

(260, 3)


Unnamed: 0,pulocationid,pickup_borough,item_cat_by_max
0,1,EWR,Cat_1
1,10,Queens,Cat_1
2,100,Manhattan,Cat_1
3,101,Queens,Cat_1
4,102,Queens,Cat_1


Unnamed: 0,pulocationid,pickup_borough,item_cat_by_max
count,260,260,260
unique,260,6,4
top,67,Queens,Cat_1
freq,1,69,234


In [11]:
# merge in sparse or not column
#im['is_sparse'] = 0

#im.loc[(im[item_id].isin(list(sparse[item_id].unique()))), 'is_sparse'] = 1
#print(im.is_sparse.value_counts(dropna=False))
#im.sample(5)

In [12]:
# merge in top-moving or not column
#im['top_moving'] = 0

#im.loc[(im[item_id].isin(list(top_moving_items[item_id].unique()))), 'top_moving'] = 1
#print(im.top_moving.value_counts(dropna=False))
#im.sample(5)

In [13]:
# merge in time series categories column
#categories_df = full_history.groupby([item_id])[[item_id, 'ts_type']].first()
#categories_df.reset_index(inplace=True, drop=True)
# categories_df.head(2)

#im = im.merge(categories_df, how="left", on=[item_id])
#print(im.ts_type.value_counts(dropna=False))
#im.sample(5)

In [14]:
# Assemble metadata just columns you want

# Name the columns explicitly instead of slicing by position (iloc[:, 0:3]):
# positional slicing silently picks the wrong columns if an optional merge
# above is enabled or the column order changes. Edit this list to add or
# remove metadata columns.
metadata_cols = [item_id, 'pickup_borough', 'item_cat_by_max']

# Collapse to exactly one row per item; max() resolves any duplicate rows.
# as_index=False keeps item_id as a regular column.
im = im[metadata_cols].groupby(item_id, as_index=False).max()
print(im.shape)
print("checking nulls..")
print(im.isnull().sum())
im.sample(5)

(260, 3)
checking nulls..
pulocationid       0
pickup_borough     0
item_cat_by_max    0
dtype: int64


Unnamed: 0,pulocationid,pickup_borough,item_cat_by_max
18,118,Staten Island,Cat_1
124,213,Bronx,Cat_1
147,234,Manhattan,Cat_2
163,249,Manhattan,Cat_2
110,200,Bronx,Cat_1


In [16]:
# Persist the item metadata: write a local CSV, then upload it to S3.
local_file = "im.csv"

# The CSV is written without a header row and without the DataFrame index.
im.to_csv(local_file, index=False, header=False)

# Object key: "<s3_prefix>/<file name>" inside the configured bucket.
key = f"{s3_prefix}/{local_file}"
bucket = boto3.Session().resource('s3').Bucket(s3_bucket_name)
bucket.Object(key).upload_file(local_file)