In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Load items data

In [2]:
# Read the item master table from the local data directory into a DataFrame.
items = pd.read_csv(filepath_or_buffer='./data/items.csv')

In [3]:
# Preview the first five rows to see which columns are available.
items.head(n=5)

Unnamed: 0,item_nbr,family,class,perishable
0,96995,GROCERY I,1093,0
1,99197,GROCERY I,1067,0
2,103501,CLEANING,3008,0
3,103520,GROCERY I,1028,0
4,103665,BREAD/BAKERY,2712,1


### `item_nbr`, `family` and `class` need to be encoded; `perishable` is already a 0/1 flag

In [4]:
# Build a lookup table with one row per distinct family: one indicator column
# per family name, plus the family label itself so the table can be joined
# back onto `items` by 'family'.
families = items['family'].unique()
families_encoded = pd.get_dummies(families).assign(family=families)
families_encoded.head()

Unnamed: 0,AUTOMOTIVE,BABY CARE,BEAUTY,BEVERAGES,BOOKS,BREAD/BAKERY,CELEBRATION,CLEANING,DAIRY,DELI,...,MEATS,PERSONAL CARE,PET SUPPLIES,PLAYERS AND ELECTRONICS,POULTRY,PREPARED FOODS,PRODUCE,SCHOOL AND OFFICE SUPPLIES,SEAFOOD,family
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,GROCERY I
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,CLEANING
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,BREAD/BAKERY
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,DELI
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,POULTRY


In [5]:
# Build a lookup table with one row per distinct class: one indicator column
# per class value, plus the class value itself as the join key.
classes = items['class'].unique()
# `class` is a Python keyword, so the column name is passed via dict unpacking.
classes_encoded = pd.get_dummies(classes).assign(**{'class': classes})
classes_encoded.head()

Unnamed: 0,1002,1003,1004,1005,1006,1008,1010,1012,1013,1014,...,6922,6924,6936,6954,6960,7002,7016,7034,7780,class
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1093
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1067
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3008
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1028
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2712


In [6]:
# Build a lookup table with one row per distinct item number: one indicator
# column per item number, plus the item number itself as the join key.
items_numbers = items['item_nbr'].unique()
items_encoded = pd.get_dummies(items_numbers).assign(item_nbr=items_numbers)
items_encoded.head()

Unnamed: 0,96995,99197,103501,103520,103665,105574,105575,105576,105577,105693,...,2131010,2131572,2131699,2132163,2132318,2132945,2132957,2134058,2134244,item_nbr
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,96995
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,99197
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,103501
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,103520
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,103665


### Save encoded master data

In [11]:
# Write each encoded lookup table to its own HDF5 file in table format;
# mode='w' replaces any existing file at that path.
# `key` is passed by keyword: pandas 2.1 deprecated passing it positionally and
# pandas 3.0 makes every argument after the path keyword-only.
# NOTE(review): the third file is './data/item_id_encoded' while its key is
# 'items_id_encoded' -- left unchanged in case downstream readers rely on these
# exact names; confirm before renaming.
families_encoded.to_hdf('./data/families_encoded', key='families_encoded', mode='w', format='table')
classes_encoded.to_hdf('./data/classes_encoded', key='classes_encoded', mode='w', format='table')
items_encoded.to_hdf('./data/item_id_encoded', key='items_id_encoded', mode='w', format='table')

### Join with encoded data and remove redundant category columns

In [8]:
# Attach the indicator columns to every item in three steps, joining on the
# shared key column each time. how='inner' is the pandas default, spelled out
# here for clarity.
items_with_families = items.merge(families_encoded, how='inner', on='family')
items_with_families_classes = items_with_families.merge(
    classes_encoded, how='inner', on='class'
)
items_with_families_classes_items = items_with_families_classes.merge(
    items_encoded, how='inner', on='item_nbr'
)

In [10]:
# Remove the raw 'family' and 'class' columns now that their indicator columns
# have been merged in.
# The result is rebound instead of dropped in place, and errors='ignore' skips
# labels that are already gone, so re-running this cell is a no-op rather than
# a KeyError.
items_with_families_classes_items = items_with_families_classes_items.drop(
    ['family', 'class'], axis=1, errors='ignore'
)
items_with_families_classes_items.head()

Unnamed: 0,item_nbr,perishable,AUTOMOTIVE,BABY CARE,BEAUTY,BEVERAGES,BOOKS,BREAD/BAKERY,CELEBRATION,CLEANING,...,2130553,2131010,2131572,2131699,2132163,2132318,2132945,2132957,2134058,2134244
0,96995,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,479200,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1093340,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1093344,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1467093,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Persist the fully encoded item table to HDF5 in table format; mode='w'
# replaces any existing file at that path.
# `key` is passed by keyword: pandas 2.1 deprecated passing it positionally and
# pandas 3.0 makes every argument after the path keyword-only.
items_with_families_classes_items.to_hdf('./data/items_encoded', key='items_encoded', mode='w', format='table')