In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Load stores data

In [2]:
# Read the store metadata table from CSV, then preview its first five rows.
stores = pd.read_csv(filepath_or_buffer='./data/stores.csv')
stores.head(n=5)

Unnamed: 0,store_nbr,city,state,type,cluster
0,1,Quito,Pichincha,D,13
1,2,Quito,Pichincha,D,13
2,3,Quito,Pichincha,D,8
3,4,Quito,Pichincha,D,9
4,5,Santo Domingo,Santo Domingo de los Tsachilas,D,4


### Get unique values for one-hot encoding

In [3]:
# Keep the whole store-number column; it is reused later as a join key.
stores_ids = stores['store_nbr']

# Distinct values of each categorical column, in order of first appearance.
cities, states, types, clusters = (
    stores[column].unique()
    for column in ('city', 'state', 'type', 'cluster')
)

In [4]:
# Display the store-number column taken from `stores`.
stores_ids

0      1
1      2
2      3
3      4
4      5
5      6
6      7
7      8
8      9
9     10
10    11
11    12
12    13
13    14
14    15
15    16
16    17
17    18
18    19
19    20
20    21
21    22
22    23
23    24
24    25
25    26
26    27
27    28
28    29
29    30
30    31
31    32
32    33
33    34
34    35
35    36
36    37
37    38
38    39
39    40
40    41
41    42
42    43
43    44
44    45
45    46
46    47
47    48
48    49
49    50
50    51
51    52
52    53
53    54
Name: store_nbr, dtype: int64

In [5]:
# Display the distinct city names found in `stores`.
cities

array(['Quito', 'Santo Domingo', 'Cayambe', 'Latacunga', 'Riobamba',
       'Ibarra', 'Guaranda', 'Puyo', 'Ambato', 'Guayaquil', 'Salinas',
       'Daule', 'Babahoyo', 'Quevedo', 'Playas', 'Libertad', 'Cuenca',
       'Loja', 'Machala', 'Esmeraldas', 'Manta', 'El Carmen'], dtype=object)

In [6]:
# Display the distinct state names found in `stores`.
states

array(['Pichincha', 'Santo Domingo de los Tsachilas', 'Cotopaxi',
       'Chimborazo', 'Imbabura', 'Bolivar', 'Pastaza', 'Tungurahua',
       'Guayas', 'Santa Elena', 'Los Rios', 'Azuay', 'Loja', 'El Oro',
       'Esmeraldas', 'Manabi'], dtype=object)

In [7]:
# Display the distinct store types found in `stores`.
types

array(['D', 'B', 'C', 'E', 'A'], dtype=object)

In [8]:
# Display the distinct cluster ids found in `stores`.
clusters

array([13,  8,  9,  4,  6, 15,  7,  3, 12, 16,  1, 10,  2,  5, 11, 14, 17])

### Encode store numbers and add store_nbr key for joining

In [9]:
# One-hot encode the store numbers: one indicator column per store number
# (the column labels are the store numbers themselves).
stores_hot_encoded = pd.get_dummies(stores_ids)
# Keep the original store number as a key column so the table can be joined later.
stores_hot_encoded['store_nbr'] = stores_ids
# Persist the store-number encoding under its own file and key. It previously went to
# './data/stores_encoded' / 'stores_encoded', which the final write later in this
# notebook replaces (mode='w'), so the intermediate table was always lost.
# `key` is passed by keyword because newer pandas no longer accepts it positionally.
stores_hot_encoded.to_hdf('./data/store_ids_encoded', key='store_ids_encoded', mode='w', format='table')
stores_hot_encoded.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,46,47,48,49,50,51,52,53,54,store_nbr
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5


### Encode states and add state key for joining

In [10]:
# One-hot encode the distinct state names: one indicator column per state.
states_hot_encoded = pd.get_dummies(states)
# Append the state name itself as a `state` key column for joining.
states_hot_encoded['state'] = states
# `key` is passed by keyword because newer pandas no longer accepts it positionally.
states_hot_encoded.to_hdf('./data/states_encoded', key='states_encoded', mode='w', format='table')
states_hot_encoded.head()

Unnamed: 0,Azuay,Bolivar,Chimborazo,Cotopaxi,El Oro,Esmeraldas,Guayas,Imbabura,Loja,Los Rios,Manabi,Pastaza,Pichincha,Santa Elena,Santo Domingo de los Tsachilas,Tungurahua,state
0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,Pichincha
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,Santo Domingo de los Tsachilas
2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,Cotopaxi
3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,Chimborazo
4,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,Imbabura


### Encode cities and add city key for joining

In [11]:
# One-hot encode the distinct city names: one indicator column per city.
cities_hot_encoded = pd.get_dummies(cities)
# Append the city name itself as a `city` key column for joining.
cities_hot_encoded['city'] = cities
# `key` is passed by keyword because newer pandas no longer accepts it positionally.
cities_hot_encoded.to_hdf('./data/cities_encoded', key='cities_encoded', mode='w', format='table')
cities_hot_encoded.head()

Unnamed: 0,Ambato,Babahoyo,Cayambe,Cuenca,Daule,El Carmen,Esmeraldas,Guaranda,Guayaquil,Ibarra,...,Machala,Manta,Playas,Puyo,Quevedo,Quito,Riobamba,Salinas,Santo Domingo,city
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,Quito
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,Santo Domingo
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Cayambe
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Latacunga
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,Riobamba


### Encode store types and add type key for joining

In [12]:
# One-hot encode the distinct store types: one indicator column per type.
types_hot_encoded = pd.get_dummies(types)
# Append the type value itself as a `type` key column for joining.
types_hot_encoded['type'] = types
# `key` is passed by keyword because newer pandas no longer accepts it positionally.
types_hot_encoded.to_hdf('./data/types_encoded', key='types_encoded', mode='w', format='table')
types_hot_encoded.head()

Unnamed: 0,A,B,C,D,E,type
0,0,0,0,1,0,D
1,0,1,0,0,0,B
2,0,0,1,0,0,C
3,0,0,0,0,1,E
4,1,0,0,0,0,A


### Encode clusters and add cluster key for joining 

In [13]:
# One-hot encode the distinct cluster ids: one indicator column per cluster
# (the column labels are the cluster ids themselves).
clusters_hot_encoded = pd.get_dummies(clusters)
# Append the cluster id itself as a `cluster` key column for joining.
clusters_hot_encoded['cluster'] = clusters
# `key` is passed by keyword because newer pandas no longer accepts it positionally.
clusters_hot_encoded.to_hdf('./data/clusters_encoded', key='clusters_encoded', mode='w', format='table')
clusters_hot_encoded.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,cluster
0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,13
1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,8
2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,9
3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,4
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,6


In [14]:
# Dimensions of the raw stores table, as a baseline before any joins.
stores.shape

(54, 5)

### Join stores with encoded types

In [15]:
# Attach the store-type indicator columns by matching rows on the shared `type` key.
stores_with_type = stores.merge(types_hot_encoded, how='inner', on='type')

In [16]:
# Check the dimensions after joining the type indicators.
stores_with_type.shape

(54, 10)

### Join result with encoded clusters

In [17]:
# Attach the cluster indicator columns by matching rows on the shared `cluster` key.
stores_with_type_cluster = stores_with_type.merge(
    clusters_hot_encoded, how='inner', on='cluster'
)

In [18]:
# Check the dimensions after joining the cluster indicators.
stores_with_type_cluster.shape

(54, 27)

### Join result with encoded states

In [19]:
# Attach the state indicator columns by matching rows on the shared `state` key.
stores_with_type_cluster_state = stores_with_type_cluster.merge(
    states_hot_encoded, how='inner', on='state'
)

In [20]:
# Check the dimensions after joining the state indicators.
stores_with_type_cluster_state.shape

(54, 43)

### Join result with encoded cities

In [21]:
# Attach the city indicator columns by matching rows on the shared `city` key.
# 'Loja' and 'Esmeraldas' occur both as state and as city names, so pandas'
# default suffixes apply: `_x` marks the state column, `_y` the city column.
stores_with_type_cluster_state_city = stores_with_type_cluster_state.merge(
    cities_hot_encoded, how='inner', on='city'
)

In [22]:
# Check the dimensions after joining the city indicators.
stores_with_type_cluster_state_city.shape

(54, 65)

In [23]:
# Preview the joined table (raw categorical columns are still present here).
stores_with_type_cluster_state_city.head()

Unnamed: 0,store_nbr,city,state,type,cluster,A,B,C,D,E,...,Loja_y,Machala,Manta,Playas,Puyo,Quevedo,Quito,Riobamba,Salinas,Santo Domingo
0,1,Quito,Pichincha,D,13,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
1,2,Quito,Pichincha,D,13,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
2,6,Quito,Pichincha,D,13,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,3,Quito,Pichincha,D,8,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
4,7,Quito,Pichincha,D,8,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0


### Remove category columns and join encoded store numbers

In [24]:
# Drop the raw categorical columns now that their indicator columns are attached.
# Reassigning (instead of `inplace=True`) together with `errors='ignore'` keeps this
# cell safe to re-run: a second run no longer raises KeyError for columns that were
# already removed. The same variable name still refers to the dropped frame.
stores_with_type_cluster_state_city = stores_with_type_cluster_state_city.drop(
    ['city', 'state', 'type', 'cluster'], axis=1, errors='ignore'
)
# Attach the per-store indicator columns via the shared `store_nbr` key.
# Cluster labels (1-17) overlap with store labels (1-54); for the overlapping
# labels pandas' default suffixes apply: `_x` is the cluster column, `_y` the store column.
stores_with_type_cluster_state_city_stores = pd.merge(stores_with_type_cluster_state_city, stores_hot_encoded, on=['store_nbr'])
stores_with_type_cluster_state_city_stores.head()

Unnamed: 0,store_nbr,A,B,C,D,E,1_x,2_x,3_x,4_x,...,45,46,47,48,49,50,51,52,53,54
0,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,7,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Write encoded store data to file

In [25]:
# Persist the fully encoded store table; mode='w' replaces any existing file at this path.
# `key` is passed by keyword because newer pandas no longer accepts it positionally.
stores_with_type_cluster_state_city_stores.to_hdf('./data/stores_encoded', key='stores_encoded', mode='w', format='table')

In [29]:
# Inspect the column dtypes of the final encoded table.
stores_with_type_cluster_state_city_stores.dtypes

store_nbr       int64
A               uint8
B               uint8
C               uint8
D               uint8
E               uint8
1_x             uint8
2_x             uint8
3_x             uint8
4_x             uint8
5_x             uint8
6_x             uint8
7_x             uint8
8_x             uint8
9_x             uint8
10_x            uint8
11_x            uint8
12_x            uint8
13_x            uint8
14_x            uint8
15_x            uint8
16_x            uint8
17_x            uint8
Azuay           uint8
Bolivar         uint8
Chimborazo      uint8
Cotopaxi        uint8
El Oro          uint8
Esmeraldas_x    uint8
Guayas          uint8
                ...  
25              uint8
26              uint8
27              uint8
28              uint8
29              uint8
30              uint8
31              uint8
32              uint8
33              uint8
34              uint8
35              uint8
36              uint8
37              uint8
38              uint8
39        