In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sweetviz
from sklearn.preprocessing import OrdinalEncoder

In [2]:
train_df = pd.read_csv("rossmann-store-sales/train.csv", low_memory=False)
test_df = pd.read_csv("rossmann-store-sales/test.csv", low_memory=False)
store_df = pd.read_csv("rossmann-store-sales/store.csv", low_memory=False)

In [3]:
train_df.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,5263,555,1,1,0,1
1,2,5,2015-07-31,6064,625,1,1,0,1
2,3,5,2015-07-31,8314,821,1,1,0,1
3,4,5,2015-07-31,13995,1498,1,1,0,1
4,5,5,2015-07-31,4822,559,1,1,0,1


In [4]:
test_df.head()

Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday
0,1,1,4,2015-09-17,1.0,1,0,0
1,2,3,4,2015-09-17,1.0,1,0,0
2,3,7,4,2015-09-17,1.0,1,0,0
3,4,8,4,2015-09-17,1.0,1,0,0
4,5,9,4,2015-09-17,1.0,1,0,0


In [5]:
store_df.head()

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,c,a,1270.0,9.0,2008.0,0,,,
1,2,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,c,c,620.0,9.0,2009.0,0,,,
4,5,a,a,29910.0,4.0,2015.0,0,,,


## Dataset analysis
### Level of data:
* train_df: Store-Date
* test_df: Store-Date
* store_df: Store

In [19]:
print("Number of records in train_df: ", len(train_df))
print("Number of records in test_df: ", len(test_df))
print("Number of records in store_df: ", len(store_df))

print("Number of unique stores: ", train_df['Store'].nunique())
print("Number of unique ids (test set): ", test_df['Id'].nunique())

Number of records in train_df:  1017209
Number of records in test_df:  41088
Number of records in store_df:  1115
Number of unique stores:  1115
Number of unique ids (test set):  41088


## Merge Dataset

In [20]:
train_data = pd.merge(train_df, store_df, how='inner', on='Store')
test_data = pd.merge(test_df, store_df, how='inner', on='Store')
print(len(train_data))
print(len(test_data))

1017209
41088


In [92]:
print(train_data.shape)
print(test_data.shape)

(1017209, 18)
(41088, 17)


In [83]:
my_report = sweetviz.analyze([train_data, "train"], target_feat="Sales")

                                             |      | [  0%]   00:00 -> (? left)

In [84]:
my_report.show_html()

Report SWEETVIZ_REPORT.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


## Testing Entity Embedding model

In [191]:
import Cat2Emb as c2e
from sklearn.model_selection import train_test_split
import imp
imp.reload(c2e)

<module 'Cat2Emb' from '/Users/suvendukumarpati/Documents/Cat2Emb/Cat2Emb.py'>

In [65]:
X_train, y_train =train_data.drop(['Sales'],axis=1), train_data['Sales']
print(X_train.shape)
print(y_train.shape)

(1017209, 17)
(1017209,)


In [67]:
X_train['Store'] = X_train['Store'].astype('object')
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1017209 entries, 0 to 1017208
Data columns (total 17 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   Store                      1017209 non-null  object 
 1   DayOfWeek                  1017209 non-null  int64  
 2   Date                       1017209 non-null  object 
 3   Customers                  1017209 non-null  int64  
 4   Open                       1017209 non-null  int64  
 5   Promo                      1017209 non-null  int64  
 6   StateHoliday               1017209 non-null  object 
 7   SchoolHoliday              1017209 non-null  int64  
 8   StoreType                  1017209 non-null  object 
 9   Assortment                 1017209 non-null  object 
 10  CompetitionDistance        1014567 non-null  float64
 11  CompetitionOpenSinceMonth  693861 non-null   float64
 12  CompetitionOpenSinceYear   693861 non-null   float64
 13  Promo2      

In [195]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41088 entries, 0 to 41087
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Id                         41088 non-null  int64  
 1   Store                      41088 non-null  int64  
 2   DayOfWeek                  41088 non-null  int64  
 3   Date                       41088 non-null  object 
 4   Open                       41077 non-null  float64
 5   Promo                      41088 non-null  int64  
 6   StateHoliday               41088 non-null  object 
 7   SchoolHoliday              41088 non-null  int64  
 8   StoreType                  41088 non-null  object 
 9   Assortment                 41088 non-null  object 
 10  CompetitionDistance        40992 non-null  float64
 11  CompetitionOpenSinceMonth  25872 non-null  float64
 12  CompetitionOpenSinceYear   25872 non-null  float64
 13  Promo2                     41088 non-null  int

In [196]:
var = ['Store', 'Open', 'StoreType']

In [197]:
X_train[var].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1017209 entries, 0 to 1017208
Data columns (total 3 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   Store      1017209 non-null  object
 1   Open       1017209 non-null  int64 
 2   StoreType  1017209 non-null  object
dtypes: int64(1), object(2)
memory usage: 31.0+ MB


In [198]:
embed_model = c2e.EmbeddingGenerator()
embed_model.fit(X_train[var], y_train)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<Cat2Emb.EmbeddingGenerator at 0x369764400>

In [199]:
embed_model.model.summary()

Model: "model_24"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_Store (InputLayer)    [(None, 1)]                  0         []                            
                                                                                                  
 input_StoreType (InputLaye  [(None, 1)]                  0         []                            
 r)                                                                                               
                                                                                                  
 Store (Embedding)           (None, 1, 50)                55750     ['input_Store[0][0]']         
                                                                                                  
 StoreType (Embedding)       (None, 1, 2)                 8         ['input_StoreType[0][0]

In [200]:
X_train_embeddings = embed_model.transform(X_train[var])
X_train_embeddings.head()

Unnamed: 0,Store,Open,StoreType,Store_0,Store_1,Store_2,Store_3,Store_4,Store_5,Store_6,...,Store_42,Store_43,Store_44,Store_45,Store_46,Store_47,Store_48,Store_49,StoreType_0,StoreType_1
0,1,1,c,-0.348358,-0.399614,-0.410592,-0.442974,0.128933,0.198573,-0.219338,...,-0.106178,0.351475,0.025819,0.424753,-0.40875,0.048423,0.325802,0.269499,0.766109,-0.724569
1,1,1,c,-0.348358,-0.399614,-0.410592,-0.442974,0.128933,0.198573,-0.219338,...,-0.106178,0.351475,0.025819,0.424753,-0.40875,0.048423,0.325802,0.269499,0.766109,-0.724569
2,1,1,c,-0.348358,-0.399614,-0.410592,-0.442974,0.128933,0.198573,-0.219338,...,-0.106178,0.351475,0.025819,0.424753,-0.40875,0.048423,0.325802,0.269499,0.766109,-0.724569
3,1,1,c,-0.348358,-0.399614,-0.410592,-0.442974,0.128933,0.198573,-0.219338,...,-0.106178,0.351475,0.025819,0.424753,-0.40875,0.048423,0.325802,0.269499,0.766109,-0.724569
4,1,1,c,-0.348358,-0.399614,-0.410592,-0.442974,0.128933,0.198573,-0.219338,...,-0.106178,0.351475,0.025819,0.424753,-0.40875,0.048423,0.325802,0.269499,0.766109,-0.724569


In [207]:
test_data['Store'] = test_data['Store'].astype('category')

In [208]:
embed_model.predict(test_data[var])



array([[4746.175  ],
       [4746.175  ],
       [4746.175  ],
       ...,
       [7128.281  ],
       [ -21.17331],
       [7128.281  ]], dtype=float32)

In [209]:
X_test_embeddings = embed_model.transform(test_data[var])

In [210]:
X_test_embeddings

Unnamed: 0,Store,Open,StoreType,Store_0,Store_1,Store_2,Store_3,Store_4,Store_5,Store_6,...,Store_42,Store_43,Store_44,Store_45,Store_46,Store_47,Store_48,Store_49,StoreType_0,StoreType_1
0,1,1.0,c,-0.348358,-0.399614,-0.410592,-0.442974,0.128933,0.198573,-0.219338,...,-0.106178,0.351475,0.025819,0.424753,-0.408750,0.048423,0.325802,0.269499,0.766109,-0.724569
1,1,1.0,c,-0.348358,-0.399614,-0.410592,-0.442974,0.128933,0.198573,-0.219338,...,-0.106178,0.351475,0.025819,0.424753,-0.408750,0.048423,0.325802,0.269499,0.766109,-0.724569
2,1,1.0,c,-0.348358,-0.399614,-0.410592,-0.442974,0.128933,0.198573,-0.219338,...,-0.106178,0.351475,0.025819,0.424753,-0.408750,0.048423,0.325802,0.269499,0.766109,-0.724569
3,1,1.0,c,-0.348358,-0.399614,-0.410592,-0.442974,0.128933,0.198573,-0.219338,...,-0.106178,0.351475,0.025819,0.424753,-0.408750,0.048423,0.325802,0.269499,0.766109,-0.724569
4,1,0.0,c,-0.348358,-0.399614,-0.410592,-0.442974,0.128933,0.198573,-0.219338,...,-0.106178,0.351475,0.025819,0.424753,-0.408750,0.048423,0.325802,0.269499,0.766109,-0.724569
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41083,1097,1.0,b,-0.411707,0.335884,0.351457,0.393117,-0.122089,0.458850,-0.012583,...,-0.076637,-0.336875,0.060815,-0.333384,0.333252,-0.008790,-0.367509,-0.029743,1.396022,-1.439238
41084,1097,1.0,b,-0.411707,0.335884,0.351457,0.393117,-0.122089,0.458850,-0.012583,...,-0.076637,-0.336875,0.060815,-0.333384,0.333252,-0.008790,-0.367509,-0.029743,1.396022,-1.439238
41085,1097,1.0,b,-0.411707,0.335884,0.351457,0.393117,-0.122089,0.458850,-0.012583,...,-0.076637,-0.336875,0.060815,-0.333384,0.333252,-0.008790,-0.367509,-0.029743,1.396022,-1.439238
41086,1097,1.0,b,-0.411707,0.335884,0.351457,0.393117,-0.122089,0.458850,-0.012583,...,-0.076637,-0.336875,0.060815,-0.333384,0.333252,-0.008790,-0.367509,-0.029743,1.396022,-1.439238


In [204]:
embed_model.predict(X_train[var])



array([[ 4.7461748e+03],
       [ 4.7461748e+03],
       [ 4.7461748e+03],
       ...,
       [ 6.3444580e+03],
       [ 6.3444580e+03],
       [-2.9230194e+00]], dtype=float32)

In [205]:
embed_model.predict(test_data[var])

ValueError: in user code:

    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/keras/src/engine/training.py", line 2440, in predict_function  *
        return step_function(self, iterator)
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/keras/src/engine/training.py", line 2425, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/keras/src/engine/training.py", line 2413, in run_step  **
        outputs = model.predict_step(data)
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/keras/src/engine/training.py", line 2381, in predict_step
        return self(x, training=False)
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/keras/src/engine/input_spec.py", line 219, in assert_input_compatibility
        raise ValueError(

    ValueError: Layer "model_24" expects 3 input(s), but it received 2 input tensors. Inputs received: [<tf.Tensor 'IteratorGetNext:0' shape=(32, 2) dtype=float32>, <tf.Tensor 'IteratorGetNext:1' shape=(32,) dtype=int32>]


In [206]:
test_data[var]

Unnamed: 0,Store,Open,StoreType
0,1,1.0,c
1,1,1.0,c
2,1,1.0,c
3,1,1.0,c
4,1,0.0,c
...,...,...,...
41083,1115,1.0,d
41084,1115,1.0,d
41085,1115,1.0,d
41086,1115,0.0,d
