# Load dataframes

In [1]:
import os
import pickle

import pandas as pd
from IPython.display import display

# Show every column when a dataframe is displayed (None means "no limit").
pd.options.display.max_columns = None

## Load data

In [2]:
# Environment settings: input/output directories, relative to the working
# directory. Both keep their trailing slash because file names are appended
# to them by plain string concatenation.
data_path_in = "Data/input/"
data_path_out = "Data/output/"

# Column name -> dtype mapping handed to pd.read_csv. Integer/float widths and
# 'category' are spelled out explicitly instead of being left to inference.
# Keys are written exactly as they appear in the CSV header (note the
# irregular capitalisation of 'Min_VisibilitykM').
data_type = {
    # --- store / calendar fields ---
    "StoreID": "uint16",
    "Date": str,  # converted to datetime by parse_dates when the CSV is read
    "IsHoliday": bool,
    "IsOpen": bool,
    "HasPromotions": bool,
    "StoreType": "category",
    "AssortmentType": "category",
    "NearestCompetitor": "uint32",
    "Region": "category",
    # --- target columns (present in the train set only) ---
    "NumberOfCustomers": "uint16",
    "NumberOfSales": "uint16",
    # --- region descriptors ---
    "Region_AreaKM2": "uint16",
    "Region_GDP": "uint16",
    "Region_PopulationK": "uint16",
    # --- weather fields ---
    # Columns tagged "uint8 candidate" are read as float32 instead;
    # presumably because they contain missing values (the gust-speed column
    # shows NaN in the displayed head) -- TODO confirm for the visibility ones.
    "CloudCover": "float32",
    "Events": "category",
    "Max_Dew_PointC": "int8",
    "Max_Gust_SpeedKm_h": "float32",  # uint8 candidate
    "Max_Humidity": "uint8",
    "Max_Sea_Level_PressurehPa": "uint16",
    "Max_TemperatureC": "int8",
    "Max_VisibilityKm": "float32",  # uint8 candidate
    "Max_Wind_SpeedKm_h": "uint8",
    "Mean_Dew_PointC": "int8",
    "Mean_Humidity": "uint8",
    "Mean_Sea_Level_PressurehPa": "uint16",
    "Mean_TemperatureC": "int8",
    "Mean_VisibilityKm": "float32",  # uint8 candidate
    "Mean_Wind_SpeedKm_h": "uint8",
    "Min_Dew_PointC": "int8",
    "Min_Humidity": "uint8",
    "Min_Sea_Level_PressurehPa": "uint16",
    "Min_TemperatureC": "int8",
    "Min_VisibilitykM": "float32",  # uint8 candidate
    "Precipitationmm": "float32",
    "WindDirDegrees": "int16",
}

# Read the train and test CSV files into `df`, keyed by dataset name.
# Every file is read with the same options: explicit dtypes from `data_type`
# and a "Date" column parsed as datetime with day-first interpretation.
read_options = dict(dtype=data_type, parse_dates=["Date"], dayfirst=True)

df = {}
for ds_name in ('train', 'test'):
    csv_file = data_path_in + ds_name + '.csv'
    df[ds_name] = pd.read_csv(csv_file, **read_options)

In [3]:
# Combine the train & test features into a single frame.
# The two target columns exist only in the train set, so they are dropped
# before concatenating.
features = pd.concat(
    [
        df['train'].drop(columns=['NumberOfCustomers', 'NumberOfSales']),
        df['test'],
    ],
    ignore_index=True,  # fresh 0..n-1 index without adding an 'index' column
)

# Restore the original column order (the order of the test set).
features = features[list(df['test'].columns)]

# pd.concat falls back to object dtype when the categories of the two frames
# differ, so re-cast every column that is categorical in the source frames
# (not just 'Events'). Columns that are still categorical are left unchanged.
categorical_cols = df['test'].select_dtypes(include='category').columns
df['all'] = features.astype({col: 'category' for col in categorical_cols})

In [4]:
# Serialize dataframes for later use in the preprocessing notebook.
# Each frame is pickled to '<data_path_out><name>_dv.obj'.
# open() does not create missing directories, so make sure the output
# directory exists before writing.
os.makedirs(data_path_out, exist_ok=True)
for d_name, d in df.items():
    with open(data_path_out + d_name + '_dv.obj', 'wb') as pickle_file:
        pickle.dump(d, pickle_file)

## Show data

In [5]:
# Print the (rows, columns) shape of every dataframe held in `df`.
shape_line = '{} set has shape {}\n'
for name, frame in df.items():
    print(shape_line.format(name, frame.shape))

train set has shape (523021, 36)

test set has shape (45689, 34)

all set has shape (568710, 34)



In [6]:
# Show the first rows of every dataframe, each preceded by its name.
# print() emits the plain-text label; display() renders the frame richly.
for d_name, d in df.items():
    print('{} dataframe:\n'.format(d_name))
    display(d.head())

train dataframe:



Unnamed: 0,StoreID,Date,IsHoliday,IsOpen,HasPromotions,StoreType,AssortmentType,NearestCompetitor,Region,NumberOfCustomers,NumberOfSales,Region_AreaKM2,Region_GDP,Region_PopulationK,CloudCover,Events,Max_Dew_PointC,Max_Gust_SpeedKm_h,Max_Humidity,Max_Sea_Level_PressurehPa,Max_TemperatureC,Max_VisibilityKm,Max_Wind_SpeedKm_h,Mean_Dew_PointC,Mean_Humidity,Mean_Sea_Level_PressurehPa,Mean_TemperatureC,Mean_VisibilityKm,Mean_Wind_SpeedKm_h,Min_Dew_PointC,Min_Humidity,Min_Sea_Level_PressurehPa,Min_TemperatureC,Min_VisibilitykM,Precipitationmm,WindDirDegrees
0,1000,2016-03-01,False,True,False,Hyper Market,General,326,7,495,5676,9643,17130,2770,8.0,Rain-Snow,1,,100,1032,2,19.0,21,-1,82,1030,1,11.0,16,-2,70,1029,1,6.0,0.0,23
1,1000,2016-03-02,False,True,False,Hyper Market,General,326,7,608,8111,9643,17130,2770,8.0,Snow,0,,87,1030,5,23.0,16,-1,73,1027,3,13.0,10,-2,58,1025,1,10.0,0.0,56
2,1000,2016-03-04,False,True,False,Hyper Market,General,326,7,665,8300,9643,17130,2770,8.0,Rain,0,,81,1026,4,31.0,23,-1,71,1024,3,11.0,10,-3,55,1023,2,8.0,0.0,22
3,1000,2016-03-05,False,True,False,Hyper Market,General,326,7,630,7154,9643,17130,2770,6.0,,-3,,80,1027,8,31.0,19,-4,56,1024,3,15.0,10,-6,25,1022,-1,10.0,0.0,108
4,1000,2016-03-06,False,False,False,Hyper Market,General,326,7,0,0,9643,17130,2770,6.0,,0,,93,1025,7,31.0,16,-3,75,1023,1,12.0,5,-6,48,1022,-5,5.0,0.0,46


test dataframe:



Unnamed: 0,StoreID,Date,IsHoliday,IsOpen,HasPromotions,StoreType,AssortmentType,NearestCompetitor,Region,Region_AreaKM2,Region_GDP,Region_PopulationK,CloudCover,Events,Max_Dew_PointC,Max_Gust_SpeedKm_h,Max_Humidity,Max_Sea_Level_PressurehPa,Max_TemperatureC,Max_VisibilityKm,Max_Wind_SpeedKm_h,Mean_Dew_PointC,Mean_Humidity,Mean_Sea_Level_PressurehPa,Mean_TemperatureC,Mean_VisibilityKm,Mean_Wind_SpeedKm_h,Min_Dew_PointC,Min_Humidity,Min_Sea_Level_PressurehPa,Min_TemperatureC,Min_VisibilitykM,Precipitationmm,WindDirDegrees
0,1000,2018-03-01,False,True,False,Hyper Market,General,326,7,9643,17130,2770,6.0,Rain,3,,95,1022,9,31.0,18,2,75,1019,6,14.0,5,-1,46,1011,2,10.0,0.0,180
1,1000,2018-03-02,False,True,False,Hyper Market,General,326,7,9643,17130,2770,7.0,Rain,4,,99,1021,5,31.0,19,2,83,1013,4,12.0,6,-1,52,1009,3,10.0,5.08,315
2,1000,2018-03-03,False,True,False,Hyper Market,General,326,7,9643,17130,2770,2.0,Fog-Rain,2,,100,1023,8,31.0,18,-1,74,1020,3,13.0,11,-3,41,1013,-2,10.0,0.0,210
3,1000,2018-03-04,False,False,False,Hyper Market,General,326,7,9643,17130,2770,6.0,Rain,7,,97,1014,10,31.0,29,4,83,1007,6,11.0,18,-2,65,1002,1,6.0,3.05,193
4,1000,2018-03-05,False,True,True,Hyper Market,General,326,7,9643,17130,2770,4.0,Rain-Snow,7,61.0,87,1018,10,26.0,42,1,65,1007,6,10.0,23,-5,25,1000,2,8.0,0.25,247


all dataframe:



Unnamed: 0,StoreID,Date,IsHoliday,IsOpen,HasPromotions,StoreType,AssortmentType,NearestCompetitor,Region,Region_AreaKM2,Region_GDP,Region_PopulationK,CloudCover,Events,Max_Dew_PointC,Max_Gust_SpeedKm_h,Max_Humidity,Max_Sea_Level_PressurehPa,Max_TemperatureC,Max_VisibilityKm,Max_Wind_SpeedKm_h,Mean_Dew_PointC,Mean_Humidity,Mean_Sea_Level_PressurehPa,Mean_TemperatureC,Mean_VisibilityKm,Mean_Wind_SpeedKm_h,Min_Dew_PointC,Min_Humidity,Min_Sea_Level_PressurehPa,Min_TemperatureC,Min_VisibilitykM,Precipitationmm,WindDirDegrees
0,1000,2016-03-01,False,True,False,Hyper Market,General,326,7,9643,17130,2770,8.0,Rain-Snow,1,,100,1032,2,19.0,21,-1,82,1030,1,11.0,16,-2,70,1029,1,6.0,0.0,23
1,1000,2016-03-02,False,True,False,Hyper Market,General,326,7,9643,17130,2770,8.0,Snow,0,,87,1030,5,23.0,16,-1,73,1027,3,13.0,10,-2,58,1025,1,10.0,0.0,56
2,1000,2016-03-04,False,True,False,Hyper Market,General,326,7,9643,17130,2770,8.0,Rain,0,,81,1026,4,31.0,23,-1,71,1024,3,11.0,10,-3,55,1023,2,8.0,0.0,22
3,1000,2016-03-05,False,True,False,Hyper Market,General,326,7,9643,17130,2770,6.0,,-3,,80,1027,8,31.0,19,-4,56,1024,3,15.0,10,-6,25,1022,-1,10.0,0.0,108
4,1000,2016-03-06,False,False,False,Hyper Market,General,326,7,9643,17130,2770,6.0,,0,,93,1025,7,31.0,16,-3,75,1023,1,12.0,5,-6,48,1022,-5,5.0,0.0,46
