In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# To fix transactions to add stores with no previous values as 0 values
import janitor

In [2]:
test = pd.read_csv('test.csv', parse_dates=['date'])
sample = pd.read_csv('sample_submission.csv')
train = pd.read_csv('train.csv', parse_dates=['date'])
holidays = pd.read_csv('holidays_events.csv', parse_dates=['date'])
oil = pd.read_csv('oil.csv', parse_dates=['date'])
stores = pd.read_csv('stores.csv')
transactions = pd.read_csv('transactions.csv', parse_dates=['date'])

To set up date ranges for the train and test dataframes. 

In [7]:
train_start = train.date.min().date()
train_end = train.date.max().date()
num_train_date = train.date.nunique()
train_len = (train_end - train_start).days + 1
test_start = test.date.min().date()
test_end = test.date.max().date()
num_test_date = test.date.nunique()

To use the janitor function to add 0 values for transactions for stores that don't have any transactions listed on a particular date. This will make it possible to merge transactions into train_test dataframe.

In [3]:
transactions = transactions.complete('date', 'store_nbr').fillna(0, downcast='infer')

Because grocery stores open on weekends as well, there is a need to estimate the missing oil prices if they are to be used in the models. For simplicity, the missing values in the oil column are filled using linear interpolation, where the estimated value is assumed to lie on the line joining the nearest values to the left and right.

In [9]:
oil = oil.merge(
    pd.DataFrame({"date": pd.date_range(train_start, test_end)}),
    on="date",
    how="outer",
).sort_values("date", ignore_index=True)

oil = oil.interpolate(method="pad", limit_direction="forward")

To remove the holidays where Transfer is True, since they are not really actual celebrated holidays

In [11]:
holidays = holidays[holidays.transferred.eq(False)]

To change the oil NaN to 0.

In [13]:
oil = oil.fillna(0)

To merge train and test just for initial data manipulation.

In [15]:
train_test = pd.concat([train, test], ignore_index=True)

To create a day of the week column for the dataframe and convert to numbers. 

In [17]:
train_test['day_of_week'] = train_test['date'].dt.day_name()
mapping = {
    'Sunday': 7,
    'Monday': 1,
    'Tuesday': 2,
    'Wednesday': 3,
    'Thursday' : 4,
    'Friday' : 5,
    'Saturday' : 6
}
train_test['day_of_week'] = train_test['day_of_week'].replace(mapping)

To create a weekend column to denote whether the day of the week is a weekend or not. 

In [18]:
train_test['weekend'] = pd.Series(0, index=train_test.index).mask(train_test['day_of_week']>5, 1)

To create a month number column. 

In [19]:
train_test['month'] = pd.DatetimeIndex(train_test['date']).month

To add the year and the day and week of year. 

In [20]:
train_test['day_of_year'] = train_test['date'].dt.dayofyear
train_test['week_of_year'] = train_test['date'].dt.isocalendar().week
train_test['year'] = train_test['date'].dt.year

To merge the oil values into the train_test dataframe. For some reason it added 3 additional rows on the bottom, so these were eliminated by looking for those rows where id doesn't exist. Also, for some reason it created NaNs for dates where there were no oil production, so those had to be eliminated. 

In [22]:
train_test = pd.merge(train_test, oil, on='date', how='outer')

In [23]:
train_test = train_test[train_test['id'].notna()]
train_test['dcoilwtico'].fillna(0)

0           0.00
1           0.00
2           0.00
3           0.00
4           0.00
           ...  
3029395    47.26
3029396    47.26
3029397    47.26
3029398    47.26
3029399    47.26
Name: dcoilwtico, Length: 3029400, dtype: float64

For some reason this outer join converted the integer values to floating values, so these needed to be reinstated. 

In [25]:
train_test['id']=train_test['id'].astype(np.int64)
train_test['store_nbr']=train_test['store_nbr'].astype(np.int64)
train_test['onpromotion']=train_test['onpromotion'].astype(np.int64)

To merge the stores dataframe into the train_test dataframe.

In [27]:
train_test = pd.merge(train_test, stores, on='store_nbr', how='outer')

To merge the transactions dataframe into the train_test dataframe on both date and store number since the same transactions need to be show by store by date for all the different product sales for each day and store. 

In [29]:
train_test = pd.merge(train_test, transactions, on=['date', 'store_nbr'], how='outer')

In [30]:
train_test['transactions'].fillna(0, inplace=True)

In planning how to merge the holidays dataframe into the train_test dataframe, the list of unique values was run. Then it appears there are 69 rows where the dates are duplicated. Finally, in preparation for the merge, the locale column name was changed to holiday. 

In [32]:
holidays[holidays.duplicated(['date'], keep=False)]

Unnamed: 0,date,type,locale,locale_name,description,transferred
7,2012-06-25,Holiday,Regional,Imbabura,Provincializacion de Imbabura,False
8,2012-06-25,Holiday,Local,Latacunga,Cantonizacion de Latacunga,False
9,2012-06-25,Holiday,Local,Machala,Fundacion de Machala,False
10,2012-07-03,Holiday,Local,Santo Domingo,Fundacion de Santo Domingo,False
11,2012-07-03,Holiday,Local,El Carmen,Cantonizacion de El Carmen,False
...,...,...,...,...,...,...
319,2017-07-03,Holiday,Local,Santo Domingo,Fundacion de Santo Domingo,False
341,2017-12-08,Holiday,Local,Loja,Fundacion de Loja,False
342,2017-12-08,Transfer,Local,Quito,Traslado Fundacion de Quito,False
344,2017-12-22,Holiday,Local,Salinas,Cantonizacion de Salinas,False


In [33]:
holidays = holidays.rename(columns={'locale': 'holiday'})

It might be easier to look at creating separate dataframes for National, Local and Regional holidays to better plan the merge. 

In [34]:
holidays_national = holidays[holidays.holiday == 'National']

In [36]:
holidays_national[holidays_national.duplicated(['date'], keep=False)]

Unnamed: 0,date,type,holiday,locale_name,description,transferred
35,2012-12-24,Bridge,National,Ecuador,Puente Navidad,False
36,2012-12-24,Additional,National,Ecuador,Navidad-1,False
39,2012-12-31,Bridge,National,Ecuador,Puente Primer dia del ano,False
40,2012-12-31,Additional,National,Ecuador,Primer dia del ano-1,False
156,2014-12-26,Bridge,National,Ecuador,Puente Navidad,False
157,2014-12-26,Additional,National,Ecuador,Navidad+1,False
235,2016-05-01,Holiday,National,Ecuador,Dia del Trabajo,False
236,2016-05-01,Event,National,Ecuador,Terremoto Manabi+15,False
242,2016-05-07,Additional,National,Ecuador,Dia de la Madre-1,False
243,2016-05-07,Event,National,Ecuador,Terremoto Manabi+21,False


Since it would be ideal not to add any additional rows in the train_test dataframe as a result of the merge, duplicate dates were removed. 

In [37]:
holidays_national = holidays_national.drop_duplicates(subset=['date'], keep='first')

For further preparation for the merge, type was changed to national_day and all the other columns except date were dropped. 

In [40]:
holidays_national = holidays_national.rename(columns={'type': 'national_day'})

In [41]:
holidays_national['national_day'] = np.where(holidays_national['national_day']=='None', 0, 1)

In [42]:
holidays_national.drop(['holiday', 'locale_name', 'description', 'transferred'], axis=1, inplace=True)

In [44]:
train_test = pd.merge(train_test, holidays_national, on='date', how='outer')

In [46]:
train_test = train_test[train_test['id'].notna()]

To clean up the formatting of the float variables. 

In [49]:
pd.options.mode.chained_assignment = None 

In [50]:
train_test['id']=train_test['id'].astype(np.int64)
train_test['store_nbr']=train_test['store_nbr'].astype(np.int64)
train_test['onpromotion']=train_test['onpromotion'].astype(np.int64)
train_test['cluster']=train_test['cluster'].astype(np.int64)
train_test['day_of_week']=train_test['day_of_week'].astype(np.int64)
train_test['weekend']=train_test['weekend'].astype(np.int64)
train_test['month']=train_test['month'].astype(np.int64)
train_test['day_of_year']=train_test['day_of_year'].astype(np.int64)
train_test['week_of_year']=train_test['week_of_year'].astype(np.int64)
train_test['year']=train_test['year'].astype(np.int64)

Now a new regional holidays dataframe is created. There are no duplicate dates in this dataframe. 

In [52]:
holidays_regional = holidays[holidays.holiday == 'Regional']

In [54]:
holidays_regional = holidays_regional.rename(columns={'type': 'regional_day', 'locale_name': 'state'})

In [55]:
holidays_regional['regional_day'] = np.where(holidays_regional['regional_day']=='None', 0, 1)

In [57]:
holidays_regional.drop(['holiday', 'description', 'transferred'], axis=1, inplace=True)

There appear to be 6 dates that are in the holidays_regional dataframe but not in the train_test dataframe. These rows were then eliminated. 

In [61]:
dates_tt = set(train_test.date.unique())
dates_rh = set(holidays_regional.date.unique())

In [62]:
list(dates_rh-dates_tt)

[numpy.datetime64('2012-04-01T00:00:00.000000000'),
 numpy.datetime64('2012-06-25T00:00:00.000000000'),
 numpy.datetime64('2012-11-06T00:00:00.000000000'),
 numpy.datetime64('2012-11-07T00:00:00.000000000'),
 numpy.datetime64('2017-11-06T00:00:00.000000000'),
 numpy.datetime64('2017-11-07T00:00:00.000000000')]

In [63]:
regional_dates = list(dates_rh-dates_tt)

In [64]:
holidays_regional = holidays_regional[~holidays_regional.date.isin(regional_dates)]

In [347]:
holidays_regional

Unnamed: 0,date,regional_day,state
47,2013-04-01,1,Cotopaxi
58,2013-06-25,1,Imbabura
76,2013-11-06,1,Santo Domingo de los Tsachilas
77,2013-11-07,1,Santa Elena
96,2014-04-01,1,Cotopaxi
112,2014-06-25,1,Imbabura
139,2014-11-06,1,Santo Domingo de los Tsachilas
140,2014-11-07,1,Santa Elena
165,2015-04-01,1,Cotopaxi
177,2015-06-25,1,Imbabura


To merge based on date and then state. 

In [65]:
train_test = pd.merge(train_test, holidays_regional, on=['date', 'state'], how='outer')

The last id number above doesn't match the test dataframe, so train_test was resorted to ensure the last values matched the test dataframe. 

In [71]:
train_test.sort_values('id')

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,day_of_week,weekend,month,day_of_year,week_of_year,year,dcoilwtico,city,state,type,cluster,transactions,national_day,regional_day
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,2,0,1,1,1,2013,0.00,Quito,Pichincha,D,13,0.0,1.0,
1,1,2013-01-01,1,BABY CARE,0.0,0,2,0,1,1,1,2013,0.00,Quito,Pichincha,D,13,0.0,1.0,
2,2,2013-01-01,1,BEAUTY,0.0,0,2,0,1,1,1,2013,0.00,Quito,Pichincha,D,13,0.0,1.0,
3,3,2013-01-01,1,BEVERAGES,0.0,0,2,0,1,1,1,2013,0.00,Quito,Pichincha,D,13,0.0,1.0,
4,4,2013-01-01,1,BOOKS,0.0,0,2,0,1,1,1,2013,0.00,Quito,Pichincha,D,13,0.0,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3028240,3029395,2017-08-31,9,POULTRY,,1,4,0,8,243,35,2017,47.26,Quito,Pichincha,B,6,0.0,,
3028241,3029396,2017-08-31,9,PREPARED FOODS,,0,4,0,8,243,35,2017,47.26,Quito,Pichincha,B,6,0.0,,
3028242,3029397,2017-08-31,9,PRODUCE,,1,4,0,8,243,35,2017,47.26,Quito,Pichincha,B,6,0.0,,
3028243,3029398,2017-08-31,9,SCHOOL AND OFFICE SUPPLIES,,9,4,0,8,243,35,2017,47.26,Quito,Pichincha,B,6,0.0,,


A new local holidays dataframe is created. 

In [72]:
holidays_local = holidays[holidays.holiday == 'Local']

To explore local holidays with similar dates. All of these have separate cities except for 2016-07-24 for Guayaquil. That could cause a problem in merging into the train_test dataframe since their won't be a unique value each for date and city. So the 'Transfer' locale_day for that date will be deleted. 

In [74]:
holidays_local[holidays_local.duplicated(['date'], keep=False)]

Unnamed: 0,date,type,holiday,locale_name,description,transferred
8,2012-06-25,Holiday,Local,Latacunga,Cantonizacion de Latacunga,False
9,2012-06-25,Holiday,Local,Machala,Fundacion de Machala,False
10,2012-07-03,Holiday,Local,Santo Domingo,Fundacion de Santo Domingo,False
11,2012-07-03,Holiday,Local,El Carmen,Cantonizacion de El Carmen,False
59,2013-06-25,Holiday,Local,Machala,Fundacion de Machala,False
60,2013-06-25,Holiday,Local,Latacunga,Cantonizacion de Latacunga,False
61,2013-07-03,Holiday,Local,El Carmen,Cantonizacion de El Carmen,False
62,2013-07-03,Holiday,Local,Santo Domingo,Fundacion de Santo Domingo,False
110,2014-06-25,Holiday,Local,Latacunga,Cantonizacion de Latacunga,False
111,2014-06-25,Holiday,Local,Machala,Fundacion de Machala,False


In [75]:
holidays_local = holidays_local.drop(holidays_local[(holidays_local['locale_name'] == 'Guayaquil') & 
                                                    (holidays_local['type'] == 'Transfer')].index)

To set up the holidays_local dataframe for merging into train_test. 

In [77]:
holidays_local = holidays_local.rename(columns={'type': 'local_day', 'locale_name': 'city'})

In [78]:
holidays_local['local_day'] = np.where(holidays_local['local_day']=='None', 0, 1)

In [79]:
holidays_local.drop(['holiday', 'description', 'transferred'], axis=1, inplace=True)

In [80]:
holidays_local

Unnamed: 0,date,local_day,city
0,2012-03-02,1,Manta
2,2012-04-12,1,Cuenca
3,2012-04-14,1,Libertad
4,2012-04-21,1,Riobamba
5,2012-05-12,1,Puyo
...,...,...,...
338,2017-11-12,1,Ambato
339,2017-12-05,1,Quito
341,2017-12-08,1,Loja
342,2017-12-08,1,Quito


There are a number of dates in holidays_local that are not in train_test. These rows are eliminated. 

In [81]:
dates_tt = set(train_test.date.unique())
dates_lh = set(holidays_local.date.unique())

In [82]:
local_dates = list(dates_lh-dates_tt)

In [84]:
holidays_local = holidays_local[~holidays_local.date.isin(local_dates)]

To create a new train_test_local dataframe with the merger of the train_test and holidays_local dataframes on city and date. 

In [86]:
train_test = pd.merge(train_test, holidays_local, on=['city', 'date'], how='outer')

To sort the new dataframe to ensure the last id's match the original dataframe id's. 

In [88]:
train_test.sort_values('id')

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,day_of_week,weekend,month,day_of_year,...,year,dcoilwtico,city,state,type,cluster,transactions,national_day,regional_day,local_day
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,2,0,1,1,...,2013,0.00,Quito,Pichincha,D,13,0.0,1.0,,
1,1,2013-01-01,1,BABY CARE,0.0,0,2,0,1,1,...,2013,0.00,Quito,Pichincha,D,13,0.0,1.0,,
2,2,2013-01-01,1,BEAUTY,0.0,0,2,0,1,1,...,2013,0.00,Quito,Pichincha,D,13,0.0,1.0,,
3,3,2013-01-01,1,BEVERAGES,0.0,0,2,0,1,1,...,2013,0.00,Quito,Pichincha,D,13,0.0,1.0,,
4,4,2013-01-01,1,BOOKS,0.0,0,2,0,1,1,...,2013,0.00,Quito,Pichincha,D,13,0.0,1.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3028207,3029395,2017-08-31,9,POULTRY,,1,4,0,8,243,...,2017,47.26,Quito,Pichincha,B,6,0.0,,,
3028208,3029396,2017-08-31,9,PREPARED FOODS,,0,4,0,8,243,...,2017,47.26,Quito,Pichincha,B,6,0.0,,,
3028209,3029397,2017-08-31,9,PRODUCE,,1,4,0,8,243,...,2017,47.26,Quito,Pichincha,B,6,0.0,,,
3028210,3029398,2017-08-31,9,SCHOOL AND OFFICE SUPPLIES,,9,4,0,8,243,...,2017,47.26,Quito,Pichincha,B,6,0.0,,,


The last thing to do is replace the NaNs in the 3 new holidays columns with 'None'. 

In [89]:
train_test['national_day'].fillna(0, inplace=True)
train_test['regional_day'].fillna(0, inplace=True)
train_test['local_day'].fillna(0, inplace=True)

In [99]:
train_test

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,day_of_week,weekend,month,day_of_year,...,year,dcoilwtico,city,state,type,cluster,transactions,national_day,regional_day,local_day
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,2,0,1,1,...,2013,0.00,Quito,Pichincha,D,13,0.0,1.0,0.0,0.0
1,1,2013-01-01,1,BABY CARE,0.0,0,2,0,1,1,...,2013,0.00,Quito,Pichincha,D,13,0.0,1.0,0.0,0.0
2,2,2013-01-01,1,BEAUTY,0.0,0,2,0,1,1,...,2013,0.00,Quito,Pichincha,D,13,0.0,1.0,0.0,0.0
3,3,2013-01-01,1,BEVERAGES,0.0,0,2,0,1,1,...,2013,0.00,Quito,Pichincha,D,13,0.0,1.0,0.0,0.0
4,4,2013-01-01,1,BOOKS,0.0,0,2,0,1,1,...,2013,0.00,Quito,Pichincha,D,13,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3029395,3029263,2017-08-31,54,POULTRY,,0,4,0,8,243,...,2017,47.26,El Carmen,Manabi,C,3,0.0,0.0,0.0,0.0
3029396,3029264,2017-08-31,54,PREPARED FOODS,,0,4,0,8,243,...,2017,47.26,El Carmen,Manabi,C,3,0.0,0.0,0.0,0.0
3029397,3029265,2017-08-31,54,PRODUCE,,1,4,0,8,243,...,2017,47.26,El Carmen,Manabi,C,3,0.0,0.0,0.0,0.0
3029398,3029266,2017-08-31,54,SCHOOL AND OFFICE SUPPLIES,,0,4,0,8,243,...,2017,47.26,El Carmen,Manabi,C,3,0.0,0.0,0.0,0.0


In [100]:
train_test.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3029400 entries, 0 to 3029399
Data columns (total 21 columns):
 #   Column        Dtype         
---  ------        -----         
 0   id            int64         
 1   date          datetime64[ns]
 2   store_nbr     int64         
 3   family        object        
 4   sales         float64       
 5   onpromotion   int64         
 6   day_of_week   int64         
 7   weekend       int64         
 8   month         int64         
 9   day_of_year   int64         
 10  week_of_year  int64         
 11  year          int64         
 12  dcoilwtico    float64       
 13  city          object        
 14  state         object        
 15  type          object        
 16  cluster       int64         
 17  transactions  float64       
 18  national_day  float64       
 19  regional_day  float64       
 20  local_day     float64       
dtypes: datetime64[ns](1), float64(6), int64(10), object(4)
memory usage: 508.5+ MB


In [109]:
train_test_dummies = pd.get_dummies(data=train_test, columns=['store_nbr','cluster','type'], dtype=np.int64)

In [110]:
train_test_dummies.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3029400 entries, 0 to 3029399
Data columns (total 94 columns):
 #   Column        Dtype         
---  ------        -----         
 0   id            int64         
 1   date          datetime64[ns]
 2   family        object        
 3   sales         float64       
 4   onpromotion   int64         
 5   day_of_week   int64         
 6   weekend       int64         
 7   month         int64         
 8   day_of_year   int64         
 9   week_of_year  int64         
 10  year          int64         
 11  dcoilwtico    float64       
 12  city          object        
 13  state         object        
 14  transactions  float64       
 15  national_day  float64       
 16  regional_day  float64       
 17  local_day     float64       
 18  store_nbr_1   int64         
 19  store_nbr_2   int64         
 20  store_nbr_3   int64         
 21  store_nbr_4   int64         
 22  store_nbr_5   int64         
 23  store_nbr_6   int64         
 24

To split the train and test dataframes back out. 

In [95]:
train_features = train_test[train_test['sales'].notnull()].copy()
test_features = train_test[train_test['sales'].isnull()].drop(['sales'],axis=1)

In [96]:
train_features

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,day_of_week,weekend,month,day_of_year,...,year,dcoilwtico,city,state,type,cluster,transactions,national_day,regional_day,local_day
0,0,2013-01-01,1,AUTOMOTIVE,0.000,0,2,0,1,1,...,2013,0.00,Quito,Pichincha,D,13,0.0,1.0,0.0,0.0
1,1,2013-01-01,1,BABY CARE,0.000,0,2,0,1,1,...,2013,0.00,Quito,Pichincha,D,13,0.0,1.0,0.0,0.0
2,2,2013-01-01,1,BEAUTY,0.000,0,2,0,1,1,...,2013,0.00,Quito,Pichincha,D,13,0.0,1.0,0.0,0.0
3,3,2013-01-01,1,BEVERAGES,0.000,0,2,0,1,1,...,2013,0.00,Quito,Pichincha,D,13,0.0,1.0,0.0,0.0
4,4,2013-01-01,1,BOOKS,0.000,0,2,0,1,1,...,2013,0.00,Quito,Pichincha,D,13,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3000883,3000751,2017-08-15,54,POULTRY,59.619,0,2,0,8,227,...,2017,47.57,El Carmen,Manabi,C,3,802.0,0.0,0.0,0.0
3000884,3000752,2017-08-15,54,PREPARED FOODS,94.000,0,2,0,8,227,...,2017,47.57,El Carmen,Manabi,C,3,802.0,0.0,0.0,0.0
3000885,3000753,2017-08-15,54,PRODUCE,915.371,76,2,0,8,227,...,2017,47.57,El Carmen,Manabi,C,3,802.0,0.0,0.0,0.0
3000886,3000754,2017-08-15,54,SCHOOL AND OFFICE SUPPLIES,0.000,0,2,0,8,227,...,2017,47.57,El Carmen,Manabi,C,3,802.0,0.0,0.0,0.0


In [97]:
test_features

Unnamed: 0,id,date,store_nbr,family,onpromotion,day_of_week,weekend,month,day_of_year,week_of_year,year,dcoilwtico,city,state,type,cluster,transactions,national_day,regional_day,local_day
3000888,3000888,2017-08-16,1,AUTOMOTIVE,0,3,0,8,228,33,2017,46.80,Quito,Pichincha,D,13,0.0,0.0,0.0,0.0
3000889,3000889,2017-08-16,1,BABY CARE,0,3,0,8,228,33,2017,46.80,Quito,Pichincha,D,13,0.0,0.0,0.0,0.0
3000890,3000890,2017-08-16,1,BEAUTY,2,3,0,8,228,33,2017,46.80,Quito,Pichincha,D,13,0.0,0.0,0.0,0.0
3000891,3000891,2017-08-16,1,BEVERAGES,20,3,0,8,228,33,2017,46.80,Quito,Pichincha,D,13,0.0,0.0,0.0,0.0
3000892,3000892,2017-08-16,1,BOOKS,0,3,0,8,228,33,2017,46.80,Quito,Pichincha,D,13,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3029395,3029263,2017-08-31,54,POULTRY,0,4,0,8,243,35,2017,47.26,El Carmen,Manabi,C,3,0.0,0.0,0.0,0.0
3029396,3029264,2017-08-31,54,PREPARED FOODS,0,4,0,8,243,35,2017,47.26,El Carmen,Manabi,C,3,0.0,0.0,0.0,0.0
3029397,3029265,2017-08-31,54,PRODUCE,1,4,0,8,243,35,2017,47.26,El Carmen,Manabi,C,3,0.0,0.0,0.0,0.0
3029398,3029266,2017-08-31,54,SCHOOL AND OFFICE SUPPLIES,0,4,0,8,243,35,2017,47.26,El Carmen,Manabi,C,3,0.0,0.0,0.0,0.0


To save the train_features and test_features as new csv files that can be used in the next stage of the project. 

In [98]:
train_features.to_csv('train_features.csv', index=False)
test_features.to_csv('test_features.csv', index=False)