In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import missingno as msno
import calendar

In [2]:
events = pd.read_csv('./data/holidays_events.csv')

In [3]:
#train_with_oil_events = pd.merge(train_with_oil, events, on='date', how='left')
events.head()

Unnamed: 0,date,type,locale,locale_name,description,transferred
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta,False
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi,False
2,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,False
3,2012-04-14,Holiday,Local,Libertad,Cantonizacion de Libertad,False
4,2012-04-21,Holiday,Local,Riobamba,Cantonizacion de Riobamba,False


In [None]:
def getDayOfWeek(date):
    year, month, day = (int(x) for x in date.split('-'))
    ans = datetime.date(year, month, day)
#     print(ans.weekday(), calendar.day_name[ans.weekday()])
    return ans.weekday()

def getDayNameOfWeek(date):
    year, month, day = (int(x) for x in date.split('-'))
    ans = datetime.date(year, month, day)
#     print(ans.weekday(), calendar.day_name[ans.weekday()])
    return calendar.day_name[ans.weekday()]

getDayOfWeek('1976-09-17')

In [4]:
eventTypes = events['type'].unique()

In [5]:
locales = events['locale'].unique()

In [6]:
events['transferred'].unique()

array([False,  True], dtype=bool)

In [7]:
events['locale_name'].unique()

array(['Manta', 'Cotopaxi', 'Cuenca', 'Libertad', 'Riobamba', 'Puyo',
       'Guaranda', 'Imbabura', 'Latacunga', 'Machala', 'Santo Domingo',
       'El Carmen', 'Cayambe', 'Esmeraldas', 'Ecuador', 'Ambato', 'Ibarra',
       'Quevedo', 'Santo Domingo de los Tsachilas', 'Santa Elena', 'Quito',
       'Loja', 'Salinas', 'Guayaquil'], dtype=object)

In [8]:
#events['dayOfWeek'] = events['date'].map(getDayOfWeek)
# events['dayName'] = events['date'].map(getDayNameOfWeek)
daysOfWeek = pd.DataFrame(range(7))

In [9]:
events.head(10)

Unnamed: 0,date,type,locale,locale_name,description,transferred
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta,False
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi,False
2,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,False
3,2012-04-14,Holiday,Local,Libertad,Cantonizacion de Libertad,False
4,2012-04-21,Holiday,Local,Riobamba,Cantonizacion de Riobamba,False
5,2012-05-12,Holiday,Local,Puyo,Cantonizacion del Puyo,False
6,2012-06-23,Holiday,Local,Guaranda,Cantonizacion de Guaranda,False
7,2012-06-25,Holiday,Regional,Imbabura,Provincializacion de Imbabura,False
8,2012-06-25,Holiday,Local,Latacunga,Cantonizacion de Latacunga,False
9,2012-06-25,Holiday,Local,Machala,Fundacion de Machala,False


### Looks like we need hot encoding for type, locale, dayOfWeek. 
#### locale_name seems to map to either city or state
#### transferred is already boolean

In [10]:
eventTypes_encoded = pd.get_dummies(eventTypes)
eventTypes_encoded['type'] = eventTypes
eventTypes_encoded.to_hdf('./data/eventTypes_encoded', 'eventTypes_encoded',mode='w', format='table')
eventTypes_encoded.head()

Unnamed: 0,Additional,Bridge,Event,Holiday,Transfer,Work Day,type
0,0,0,0,1,0,0,Holiday
1,0,0,0,0,1,0,Transfer
2,1,0,0,0,0,0,Additional
3,0,1,0,0,0,0,Bridge
4,0,0,0,0,0,1,Work Day


In [11]:
locales_encoded = pd.get_dummies(locales)
locales_encoded['locale'] = locales
locales_encoded.to_hdf('./data/locales_encoded', 'locales_encoded',mode='w', format='table')
locales_encoded.head()

Unnamed: 0,Local,National,Regional,locale
0,1,0,0,Local
1,0,0,1,Regional
2,0,1,0,National


In [12]:
daysOfWeek_encoded = pd.get_dummies(daysOfWeek)
daysOfWeek_encoded['day_of_week'] = daysOfWeek
daysOfWeek_encoded.to_hdf('./data/daysOfWeek_encoded', 'daysOfWeek_encoded',mode='w', format='table')
daysOfWeek_encoded.head(10)

Unnamed: 0,0,day_of_week
0,0,0
1,1,1
2,2,2
3,3,3
4,4,4
5,5,5
6,6,6


In [13]:
months = pd.DataFrame(range(12))
month_encoded = pd.get_dummies(months)
month_encoded['month'] = months
month_encoded.to_hdf('./data/month_encoded', 'month_encoded',mode='w', format='table')
month_encoded.head(10)

Unnamed: 0,0,month
0,0,0
1,1,1
2,2,2
3,3,3
4,4,4
5,5,5
6,6,6
7,7,7
8,8,8
9,9,9


In [14]:
events_with_type = pd.merge(events, eventTypes_encoded, on=['type'])
events_with_type_locale = pd.merge(events_with_type, locales_encoded, on=['locale'])

events_with_type_locale.head()

Unnamed: 0,date,type,locale,locale_name,description,transferred,Additional,Bridge,Event,Holiday,Transfer,Work Day,Local,National,Regional
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta,False,0,0,0,1,0,0,1,0,0
1,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,False,0,0,0,1,0,0,1,0,0
2,2012-04-14,Holiday,Local,Libertad,Cantonizacion de Libertad,False,0,0,0,1,0,0,1,0,0
3,2012-04-21,Holiday,Local,Riobamba,Cantonizacion de Riobamba,False,0,0,0,1,0,0,1,0,0
4,2012-05-12,Holiday,Local,Puyo,Cantonizacion del Puyo,False,0,0,0,1,0,0,1,0,0


In [15]:
events_with_type_locale.drop(['type', 'locale'], axis = 1, inplace = True)

In [None]:
events_with_type_locale[events_with_type_locale['transferred'] == True]

In [None]:
events_with_type_locale[events_with_type_locale['transferred'] == False]

In [16]:
events_with_type_locale.head()

Unnamed: 0,date,locale_name,description,transferred,Additional,Bridge,Event,Holiday,Transfer,Work Day,Local,National,Regional
0,2012-03-02,Manta,Fundacion de Manta,False,0,0,0,1,0,0,1,0,0
1,2012-04-12,Cuenca,Fundacion de Cuenca,False,0,0,0,1,0,0,1,0,0
2,2012-04-14,Libertad,Cantonizacion de Libertad,False,0,0,0,1,0,0,1,0,0
3,2012-04-21,Riobamba,Cantonizacion de Riobamba,False,0,0,0,1,0,0,1,0,0
4,2012-05-12,Puyo,Cantonizacion del Puyo,False,0,0,0,1,0,0,1,0,0


In [17]:
events_with_type_locale['description'].unique()

array(['Fundacion de Manta', 'Fundacion de Cuenca',
       'Cantonizacion de Libertad', 'Cantonizacion de Riobamba',
       'Cantonizacion del Puyo', 'Cantonizacion de Guaranda',
       'Cantonizacion de Latacunga', 'Fundacion de Machala',
       'Fundacion de Santo Domingo', 'Cantonizacion de El Carmen',
       'Cantonizacion de Cayambe', 'Fundacion de Esmeraldas',
       'Fundacion de Riobamba', 'Fundacion de Ambato',
       'Fundacion de Ibarra', 'Cantonizacion de Quevedo',
       'Independencia de Guaranda', 'Independencia de Latacunga',
       'Independencia de Ambato', 'Fundacion de Quito',
       'Fundacion de Loja', 'Cantonizacion de Salinas',
       'Fundacion de Guayaquil', 'Fundacion de Guayaquil-1',
       'Traslado Fundacion de Guayaquil', 'Traslado Fundacion de Quito',
       'Fundacion de Quito-1', 'Provincializacion de Cotopaxi',
       'Provincializacion de Imbabura',
       'Provincializacion de Santo Domingo',
       'Provincializacion Santa Elena', 'Primer Grito de 

In [30]:
def getEventName(x):
    words = x.split("+")
#     print(len(words), words)
    if len(words) > 1:
        return (words[0], int(words[1]))
    else:
        words = x.split("-")
        if len(words) > 1:
            return (words[0], -1 * int(words[1]))
        else:
            return (words[0], 0)

getEventName('Terremoto Manabi+29')

('Terremoto Manabi', 29)

In [31]:
getEventName('Primer dia del ano-1')

('Primer dia del ano', -1)

In [32]:
getEventName('Independencia de Cuenca')

('Independencia de Cuenca', 0)

In [34]:
def printRow(x):
    print(x)

events_with_type_locale.map(printRow)

AttributeError: 'DataFrame' object has no attribute 'map'