# Feature engineering & selection

In [41]:
import pandas as pd

TRAINING_SET_FILE = 'data/training_set.csv'

df = pd.read_csv(TRAINING_SET_FILE, sep=';')
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27358,27359,27360,27361,27362,27363,27364,27365,27366,27367
index,0,1,2,3,4,5,6,7,8,9,...,4080,4081,4082,4084,4085,4086,4087,4088,4089,4090
fecha_reserva,2017-06-01,2017-06-01,2017-06-01,2017-06-01,2017-06-01,2017-06-01,2017-06-01,2017-06-01,2017-06-01,2017-06-01,...,2017-06-08,2017-06-08,2017-06-08,2017-06-08,2017-06-08,2017-06-08,2017-06-08,2017-06-08,2017-06-08,2017-06-08
pais_cliente,Costa Rica,España,Reino Unido,España,España,República Dominicana,España,España,España,España,...,Argentina,Reino Unido,España,España,Reino Unido,Reino Unido,España,Brasil,Reino Unido,México
importe_reserva,0.00369343,0.00897913,0.00480685,0.00190523,0.00416789,0.0126079,0.00495926,0.0119718,0.00201507,0.00897913,...,0.00442332,0.00789173,0.0121673,0.00152339,0.00896528,0.00219972,0.000934258,0.00710915,0.000617269,0.000949938
roomnigths,3,7,2,2,4,6,4,7,2,7,...,3,2,5,1,5,2,1,7,1,2
check-in,2017-07-21,2017-06-11,2017-06-16,2017-06-11,2017-09-10,2017-06-29,2017-09-10,2017-06-11,2017-06-02,2017-06-11,...,2017-08-08,2017-06-09,2017-08-20,2017-06-10,2017-09-25,2017-07-04,2017-06-10,2017-10-10,2017-07-08,2017-07-07
check-out,2017-07-24,2017-06-18,2017-06-18,2017-06-13,2017-09-14,2017-07-02,2017-09-14,2017-06-18,2017-06-04,2017-06-18,...,2017-08-11,2017-06-11,2017-08-25,2017-06-11,2017-09-30,2017-07-06,2017-06-11,2017-10-17,2017-07-09,2017-07-09
is_reserva_cancelada,1,1,0,0,1,0,1,1,0,1,...,0,1,1,0,1,1,0,0,0,0
total_habitaciones,1,1,1,1,1,2,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
hotel,02bdf29aa2f019efff360097aea1c69e48904c9f,d46b813cd813e2318a35509103b1f081db5f77a3,5e15fec283ff9ceb8449e558d8c5c81ac9fd2cfc,3c0e030d4ab643f402af566b86266cbaf6a610ef,d46b813cd813e2318a35509103b1f081db5f77a3,290d5f265e3662e4df4e4e4a90f1612cc68404f8,28894d8f7f388a274b0bde507e15bb3d4cbbbb3f,d46b813cd813e2318a35509103b1f081db5f77a3,28894d8f7f388a274b0bde507e15bb3d4cbbbb3f,d46b813cd813e2318a35509103b1f081db5f77a3,...,5046d28b88ad50aa40407d6ef7fa30bae007fa15,325a83f4f8eb0ccf0cefff621edcb3bf6c1d8662,90f4adbe1fb02c94b65af630d56f45f99b5cf7d8,d46b813cd813e2318a35509103b1f081db5f77a3,04a192c9bdddee78f753533a1f30a92c18d3cc03,02bdf29aa2f019efff360097aea1c69e48904c9f,c36a7601725e0a08a10eb4d318af5a1f6153fdab,290d5f265e3662e4df4e4e4a90f1612cc68404f8,231da18e12b7782225051a6400c588fcb8d71c40,29051df9be322133daa5fb910521ba535f262e85


In [42]:

# convertimos a datetimes
date_columns = ['fecha_reserva', 'check-in', 'check-out']
for d in date_columns:
    df[d] = pd.to_datetime(df[d])

df['reservation_days_ago'] = (df['check-in'] - df['fecha_reserva']).dt.days
df['total_noches'] = (df['check-out'] - df['check-in']).dt.days
df['precio_por_noche'] = df['importe_reserva'] / df['roomnigths']

groupby_country = df.groupby(by='pais_cliente')['is_reserva_cancelada']
mean_country_cancelations = groupby_country.sum() / groupby_country.count()

mean_country_cancelations = pd.DataFrame(mean_country_cancelations)\
    .rename(columns={'is_reserva_cancelada': 'pct_cancelacion_pais'})
mean_country_cancelations.reset_index(level=0, inplace=True)

df = df.merge(mean_country_cancelations, left_on='pais_cliente', right_on='pais_cliente')

# Feature selection

De todas las características que hemos construido, vamos a evaluar como funcionan.

## Date extractions

In [43]:
df['fecha_reserva_weekday'] = df.fecha_reserva.dt.weekday
df['fecha_reserva_day'] = df.fecha_reserva.dt.day

df['checkin_weekday'] = df['check-in'].dt.weekday
df['checkin_month'] = df['check-in'].dt.month
df['checkin_day'] = df['check-in'].dt.day

df['checkout_weekday'] = df['check-out'].dt.weekday
df['checkout_month'] = df['check-out'].dt.month
df['checkout_day'] = df['check-in'].dt.day


In [44]:
df

Unnamed: 0,index,fecha_reserva,pais_cliente,importe_reserva,roomnigths,check-in,check-out,is_reserva_cancelada,total_habitaciones,hotel,...,precio_por_noche,pct_cancelacion_pais,fecha_reserva_weekday,fecha_reserva_day,checkin_weekday,checkin_month,checkin_day,checkout_weekday,checkout_month,checkout_day
0,0,2017-06-01,Costa Rica,0.003693,3,2017-07-21,2017-07-24,1,1,02bdf29aa2f019efff360097aea1c69e48904c9f,...,0.001231,0.388341,3,1,4,7,21,0,7,21
1,47,2017-06-01,Costa Rica,0.002054,2,2017-07-30,2017-08-01,1,1,5e15fec283ff9ceb8449e558d8c5c81ac9fd2cfc,...,0.001027,0.388341,3,1,6,7,30,1,8,30
2,56,2017-06-01,Costa Rica,0.005170,3,2017-07-07,2017-07-10,1,1,b18bd8f741f5a6a7befac30d9fdcf44eb264087d,...,0.001723,0.388341,3,1,4,7,7,0,7,7
3,60,2017-06-01,Costa Rica,0.002843,2,2017-07-21,2017-07-23,0,1,5e15fec283ff9ceb8449e558d8c5c81ac9fd2cfc,...,0.001422,0.388341,3,1,4,7,21,6,7,21
4,117,2017-06-01,Costa Rica,0.001260,1,2017-06-16,2017-06-17,0,1,ebe4a49ecbe7dffdedc26791929f67d42b4a474e,...,0.001260,0.388341,3,1,4,6,16,5,6,16
5,134,2017-06-01,Costa Rica,0.003746,4,2017-06-11,2017-06-13,0,2,5e15fec283ff9ceb8449e558d8c5c81ac9fd2cfc,...,0.000936,0.388341,3,1,6,6,11,1,6,11
6,138,2017-06-01,Costa Rica,0.001810,2,2017-06-14,2017-06-16,0,1,5e15fec283ff9ceb8449e558d8c5c81ac9fd2cfc,...,0.000905,0.388341,3,1,2,6,14,4,6,14
7,143,2017-06-01,Costa Rica,0.003429,2,2017-07-11,2017-07-13,1,1,b18bd8f741f5a6a7befac30d9fdcf44eb264087d,...,0.001715,0.388341,3,1,1,7,11,3,7,11
8,158,2017-06-01,Costa Rica,0.001921,2,2017-08-20,2017-08-22,0,1,02bdf29aa2f019efff360097aea1c69e48904c9f,...,0.000961,0.388341,3,1,6,8,20,1,8,20
9,220,2017-06-01,Costa Rica,0.002084,2,2017-06-03,2017-06-04,0,2,02bdf29aa2f019efff360097aea1c69e48904c9f,...,0.001042,0.388341,3,1,5,6,3,6,6,3


In [45]:
pd.get_dummies(df.fecha_reserva)

Unnamed: 0,2017-06-01 00:00:00,2017-06-02 00:00:00,2017-06-03 00:00:00,2017-06-04 00:00:00,2017-06-05 00:00:00,2017-06-06 00:00:00,2017-06-07 00:00:00,2017-06-08 00:00:00
0,1,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0
5,1,0,0,0,0,0,0,0
6,1,0,0,0,0,0,0,0
7,1,0,0,0,0,0,0,0
8,1,0,0,0,0,0,0,0
9,1,0,0,0,0,0,0,0
