In [None]:
import pandas as pd
# Location of the semicolon-separated training data, relative to the notebook.
TRAINING_SET_FILE = 'data/training_set.csv'

df = pd.read_csv(TRAINING_SET_FILE, sep=';')
# Preview only the first rows, transposed so every column is visible as a row;
# transposing the full frame is expensive and adds nothing to the preview.
df.head().T

In [None]:
# Cancellation rate per customer country: sum of the cancellation flag divided
# by the number of non-null flags in each country group.
cancel_flag_by_country = df.groupby(by='pais_cliente')['is_reserva_cancelada']
mean_country_cancelations = (
    (cancel_flag_by_country.sum() / cancel_flag_by_country.count())
    .to_frame()
    .rename(columns={'is_reserva_cancelada': 'pct_cancelacion_pais'})
    .reset_index(level=0)
)

# Cancellation rate per number of rooms booked: sum of the cancellation flag
# divided by the number of non-null 'index' values in each group.
bookings_by_rooms = df.groupby('total_habitaciones')
rooms_cancel_ratio = (
    bookings_by_rooms['is_reserva_cancelada'].sum()
    / bookings_by_rooms['index'].count()
)
# The two operands carry different names, so the quotient is unnamed and
# reset_index() labels its column 0; that label is what gets renamed here.
pct_cancelacion_num_habit = rooms_cancel_ratio.reset_index().rename(
    columns={0: 'pct_cancelacion_num_habit'}
)

def create_extra_features(df, country_cancel_rates=None, room_cancel_rates=None):
    """Return a copy of ``df`` enriched with date-derived and cancellation-rate features.

    Parameters
    ----------
    df : pandas.DataFrame
        Bookings with at least the columns 'fecha_reserva', 'check-in',
        'check-out', 'importe_reserva', 'roomnigths', 'pais_cliente' and
        'total_habitaciones'. The input frame is not modified.
    country_cancel_rates : pandas.DataFrame, optional
        Lookup table with columns 'pais_cliente' and 'pct_cancelacion_pais'.
        Defaults to the notebook-level ``mean_country_cancelations``.
    room_cancel_rates : pandas.DataFrame, optional
        Lookup table with columns 'total_habitaciones' and
        'pct_cancelacion_num_habit'. Defaults to the notebook-level
        ``pct_cancelacion_num_habit``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the three date columns parsed as datetimes, the
        derived features added, and both lookup tables left-joined in.
    """
    # Fall back to the tables computed at notebook level when none are passed.
    if country_cancel_rates is None:
        country_cancel_rates = mean_country_cancelations
    if room_cancel_rates is None:
        room_cancel_rates = pct_cancelacion_num_habit

    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    # convert to datetimes
    for date_column in ('fecha_reserva', 'check-in', 'check-out'):
        df[date_column] = pd.to_datetime(df[date_column])

    # Lead time between booking and arrival, and length of stay, in whole days.
    df['reservation_days_ago'] = (df['check-in'] - df['fecha_reserva']).dt.days
    df['total_noches'] = (df['check-out'] - df['check-in']).dt.days
    # 'roomnigths' is spelled this way in the data's column name.
    df['precio_por_noche'] = df['importe_reserva'] / df['roomnigths']

    df['fecha_reserva_weekday'] = df['fecha_reserva'].dt.weekday
    df['fecha_reserva_day'] = df['fecha_reserva'].dt.day

    df['checkin_weekday'] = df['check-in'].dt.weekday
    df['checkin_month'] = df['check-in'].dt.month
    df['checkin_day'] = df['check-in'].dt.day

    df['checkout_weekday'] = df['check-out'].dt.weekday
    df['checkout_month'] = df['check-out'].dt.month
    # Day of month of the departure date (previously read from 'check-in').
    df['checkout_day'] = df['check-out'].dt.day

    # Left joins keep every booking; keys missing from a lookup table produce
    # NaN rates. Note that fillna(0.0) is applied to the whole merged frame,
    # so NaNs in any other column are replaced with 0.0 as well.
    df = df.merge(country_cancel_rates, on='pais_cliente', how='left').fillna(0.0)
    df = df.merge(room_cancel_rates, on='total_habitaciones', how='left').fillna(0.0)

    return df

# Enrich the loaded bookings with the engineered features; `df` is rebound to
# the returned frame, so the originally loaded data is no longer reachable
# under this name after this line runs.
df = create_extra_features(df)

In [None]:
def remove_non_usefull_features(df):
    """Return a copy of ``df`` without the categorical/identifier and raw date columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed below; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the listed columns removed.

    Raises
    ------
    KeyError
        If any of the listed columns is missing from ``df``.
    """
    # remove categorical variables after extracting other vars
    categorical_columns = [
        'pais_cliente', 'hotel', 'localizador',
        'nombre_cliente', 'telefono_cliente', 'email_cliente',
    ]
    # Raw dates are dropped too; only the features derived from them remain.
    date_columns = ['fecha_reserva', 'check-in', 'check-out']

    # Use the `columns=` keyword: passing the axis positionally
    # (`drop(label, 1)`) is rejected by pandas 2.x.
    return df.drop(columns=categorical_columns + date_columns)
        
# Reduced frame without the categorical/identifier and raw date columns;
# the enriched `df` is kept under its own name.
df_cont = remove_non_usefull_features(df)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation
import numpy
# Seed NumPy's global random generator (only NumPy is seeded here).
numpy.random.seed(7)

# Target vector: the cancellation flag.
y = numpy.array(df_cont['is_reserva_cancelada'])

# Feature matrix: every remaining column except the target and the row index.
feature_frame = df_cont.drop(columns=['is_reserva_cancelada', 'index'])
X = numpy.array(feature_frame)

In [None]:
# create model
# Hidden layers use ReLU. Without an activation, Dense layers are purely linear
# and the whole stack collapses into a single linear map followed by the
# sigmoid, so the extra depth would add no modelling capacity.
# The input width is taken from X rather than hard-coded, so it follows the
# number of feature columns actually produced upstream.
HIDDEN_UNITS = (128, 256, 512, 1024, 512, 256, 128)

model = Sequential()
model.add(Dense(16, input_dim=X.shape[1], activation='relu'))
for hidden_units in HIDDEN_UNITS:
    model.add(Dense(hidden_units, activation='relu'))
# Single sigmoid unit: output is a value in (0, 1) for the binary target.
model.add(Dense(1, activation='sigmoid'))

model.summary()

In [None]:
# Compile model: binary cross-entropy loss, the 'sgd' optimizer with its
# default settings, and accuracy as the reported metric.
model.compile(
    optimizer='sgd',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Fit the model on the full feature matrix and target vector.
# No validation data is passed, so the per-epoch metrics describe the
# training data only.
model.fit(
    x=X,
    y=y,
    batch_size=32,
    epochs=1000,
)

In [None]:
# evaluate the model
# NOTE: X and y are the same arrays the model was fitted on, so the figure
# printed below is training-set performance, not a held-out estimate.
scores = model.evaluate(X, y)
metric_name = model.metrics_names[1]
metric_percent = scores[1] * 100
print("\n%s: %.2f%%" % (metric_name, metric_percent))