In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

def create_extra_features(df):
    """Derive date, price and cancellation-rate features for a bookings frame.

    The input must contain the columns ``pais_cliente``,
    ``is_reserva_cancelada``, ``total_habitaciones``, ``index``,
    ``fecha_reserva``, ``check-in``, ``check-out``, ``importe_reserva`` and
    ``roomnigths``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw bookings. It is not modified; all work happens on a copy.

    Returns
    -------
    pandas.DataFrame
        A new frame with the three date columns parsed to datetimes and these
        columns added: ``reservation_days_ago``, ``total_noches``,
        ``precio_por_noche``, ``fecha_reserva_weekday``, ``fecha_reserva_day``,
        ``checkin_weekday``, ``checkin_month``, ``checkin_day``,
        ``checkout_weekday``, ``checkout_month``, ``checkout_day``,
        ``pct_cancelacion_pais`` and ``pct_cancelacion_num_habit``.
        Every NaN in the result is replaced by ``0.0``.

    Notes
    -----
    The two ``pct_cancelacion_*`` features are computed from
    ``is_reserva_cancelada`` over every row passed in, so they encode the
    target of those same rows.
    """
    # Work on a copy so the caller's frame keeps its original columns/dtypes.
    df = df.copy()

    # Share of cancelled bookings per customer country.
    by_country = df.groupby(by='pais_cliente')['is_reserva_cancelada']
    mean_country_cancelations = (
        (by_country.sum() / by_country.count())
        .rename('pct_cancelacion_pais')
        .reset_index()
    )

    # Share of cancelled bookings per number of rooms; the denominator counts
    # the non-null entries of the 'index' column within each group.
    by_rooms = df.groupby('total_habitaciones')
    pct_cancelacion_num_habit = (
        (by_rooms['is_reserva_cancelada'].sum() / by_rooms['index'].count())
        .rename('pct_cancelacion_num_habit')
        .reset_index()
    )

    # Convert the date columns to datetimes.
    for date_column in ['fecha_reserva', 'check-in', 'check-out']:
        df[date_column] = pd.to_datetime(df[date_column])

    df['reservation_days_ago'] = (df['check-in'] - df['fecha_reserva']).dt.days
    df['total_noches'] = (df['check-out'] - df['check-in']).dt.days
    # NOTE(review): a zero in 'roomnigths' yields inf here, which the fillna
    # below does not replace -- confirm the data never has zero room-nights.
    df['precio_por_noche'] = df['importe_reserva'] / df['roomnigths']

    df['fecha_reserva_weekday'] = df['fecha_reserva'].dt.weekday
    df['fecha_reserva_day'] = df['fecha_reserva'].dt.day

    df['checkin_weekday'] = df['check-in'].dt.weekday
    df['checkin_month'] = df['check-in'].dt.month
    df['checkin_day'] = df['check-in'].dt.day

    df['checkout_weekday'] = df['check-out'].dt.weekday
    df['checkout_month'] = df['check-out'].dt.month
    # Previously read from 'check-in', which duplicated checkin_day.
    df['checkout_day'] = df['check-out'].dt.day

    # Left joins keep every booking; keys absent from the lookup tables (and
    # any other NaN in the frame) end up as 0.0.
    df = df.merge(mean_country_cancelations, on='pais_cliente', how='left').fillna(0.0)
    df = df.merge(pct_cancelacion_num_habit, on='total_habitaciones', how='left').fillna(0.0)

    return df

def remove_non_usefull_features(df):
    """Return ``df`` without the raw categorical and date columns.

    Drops ``pais_cliente``, ``hotel``, ``localizador``, ``nombre_cliente``,
    ``telefono_cliente``, ``email_cliente``, ``fecha_reserva``, ``check-in``
    and ``check-out``. The input frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains every column listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame with the remaining columns in their original order.

    Raises
    ------
    KeyError
        If any of the columns to drop is missing from ``df``.
    """
    # Categorical variables are removed once other features were derived from them.
    categorical_columns = [
        'pais_cliente', 'hotel', 'localizador',
        'nombre_cliente', 'telefono_cliente', 'email_cliente',
    ]
    date_columns = ['fecha_reserva', 'check-in', 'check-out']

    # Use the ``columns`` keyword: passing the axis positionally
    # (``drop(label, 1)``) raises TypeError on pandas >= 2.0.
    return df.drop(columns=categorical_columns + date_columns)

# Semicolon-separated CSV holding the labelled bookings.
TRAINING_SET_FILE = 'data/training_set.csv'

# Read the raw bookings, add the engineered features, then drop the raw
# categorical/date columns.
df = remove_non_usefull_features(
    create_extra_features(
        pd.read_csv(TRAINING_SET_FILE, sep=';')
    )
)

In [None]:
# Separate the target column from the feature matrix.
X_train = df.drop(columns=['is_reserva_cancelada'])
y_train = df['is_reserva_cancelada']

# Hold out 33% of the rows for evaluation; the seed fixes the split.
(
    X_train_train,
    X_train_test,
    y_train_train,
    y_train_test,
) = train_test_split(X_train, y_train, test_size=0.33, random_state=42)

In [None]:
# Logistic-regression classifier with default hyper-parameters; random_state is pinned to 1.
modelo = LogisticRegression(random_state=1)

In [None]:
# Fit on the training part of the split.
modelo.fit(X_train_train, y_train_train)

# Second column of predict_proba (index 1), kept as a plain list of scores
# for the hold-out rows.
prediccion = list(modelo.predict_proba(X_train_test)[:, 1])

# ROC curve points for the hold-out predictions.
fpr, tpr, thresholds = metrics.roc_curve(y_train_test, prediccion)

In [None]:
# Area under the ROC curve computed from the curve points.
print('AUC', metrics.auc(fpr, tpr))
# Accuracy after thresholding the predicted probabilities at 0.5.
print(
    "Accuracy",
    accuracy_score(y_train_test, [int(p > 0.5) for p in prediccion]),
)
# ROC curve: false-positive rate on x, true-positive rate on y.
plt.plot(fpr, tpr)
plt.show()