# Feature engineering & selection

In [None]:
import pandas as pd
TRAINING_SET_FILE = 'data/training_set.csv'

# The training file is read as a semicolon-delimited CSV.
df = pd.read_csv(TRAINING_SET_FILE, delimiter=';')
# Display the frame transposed (original columns listed as rows).
df.transpose()


In [None]:

# Cancellation flag grouped by customer country.
groupby_country = df.groupby(by='pais_cliente')['is_reserva_cancelada']

# Per-country cancellation ratio: sum of the flag divided by the number of
# non-null flags, exposed as a two-column frame
# (pais_cliente, pct_cancelacion_pais) so it can be merged onto bookings.
mean_country_cancelations = (
    (groupby_country.sum() / groupby_country.count())
    .to_frame(name='pct_cancelacion_pais')
    .reset_index(level=0)
)


def create_extra_features(df, country_cancel_rates=None):
    """Return a copy of ``df`` enriched with date, price and country features.

    The input must contain the columns ``fecha_reserva``, ``check-in``,
    ``check-out``, ``importe_reserva``, ``roomnigths`` and ``pais_cliente``.
    The caller's frame is left untouched.

    Args:
        df: Bookings frame with the columns listed above.
        country_cancel_rates: Optional frame with the columns
            ``pais_cliente`` and ``pct_cancelacion_pais``. When omitted, the
            module-level ``mean_country_cancelations`` table is used.

    Returns:
        A new DataFrame with the three date columns parsed to datetimes and
        the derived feature columns appended. Countries missing from the
        rate table get NaN in ``pct_cancelacion_pais`` (left merge).
    """
    if country_cancel_rates is None:
        country_cancel_rates = mean_country_cancelations

    # Work on a copy so the caller's frame is not modified as a side effect.
    df = df.copy()

    # Parse the date columns into datetimes.
    for date_column in ('fecha_reserva', 'check-in', 'check-out'):
        df[date_column] = pd.to_datetime(df[date_column])

    booked_on = df['fecha_reserva']
    check_in = df['check-in']
    check_out = df['check-out']

    # Lead time between booking and arrival, and length of stay, in days.
    df['reservation_days_ago'] = (check_in - booked_on).dt.days
    df['total_noches'] = (check_out - check_in).dt.days
    df['precio_por_noche'] = df['importe_reserva'] / df['roomnigths']

    df['fecha_reserva_weekday'] = booked_on.dt.weekday
    df['fecha_reserva_day'] = booked_on.dt.day

    df['checkin_weekday'] = check_in.dt.weekday
    df['checkin_month'] = check_in.dt.month
    df['checkin_day'] = check_in.dt.day

    df['checkout_weekday'] = check_out.dt.weekday
    df['checkout_month'] = check_out.dt.month
    # Previously this read the check-in date, duplicating `checkin_day`.
    df['checkout_day'] = check_out.dt.day

    # Left merge keeps every booking, even for countries without a rate.
    return df.merge(country_cancel_rates, on='pais_cliente', how='left')


# Add the engineered features to the training frame.
df = create_extra_features(df)

# Second name for the same enriched frame (an alias, not a copy); the
# exploratory pivot tables below read `pais_cliente` and `fecha_reserva`
# from it, columns that are dropped from the modelling frame.
ori = df

In [None]:
def remove_non_usefull_features(df):
    """Return a copy of ``df`` without the categorical and raw date columns.

    The input frame is not modified. A ``KeyError`` is raised by pandas if
    any of the listed columns is missing from ``df``.
    """
    # Categorical columns, removed once other variables were extracted.
    categorical_columns = [
        'pais_cliente',
        'hotel',
        'localizador',
        'nombre_cliente',
        'telefono_cliente',
        'email_cliente',
    ]
    # TODO: convert the dates to timestamps instead of discarding them.
    date_columns = ['fecha_reserva', 'check-in', 'check-out']

    # `columns=` replaces the positional axis argument (`drop(col, 1)`),
    # which pandas 2.0 no longer accepts.
    return df.drop(columns=categorical_columns + date_columns)
        
# Modelling frame: the enriched bookings without the categorical and raw
# date columns.
df_cont = remove_non_usefull_features(df)

#### Porcentaje de reservas vs. cancelaciones por país

In [None]:
# Number of distinct `index` values per customer country
# (presumably one per booking — TODO confirm).
res_por_pais = pd.pivot_table(
    ori,
    index=["pais_cliente"],
    values=["index"],
    aggfunc=lambda ids: len(ids.unique()),
)
# Sum of the cancellation flag per customer country.
can_por_pais = pd.pivot_table(
    ori,
    index=["pais_cliente"],
    values=["is_reserva_cancelada"],
    aggfunc="sum",
)
# `join_axes` was removed from `pd.concat` in pandas 1.0; reindexing the
# result on the first table's index reproduces the old alignment.
result = pd.concat([res_por_pais, can_por_pais], axis=1, join='inner')\
    .reindex(res_por_pais.index)
# Cancellation percentage per country, highest first.
result["%"] = result["is_reserva_cancelada"] * 100 / result["index"]
result = result.sort_values("%", ascending=False)

In [None]:
# Number of distinct `index` values per booking date
# (presumably one per booking — TODO confirm).
res_por_fecha_reserva = pd.pivot_table(
    ori,
    index=["fecha_reserva"],
    values=["index"],
    aggfunc=lambda ids: len(ids.unique()),
)
# Sum of the cancellation flag per booking date.
can_por_fecha_reserva = pd.pivot_table(
    ori,
    index=["fecha_reserva"],
    values=["is_reserva_cancelada"],
    aggfunc="sum",
)
# `join_axes` was removed from `pd.concat` in pandas 1.0; reindexing the
# result on the first table's index reproduces the old alignment.
result_fecha_reserva = pd.concat(
    [res_por_fecha_reserva, can_por_fecha_reserva], axis=1, join='inner'
).reindex(res_por_fecha_reserva.index)
result_fecha_reserva["%"] = (
    result_fecha_reserva["is_reserva_cancelada"] * 100 / result_fecha_reserva["index"]
)
# Displayed only (not assigned): booking dates by cancellation percentage,
# highest first.
result_fecha_reserva.sort_values("%", ascending=False)

# Feature selection

De todas las características que hemos construido, vamos a evaluar cómo funcionan.

In [None]:
# Feature matrix: every modelling column except the target.
X_train = df_cont.drop(columns='is_reserva_cancelada')
# Target: the cancellation flag.
y_train = df_cont['is_reserva_cancelada']

# Construct classifier

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import auc

# Train the classifier once on the full training set (the original cell
# fitted the same model twice on identical data).
# NOTE(review): the `index` column is not dropped by
# remove_non_usefull_features, so it is part of the features — confirm this
# is intended.
clf = XGBClassifier()
clf.fit(X_train, y_train)

# Build the test features with the same pipeline used for training.
X_test = pd.read_csv('data/test_set.csv', sep=';')

X_test = create_extra_features(X_test)
# Countries missing from the training rate table come out of the left merge
# as NaN; they are given a cancellation rate of 0.0.
X_test['pct_cancelacion_pais'] = X_test['pct_cancelacion_pais'].fillna(0.0)
X_test = remove_non_usefull_features(X_test)

y_test_prob = clf.predict_proba(X_test)
# Predicted labels for the test rows (not used for the submission file).
y_test = clf.predict(X_test)

# Second column of predict_proba: probability of the second class
# (the cancelled class, assuming 0/1 labels — TODO confirm).
y_pred = list(y_test_prob[:, 1])

# Submission file: row identifier plus predicted probability.
X_test['probabilidad'] = y_pred
X_test[['index', 'probabilidad']].to_csv('tpiza_test.csv', index=False, sep=',')
