# Feature engineering & selection

In [None]:
import pandas as pd
# Path of the semicolon-separated training data.
TRAINING_SET_FILE = 'data/training_set.csv'

# Load the training set; the trailing expression evaluates to the transposed
# frame so the notebook cell displays it.
df = pd.read_csv(TRAINING_SET_FILE, delimiter=';')
df.transpose()

In [None]:

# Per-country cancellation rate: sum of the cancellation flag divided by the
# number of non-null flags for each `pais_cliente`.
groupby_country = df.groupby(by='pais_cliente')['is_reserva_cancelada']
mean_country_cancelations = (
    (groupby_country.sum() / groupby_country.count())
    .rename('pct_cancelacion_pais')
    .reset_index()
)

# Per booking date: number of distinct `index` values and summed
# cancellation flag.
res_por_fecha_reserva = pd.pivot_table(
    df,
    index=["fecha_reserva"],
    values=["index"],
    aggfunc=lambda x: len(x.unique()),
)
can_por_fecha_reserva = pd.pivot_table(
    df,
    index=["fecha_reserva"],
    values=["is_reserva_cancelada"],
    aggfunc="sum",
)
# `pd.concat(..., join_axes=[...])` was removed in pandas 1.0. A left join
# keeps the same alignment: the rows (and order) of res_por_fecha_reserva.
result_fecha_reserva = res_por_fecha_reserva.join(can_por_fecha_reserva, how='left')
result_fecha_reserva['pct_cancelaciones_fecha_reserva'] = (
    result_fecha_reserva["is_reserva_cancelada"] / result_fecha_reserva["index"]
)
result_fecha_reserva = result_fecha_reserva.reset_index()[
    ['fecha_reserva', 'pct_cancelaciones_fecha_reserva']
]

# Cancellation rate per number of rooms in the booking. The Series is named
# explicitly before reset_index() instead of relying on the implicit
# column label 0.
groupby_rooms = df.groupby('total_habitaciones')
pct_cancelacion_num_habit = (
    (groupby_rooms['is_reserva_cancelada'].sum() / groupby_rooms['index'].count())
    .rename('pct_cancelacion_num_habit')
    .reset_index()
)

def create_extra_features(df, country_rates=None, room_rates=None):
    """Return a copy of ``df`` enriched with date-derived and rate features.

    Added columns: ``reservation_days_ago``, ``total_noches``,
    ``precio_por_noche``, weekday/day of ``fecha_reserva``,
    weekday/month/day of ``check-in`` and ``check-out``, plus whatever
    columns the two lookup tables contribute through the left merges.

    Args:
        df: Frame with at least ``fecha_reserva``, ``check-in``,
            ``check-out``, ``importe_reserva``, ``roomnigths``,
            ``pais_cliente`` and ``total_habitaciones`` columns.
        country_rates: Lookup frame merged on ``pais_cliente``. Defaults to
            the notebook-level ``mean_country_cancelations``.
        room_rates: Lookup frame merged on ``total_habitaciones``. Defaults
            to the notebook-level ``pct_cancelacion_num_habit``.

    Returns:
        A new DataFrame; the input frame is left untouched.
    """
    if country_rates is None:
        country_rates = mean_country_cancelations
    if room_rates is None:
        room_rates = pct_cancelacion_num_habit

    # Work on a copy so the caller's frame is not modified as a side effect.
    df = df.copy()

    # Convert to datetimes
    date_columns = ['fecha_reserva', 'check-in', 'check-out']
    for d in date_columns:
        df[d] = pd.to_datetime(df[d])

    df['reservation_days_ago'] = (df['check-in'] - df['fecha_reserva']).dt.days
    df['total_noches'] = (df['check-out'] - df['check-in']).dt.days
    df['precio_por_noche'] = df['importe_reserva'] / df['roomnigths']

    df['fecha_reserva_weekday'] = df['fecha_reserva'].dt.weekday
    df['fecha_reserva_day'] = df['fecha_reserva'].dt.day

    df['checkin_weekday'] = df['check-in'].dt.weekday
    df['checkin_month'] = df['check-in'].dt.month
    df['checkin_day'] = df['check-in'].dt.day

    df['checkout_weekday'] = df['check-out'].dt.weekday
    df['checkout_month'] = df['check-out'].dt.month
    # Fixed: this was previously derived from 'check-in', duplicating
    # checkin_day.
    df['checkout_day'] = df['check-out'].dt.day

    # Left merges keep every booking; keys missing from a lookup table get
    # NaN. NOTE: fillna(0.0) is applied to the whole frame, so NaNs in
    # any other column are zero-filled as well.
    df = df.merge(country_rates, on='pais_cliente', how='left').fillna(0.0)
    df = df.merge(room_rates, on='total_habitaciones', how='left').fillna(0.0)

    return df

# Rebind `df` to the training frame returned by create_extra_features.
df = create_extra_features(df)

In [None]:
def remove_non_usefull_features(df):
    """Return ``df`` without the raw categorical and date columns.

    Args:
        df: Frame containing the columns listed below.

    Returns:
        A new DataFrame without those columns; ``df`` itself is not modified.

    Raises:
        KeyError: If any of the listed columns is missing from ``df``.
    """
    # Categorical variables, removed after other variables were extracted
    # from them.
    categorical_columns = [
        'pais_cliente',
        'hotel',
        'localizador',
        'nombre_cliente',
        'telefono_cliente',
        'email_cliente',
    ]
    # TODO: convert dates to timestamps instead of dropping them.
    date_columns = ['fecha_reserva', 'check-in', 'check-out']

    # `columns=` replaces the positional axis argument (`drop(col, 1)`),
    # which pandas 2.0 no longer accepts.
    return df.drop(columns=categorical_columns + date_columns)
        
# `df_cont` is the training frame after remove_non_usefull_features has
# dropped the raw categorical and date columns.
df_cont = remove_non_usefull_features(df)

# Feature selection

De todas las características que hemos construido, vamos a evaluar cómo funcionan.

In [None]:
# Split the prepared frame into the feature matrix and the target column.
X_train = df_cont.drop(columns=['is_reserva_cancelada'])
y_train = df_cont['is_reserva_cancelada']


    
from imblearn.over_sampling import SMOTE


# Construct classifier

In [None]:
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split


clf = XGBClassifier(n_estimators=500)

# Hold out 33% of the labelled data to estimate generalisation performance.
X_train_train, X_train_test, y_train_train, y_train_test = train_test_split(
    X_train, y_train, test_size=0.33, random_state=42
)

# `fit_sample` was removed from imbalanced-learn; `fit_resample` is its
# replacement.
# NOTE(review): the resampled set is not used by the fit below, so the
# reported metrics come from the un-resampled split. Confirm whether the
# model was meant to be trained on X_train_resampled / y_train_resampled.
X_train_resampled, y_train_resampled = SMOTE().fit_resample(X_train_train, y_train_train)

clf.fit(X_train_train, y_train_train)
# Second column of predict_proba, used as the score for the ROC curve and,
# thresholded at 0.5, as the predicted label.
pred = [row[1] for row in clf.predict_proba(X_train_test)]

fpr, tpr, thresholds = metrics.roc_curve(y_train_test, pred)
print('AUC', metrics.auc(fpr, tpr))
print("Accuracy", metrics.accuracy_score(y_train_test, [1 if y > 0.5 else 0 for y in pred]))

In [None]:
from xgboost import XGBClassifier

# Final model: fit once on the complete labelled training set. The original
# cell fitted the same classifier twice on identical data.
clf = XGBClassifier(n_estimators=500)
clf.fit(X_train, y_train)

X_test = pd.read_csv('data/eval_set.csv', sep=';')

# Print the shape before and after feature creation to spot unexpected
# row/column changes introduced by the merges.
print(X_test.shape)
X_test = create_extra_features(X_test)
print(X_test.shape)
X_test = remove_non_usefull_features(X_test)

y_test_prob = clf.predict_proba(X_test)
y_test = clf.predict(X_test)

# Keep the second column of predict_proba as the submitted probability.
y_pred = [row[1] for row in y_test_prob]

X_test['probabilidad'] = y_pred
X_test[['index', 'probabilidad']].to_csv('tpiza_eval.csv', index=False, sep=',')
