# Feature engineering & selection

In [None]:
import pandas as pd
# Path of the semicolon-separated training set.
TRAINING_SET_FILE = 'data/training_set.csv'

# Read the whole training set into a DataFrame.
df = pd.read_csv(filepath_or_buffer=TRAINING_SET_FILE, sep=';')
# Display the frame transposed, i.e. one original column per displayed row.
df.transpose()


In [None]:

# Per-country cancellation rate: the sum of the 'is_reserva_cancelada' flag
# divided by the number of non-null flags, for each value of 'pais_cliente'.
groupby_country = df.groupby(by='pais_cliente')['is_reserva_cancelada']
cancelled_per_country = groupby_country.sum()
flagged_per_country = groupby_country.count()

# Lookup table with two columns: 'pais_cliente' and 'pct_cancelacion_pais'.
mean_country_cancelations = (
    (cancelled_per_country / flagged_per_country)
    .rename('pct_cancelacion_pais')
    .reset_index()
)


def create_extra_features(df, country_rates=None):
    """Derive lead-time, stay-length, price and country-rate features.

    Parameters
    ----------
    df : pandas.DataFrame
        Reservations with the columns 'fecha_reserva', 'check-in',
        'check-out', 'importe_reserva', 'roomnigths' and 'pais_cliente'.
    country_rates : pandas.DataFrame, optional
        Lookup table with the columns 'pais_cliente' and
        'pct_cancelacion_pais', expected to hold one row per country.
        Defaults to the module-level ``mean_country_cancelations``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the three date columns parsed as datetimes and the
        columns 'reservation_days_ago', 'total_noches', 'precio_por_noche'
        and 'pct_cancelacion_pais' added. The caller's frame is left
        untouched.
    """
    if country_rates is None:
        country_rates = mean_country_cancelations

    # Work on a copy so the caller's frame is not modified as a side effect.
    df = df.copy()

    # Parse the date columns as datetimes.
    for date_column in ('fecha_reserva', 'check-in', 'check-out'):
        df[date_column] = pd.to_datetime(df[date_column])

    # Days between booking and arrival, and length of the stay in days.
    df['reservation_days_ago'] = (df['check-in'] - df['fecha_reserva']).dt.days
    df['total_noches'] = (df['check-out'] - df['check-in']).dt.days
    df['precio_por_noche'] = df['importe_reserva'] / df['roomnigths']

    # Left join: an inner join would silently drop every reservation whose
    # country is absent from the lookup table (e.g. a country that only shows
    # up in the test set), so those rows would never get a prediction.
    df = df.merge(country_rates, how='left', on='pais_cliente')

    # Countries without a known rate fall back to the mean of the known rates.
    fallback_rate = country_rates['pct_cancelacion_pais'].mean()
    df['pct_cancelacion_pais'] = df['pct_cancelacion_pais'].fillna(fallback_rate)
    return df


df = create_extra_features(df)

# Calendar components of the booking date.
df['fecha_reserva_weekday'] = df['fecha_reserva'].dt.weekday
df['fecha_reserva_day'] = df['fecha_reserva'].dt.day

# Calendar components of the arrival date.
df['checkin_weekday'] = df['check-in'].dt.weekday
df['checkin_month'] = df['check-in'].dt.month
df['checkin_day'] = df['check-in'].dt.day

# Calendar components of the departure date.
df['checkout_weekday'] = df['check-out'].dt.weekday
df['checkout_month'] = df['check-out'].dt.month
# Read from 'check-out': reading 'check-in' here would only duplicate
# 'checkin_day' under a different name.
df['checkout_day'] = df['check-out'].dt.day

# Second name for the same frame (no copy is made); the exploratory pivots
# read 'pais_cliente' and 'fecha_reserva' through it.
ori = df

In [None]:
def remove_non_usefull_features(df):
    """Return a copy of ``df`` without the categorical and raw date columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed below.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If any of the listed columns is missing from ``df``.
    """
    # Categorical columns, removed once other variables were extracted from them.
    categorical_columns = [
        'pais_cliente', 'hotel', 'localizador',
        'nombre_cliente', 'telefono_cliente', 'email_cliente',
    ]
    # TODO: convert the dates to timestamps instead of discarding them.
    date_columns = ['fecha_reserva', 'check-in', 'check-out']

    # Use the ``columns=`` keyword: passing ``axis`` positionally, as in
    # ``drop(name, 1)``, is rejected by pandas 2.0 and later.
    return df.drop(columns=categorical_columns + date_columns)
        
# Frame without the categorical and raw date columns, used for modelling.
df_cont = remove_non_usefull_features(df)

#### Porcentaje de reservas vs. cancelaciones por país

In [None]:
# Number of distinct 'index' values (reservations) per country.
res_por_pais = pd.pivot_table(ori, index=["pais_cliente"], values=["index"],
                              aggfunc=lambda x: len(x.unique()))
# Sum of the cancellation flag per country. The string "sum" is used because
# passing the builtin ``sum`` as aggfunc is deprecated in recent pandas.
can_por_pais = pd.pivot_table(ori, index=["pais_cliente"], values=["is_reserva_cancelada"],
                              aggfunc="sum")
# ``join_axes`` was removed from pd.concat in pandas 1.0; reindexing on the
# reservations index is the documented replacement.
result = pd.concat([res_por_pais, can_por_pais], axis=1).reindex(res_por_pais.index)
# Cancellation percentage per country, highest first.
result["%"] = result["is_reserva_cancelada"] * 100 / result["index"]
result = result.sort_values("%", ascending=False)

In [None]:
# Number of distinct 'index' values (reservations) per booking date.
res_por_fecha_reserva = pd.pivot_table(ori, index=["fecha_reserva"], values=["index"],
                                       aggfunc=lambda x: len(x.unique()))
# Sum of the cancellation flag per booking date. The string "sum" is used
# because passing the builtin ``sum`` as aggfunc is deprecated in recent pandas.
can_por_fecha_reserva = pd.pivot_table(ori, index=["fecha_reserva"], values=["is_reserva_cancelada"],
                                       aggfunc="sum")
# ``join_axes`` was removed from pd.concat in pandas 1.0; reindexing on the
# reservations index is the documented replacement.
result_fecha_reserva = pd.concat(
    [res_por_fecha_reserva, can_por_fecha_reserva], axis=1
).reindex(res_por_fecha_reserva.index)
result_fecha_reserva["%"] = (
    result_fecha_reserva["is_reserva_cancelada"] * 100 / result_fecha_reserva["index"]
)
# Displayed only (not stored): booking dates ordered by cancellation percentage.
result_fecha_reserva.sort_values("%", ascending=False)

# Feature selection

De todas las características que hemos construido, vamos a evaluar cómo funcionan.

## Date extractions

In [None]:
# Display the indicator (dummy) encoding of the booking-date column.
pd.get_dummies(df['fecha_reserva'])

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel

# Target vector and feature matrix taken from the model-ready frame.
y_train = df_cont['is_reserva_cancelada']
X_train = df_cont.drop('is_reserva_cancelada', axis=1)

# Fit a 1000-tree random forest with a fixed seed, using all available cores.
forest = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1)
forest.fit(X_train, y_train)

# Forest-level importances and their spread across the individual trees.
importances = forest.feature_importances_
per_tree_importances = [tree.feature_importances_ for tree in forest.estimators_]
std = np.std(per_tree_importances, axis=0)

print(importances)
print(sorted(zip(importances, X_train.columns), key=lambda pair: pair[0]))

# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")
for rank, feature_position in enumerate(indices, start=1):
    print("%d. feature %d (%f)" % (rank, feature_position, importances[feature_position]))

# Plot the feature importances of the forest
n_columns = X_train.shape[1]
bar_positions = range(n_columns)

plt.figure()
plt.title("Feature importances")
plt.bar(bar_positions, importances[indices],
        color="r", yerr=std[indices], align="center")
plt.xticks(bar_positions, indices)
plt.xlim([-1, n_columns])
plt.show()

# Construct classifier

In [None]:
import numpy as np
from scipy import interp
import matplotlib.pyplot as plt
from itertools import cycle

from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier

# #############################################################################
# Data preparation

# Start from the training features and target.
X = X_train
y = y_train
n_samples, n_features = X.shape

# Add noisy features
# NOTE(review): this appends 200 * n_features columns of Gaussian noise to the
# real features. It multiplies the memory footprint and likely lowers the
# measured AUC; confirm it is intended for this data set.
random_state = np.random.RandomState(0)
X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]

# #############################################################################
# Classification and ROC analysis

# Run classifier with cross-validation and plot ROC curves
cv = StratifiedKFold(n_splits=3)
classifier = XGBClassifier()

tprs = []
aucs = []
# Common false-positive-rate grid on which every fold's curve is resampled.
mean_fpr = np.linspace(0, 1, 100)

for i, (train, test) in enumerate(cv.split(X, y)):
    # X is a NumPy array after np.c_, but y is still a pandas Series: .iloc
    # makes the fold positions positional instead of index-label lookups.
    probas_ = classifier.fit(X[train], y.iloc[train]).predict_proba(X[test])
    # Compute ROC curve and area under the curve
    fpr, tpr, _ = roc_curve(y.iloc[test], probas_[:, 1])
    # np.interp is called directly; scipy.interp is only a deprecated alias
    # of the same function.
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    tprs[-1][0] = 0.0
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    plt.plot(fpr, tpr, lw=1, alpha=0.3,
             label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))

# Diagonal reference line.
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
         label='Luck', alpha=.8)

# Mean curve over the folds, forced to end at TPR = 1.
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
plt.plot(mean_fpr, mean_tpr, color='b',
         label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
         lw=2, alpha=.8)

# Band of one standard deviation around the mean curve, clipped to [0, 1].
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                 label=r'$\pm$ 1 std. dev.')

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

In [None]:
from xgboost import XGBClassifier

# Fit the final model once on the full training set.
clf = XGBClassifier()
clf.fit(X_train, y_train)

X_test = pd.read_csv('data/test_set.csv', sep=';')
X_test = create_extra_features(X_test)

# The training frame receives these calendar columns after
# create_extra_features; the test frame needs them too, and they must be built
# before the raw date columns are dropped.
# NOTE(review): keep these definitions identical to the ones applied to the
# training frame.
X_test['fecha_reserva_weekday'] = X_test['fecha_reserva'].dt.weekday
X_test['fecha_reserva_day'] = X_test['fecha_reserva'].dt.day

X_test['checkin_weekday'] = X_test['check-in'].dt.weekday
X_test['checkin_month'] = X_test['check-in'].dt.month
X_test['checkin_day'] = X_test['check-in'].dt.day

X_test['checkout_weekday'] = X_test['check-out'].dt.weekday
X_test['checkout_month'] = X_test['check-out'].dt.month
X_test['checkout_day'] = X_test['check-out'].dt.day

X_test = remove_non_usefull_features(X_test)

# Feed the model exactly the columns it was fitted on, in the same order.
# A KeyError here means the test set lacks one of the training features.
X_test_features = X_test[X_train.columns]

y_test_prob = clf.predict_proba(X_test_features)
y_test = clf.predict(X_test_features)

# Column 1 of predict_proba holds the probability of the positive class.
y_pred = y_test_prob[:, 1]

X_test['probabilidad'] = y_pred
X_test[['index', 'probabilidad']].to_csv('results.csv', index=False, sep=',')

X_test.describe().T