In [1]:
import numpy as np
import pandas as pd

# Raise the display limits so wide/long frames are not truncated when shown.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

In [2]:
def create_time_feature(df: pd.DataFrame, year: pd.Series, month: pd.Series, date: pd.Series) -> pd.DataFrame:
    """Derive calendar features from the arrival year / month / day columns.

    Works on a copy of ``df``; the caller's frame is not modified.

    Parameters
    ----------
    df : frame that contains the columns ``arrival_date_year``,
        ``arrival_date_month`` and ``arrival_date_day_of_month``.
    year : year values; converted to ``str`` before concatenation.
    month : month names as strings (parsed with ``%B``).
    date : day-of-month values; converted to ``str`` before concatenation.

    Returns
    -------
    pd.DataFrame
        The copy with a datetime ``date`` column, one dummy column per value
        of ``arrival_date_month``, a float ``is_weekend`` flag (1.0 when the
        weekday is 5 or 6), and without the ``arrival_date_year`` /
        ``arrival_date_day_of_month`` columns.
    """
    data = df.copy()

    # Build e.g. "2017July1" and parse it with an explicit format.
    stamp = year.astype(str) + month + date.astype(str)
    data["date"] = pd.to_datetime(stamp, format="%Y%B%d")

    data = pd.get_dummies(data, columns=["arrival_date_month"])

    # "weekday" is only a scratch column; it is dropped again below.
    data["weekday"] = data["date"].dt.weekday
    data["is_weekend"] = data["weekday"].isin([5, 6]).astype("float")

    return data.drop(["arrival_date_year", "arrival_date_day_of_month", "weekday"], axis=1)


def country_adr_level(traindf, low_threshold=80, high_threshold=90):
    """Replace ``country`` with three 0/1 flags based on the country's mean ADR.

    The mean of ``adr`` is computed per country from the rows of ``traindf``
    itself. A country is flagged as

    * ``lowAdrCountry``  when its mean is below ``low_threshold``,
    * ``midAdrCountry``  when its mean lies in [``low_threshold``, ``high_threshold``],
    * ``highAdrCountry`` when its mean is above ``high_threshold``.

    Parameters
    ----------
    traindf : DataFrame with ``country`` and ``adr`` columns. It is not
        modified; the result is built on a copy.
    low_threshold, high_threshold : cut-offs for the three bands. The
        defaults (80 / 90) are the values that were previously hard-coded.

    Returns
    -------
    DataFrame
        Copy of ``traindf`` with the three flag columns added and ``country``
        removed.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    result = traindf.copy()

    mean_adr = result.groupby("country")["adr"].mean()
    low_countries = mean_adr.index[mean_adr < low_threshold]
    mid_countries = mean_adr.index[(mean_adr >= low_threshold) & (mean_adr <= high_threshold)]
    high_countries = mean_adr.index[mean_adr > high_threshold]

    result["lowAdrCountry"] = result["country"].isin(low_countries).astype("int64")
    result["midAdrCountry"] = result["country"].isin(mid_countries).astype("int64")
    result["highAdrCountry"] = result["country"].isin(high_countries).astype("int64")

    return result.drop(["country"], axis=1)


def agent_adr_level(traindf, threshold=100, low_col="lowAdrCountry", high_col="highAdrCountry"):
    """Replace ``agent`` with two 0/1 flags based on the agent's mean ADR.

    The mean of ``adr`` is computed per agent from the rows of ``traindf``
    itself. Agents whose mean is below ``threshold`` are flagged in
    ``low_col``; agents whose mean is at or above it are flagged in
    ``high_col``.

    NOTE(review): the default output names are ``lowAdrCountry`` /
    ``highAdrCountry`` — the same names ``country_adr_level`` writes. When
    this function runs after ``country_adr_level`` (as ``feature_transform``
    does), the country-based values in those two columns are overwritten by
    the agent-based ones. The defaults are kept so existing outputs do not
    change; pass e.g. ``low_col="lowAdrAgent", high_col="highAdrAgent"`` to
    keep both sets of flags.

    Parameters
    ----------
    traindf : DataFrame with ``agent`` and ``adr`` columns. It is not
        modified; the result is built on a copy.
    threshold : cut-off between the low and high band (default 100, the
        value that was previously hard-coded).
    low_col, high_col : names of the flag columns to write.

    Returns
    -------
    DataFrame
        Copy of ``traindf`` with the two flag columns set and ``agent``
        removed.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    result = traindf.copy()

    mean_adr = result.groupby("agent")["adr"].mean()
    low_agents = mean_adr.index[mean_adr < threshold]
    high_agents = mean_adr.index[mean_adr >= threshold]

    result[low_col] = result["agent"].isin(low_agents).astype("int64")
    result[high_col] = result["agent"].isin(high_agents).astype("int64")

    return result.drop(["agent"], axis=1)


def showNA(df):
    """Print the per-column fraction of missing values, for non-zero fractions only.

    Fractions are rounded to two decimals (via string formatting) before the
    ``> 0`` filter is applied, so a column whose fraction rounds to 0.00 is
    not printed.
    """
    na_fraction = df.isnull().sum() / df.shape[0]
    rounded = pd.to_numeric(na_fraction.map(lambda frac: '{:,.2f}'.format(frac)))
    has_missing = rounded > 0
    print(rounded[has_missing])
    

def fill_missing_col(df, data):
    """Give ``data`` the same set of columns as ``df`` (ignoring ``adr``).

    Columns of ``df`` (other than ``"adr"``) that ``data`` lacks are added and
    filled with 0; columns of ``data`` that are not among those are removed.

    Parameters
    ----------
    df : reference DataFrame whose columns (minus ``"adr"``) define the
        target set.
    data : DataFrame to align. It is not modified.

    Returns
    -------
    DataFrame
        A new frame holding the surviving columns of ``data`` in their
        original order, followed by the added zero columns in the order they
        appear in ``df``.
    """
    wanted = [col for col in df.columns if col != "adr"]
    wanted_set = set(wanted)
    present = set(data.columns)

    # One boolean mask instead of dropping columns one by one; .copy() keeps
    # the caller's frame untouched when the zero columns are added below.
    aligned = data.loc[:, data.columns.isin(wanted_set)].copy()

    # Iterate in df's column order so the result is deterministic
    # (set iteration order is not).
    added = set()
    for col in wanted:
        if col not in present and col not in added:
            aligned[col] = 0
            added.add(col)

    return aligned


def reorder_column(df, testdf, col):
    """Return ``testdf`` restricted to, and ordered like, ``df``'s columns without ``col``."""
    target_order = df.drop([col], axis=1).columns
    return testdf[target_order]

In [3]:
# Load the training bookings (indexed by ID) and drop the two
# reservation-status columns.
traindf = pd.read_csv('../data/train.csv', index_col="ID").drop(
    ['reservation_status', 'reservation_status_date'], axis=1
)

# Keep only non-canceled bookings with 0 < adr < 400.
valid_adr = (traindf["adr"] > 0) & (traindf["adr"] < 400)
not_canceled = traindf["is_canceled"] == 0
traindf = traindf[valid_adr & not_canceled]

In [4]:
def feature_transform(data):
    """Build the model feature frame from a raw bookings frame.

    Works on a copy of ``data``. Steps, in order:

    1. Fill missing ``children`` with 0, missing ``country`` with the most
       frequent country, missing ``agent`` with ``"0"``; drop ``company``.
    2. Add calendar features via ``create_time_feature``.
    3. Encode ``hotel`` (Resort Hotel -> 0, City Hotel -> 1), add
       ``is_same_room`` and one-hot columns for reserved / assigned room type.
    4. Add ``total_nights`` / ``long_stay`` and drop rows with zero nights.
    5. Add ``total_customers``, drop rows with more than 5 customers, fold
       ``babies`` into ``children``.
    6. Replace ``country`` and ``agent`` with ADR-level flags and add
       ``total_previos_booking``.

    Note that steps 4 and 5 remove rows, so the result can have fewer rows
    than ``data``.

    Returns
    -------
    DataFrame
        The transformed copy.
    """
    df = data.copy()

    # --- missing values -------------------------------------------------
    df["children"] = df["children"].fillna(0)
    # Series.mode() returns the modes as *values* with a positional index, so
    # ``.mode().index[0]`` is always the label 0, not a country. Take the
    # first value instead. An all-missing column has no mode and is left as is.
    country_modes = df["country"].mode()
    if not country_modes.empty:
        df["country"] = df["country"].fillna(country_modes.iloc[0])
    df["agent"] = df["agent"].fillna("0")
    df = df.drop(["company"], axis=1)

    # --- calendar features ----------------------------------------------
    df = create_time_feature(df, df["arrival_date_year"],
                             df["arrival_date_month"], df["arrival_date_day_of_month"])

    # --- hotel / room encoding --------------------------------------------
    df["hotel"] = df["hotel"].map({"Resort Hotel": 0, "City Hotel": 1})
    df["is_same_room"] = (df["reserved_room_type"] == df["assigned_room_type"]).map({True: 1, False: 0})
    reserved = pd.get_dummies(df.reserved_room_type, prefix="reserved")
    assigned = pd.get_dummies(df.assigned_room_type, prefix="assigned")
    df = pd.concat([df, reserved, assigned], axis=1, join="inner")
    df = df.drop(["reserved_room_type", "assigned_room_type"], axis=1)

    # --- stay length ------------------------------------------------------
    df["total_nights"] = df["stays_in_weekend_nights"] + df["stays_in_week_nights"]
    # .copy() so the column assignments below write to an independent frame
    # rather than to a slice of the previous one.
    df = df[df["total_nights"] > 0].copy()
    df["long_stay"] = (df["total_nights"] > 5).astype("float")

    # --- party size -------------------------------------------------------
    df["total_customers"] = df["adults"] + df["children"] + df["babies"]
    df = df[df["total_customers"] <= 5].copy()
    df["children"] = df["children"] + df["babies"]
    df = df.drop(["babies"], axis=1)

    # --- ADR-level flags and booking history --------------------------------
    df = country_adr_level(df)
    df["total_previos_booking"] = df["previous_cancellations"] + df["previous_bookings_not_canceled"]
    # NOTE(review): agent_adr_level writes "lowAdrCountry"/"highAdrCountry",
    # the same names country_adr_level just produced, so those two columns end
    # up holding the agent-based flags. Confirm whether that is intended.
    df = agent_adr_level(df)

    return df

In [None]:
# Engineer features, one-hot encode whatever categorical columns remain,
# then discard the helper "date" column.
traindf = pd.get_dummies(feature_transform(traindf)).drop(["date"], axis=1)

In [None]:
# Load the stored feature table and show whether its shape matches the
# freshly built training frame.
df = pd.read_csv("adr_data.csv")
traindf.shape == df.shape

In [None]:
# Load the test bookings (indexed by ID) and attach the is_canceled values
# from the separate file. The values are assigned by position, not by ID, so
# both files are assumed to list rows in the same order — TODO confirm.
testdf = pd.read_csv('../data/test.csv', index_col="ID")
is_canceled = pd.read_csv("../data/TestIsCanceled.csv")
testdf = testdf.assign(is_canceled=is_canceled["is_canceled"].values)

In [None]:
# Apply the same feature pipeline as for the training data.
testdf = pd.get_dummies(feature_transform(testdf)).drop(["date"], axis=1)

# Align the columns with the reference frame `df` (minus the "adr" target):
# add missing ones as zeros, drop extras, then match the column order.
testdf = reorder_column(df, fill_missing_col(df, testdf), "adr")

In [None]:
# Write the aligned test features to disk; index=False leaves the ID index out of the file.
testdf.to_csv("TestADR.csv", index=False)