In [1]:
import numpy as np
import pandas as pd

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [2]:
def create_time_feature(df: pd.DataFrame, year: pd.Series, month: pd.Series, date: pd.Series) -> pd.Series:
    data = df.copy()
    data["date"] = pd.to_datetime(year.astype(str) + month + date.astype(str), format="%Y%B%d")
    data = pd.get_dummies(data, columns=["arrival_date_month"])
    data["weekday"] = data["date"].dt.weekday
    data["is_weekend"] = ((data["weekday"] == 5) | (data["weekday"] == 6)).astype("float")
    data = data.drop(["arrival_date_year", "arrival_date_day_of_month", "weekday"], axis=1)
    
    return data


def country_adr_level(scaler, traindf):
    tmp = scaler[["country", "adr"]].groupby("country").agg({'country': 'count', 'adr': 'mean'})
    lowAdrCountry = set(tmp[tmp.adr < 80]["country"].index)
    midAdrCountry = set(tmp[(tmp.adr <= 90) & (tmp.adr >= 80)]["country"].index)
    highAdrCountry = set(tmp[tmp.adr > 90]["country"].index)
    
    traindf["lowAdrCountry"] = traindf["country"].apply(lambda x: 1 if x in lowAdrCountry else 0)
    traindf["midAdrCountry"] = traindf["country"].apply(lambda x: 1 if x in midAdrCountry else 0)
    traindf["highAdrCountry"] = traindf["country"].apply(lambda x: 1 if x in highAdrCountry else 0)
    traindf = traindf.drop(["country"], axis=1)
    
    return traindf


def agent_adr_level(scaler, traindf):
    tmp = scaler[["agent", "adr"]].groupby("agent").agg({'agent': 'count', 'adr': 'mean'})
    lowAdrCountry = set(tmp[tmp.adr < 100]["agent"].index)
    highAdrCountry = set(tmp[tmp.adr >= 100]["agent"].index)
    
    traindf["lowAdrCountry"] = traindf["agent"].apply(lambda x: 1 if x in lowAdrCountry else 0)
    traindf["highAdrCountry"] = traindf["agent"].apply(lambda x: 1 if x in highAdrCountry else 0)
    traindf = traindf.drop(["agent"], axis=1)
    
    return traindf


def showNA(df):
    tmp = pd.to_numeric((df.isnull().sum() / df.shape[0]).map('{:,.2f}'.format))
    print(tmp[tmp > 0])
    

def fill_missing_col(traindf, testdf):
    missingCol = list(set(traindf.columns) - set(testdf.columns))
    duplicatedCol = list(set(testdf.columns) - set(traindf.columns))
    
    for col in missingCol:
        testdf[col] = 0
            
    for col in duplicatedCol:
        testdf = testdf.drop([col], axis=1)
            
    testdf = testdf[traindf.columns]
    return testdf


def reorder_column(df, testdf, col):
    return testdf[df.drop([col], axis=1).columns]


def feature_transform(scaler, data):
    df = data.copy()
    df["children"] = df["children"].fillna(0)
    df["country"] = df["country"].fillna(df["country"].mode().index[0])
    df["agent"] = df["agent"].fillna("0")
    df = df.drop(["company"], axis=1)
    df = create_time_feature(df, df["arrival_date_year"],
                             df["arrival_date_month"], df["arrival_date_day_of_month"])
    df["hotel"] = df["hotel"].map({"Resort Hotel": 0, "City Hotel": 1})
    df["is_same_room"] = (df["reserved_room_type"] == df["assigned_room_type"]).map({True: 1, False: 0})
    reserved = pd.get_dummies(df.reserved_room_type, prefix="reserved")
    assigned = pd.get_dummies(df.assigned_room_type, prefix="assigned")
    df = pd.concat([df, reserved, assigned], axis=1, join="inner")
    df = df.drop(["reserved_room_type", "assigned_room_type"], axis=1)
    df["total_nights"] = df["stays_in_weekend_nights"] + df["stays_in_week_nights"]
    df = df[df["total_nights"] > 0]
    df["long_stay"] = (df["total_nights"] > 5).astype("float")
    df["total_customers"] = df["adults"] + df["children"] + df["babies"]
    df = df[df["total_customers"] <= 5]
    df["children"] = df["children"] + df["babies"]
    df = df.drop(["babies"], axis=1)
    df = country_adr_level(scaler, df)
    df["total_previos_booking"] = df["previous_cancellations"] + df["previous_bookings_not_canceled"]
    df = agent_adr_level(scaler, df)
    
    
    return df

In [3]:
traindf = pd.read_csv('data/train.csv', index_col="ID")
traindf = traindf.drop(['reservation_status', 'reservation_status_date'], axis=1)
traindf = traindf[(traindf["adr"] > 0) & (traindf["adr"] < 400)]
traindf = traindf[traindf["is_canceled"] == 0]

scaler = traindf.copy()
traindf = feature_transform(scaler, traindf)
traindf = pd.get_dummies(traindf)

In [4]:
traindf.head()

Unnamed: 0_level_0,hotel,is_canceled,lead_time,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,children,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests,date,arrival_date_month_April,arrival_date_month_August,arrival_date_month_December,arrival_date_month_February,arrival_date_month_January,arrival_date_month_July,arrival_date_month_June,arrival_date_month_March,arrival_date_month_May,arrival_date_month_November,arrival_date_month_October,arrival_date_month_September,is_weekend,is_same_room,reserved_A,reserved_B,reserved_C,reserved_D,reserved_E,reserved_F,reserved_G,reserved_H,reserved_L,assigned_A,assigned_B,assigned_C,assigned_D,assigned_E,assigned_F,assigned_G,assigned_H,assigned_I,assigned_K,total_nights,long_stay,total_customers,lowAdrCountry,midAdrCountry,highAdrCountry,total_previos_booking,meal_BB,meal_FB,meal_HB,meal_SC,meal_Undefined,market_segment_Aviation,market_segment_Complementary,market_segment_Corporate,market_segment_Direct,market_segment_Groups,market_segment_Offline TA/TO,market_segment_Online TA,distribution_channel_Corporate,distribution_channel_Direct,distribution_channel_GDS,distribution_channel_TA/TO,distribution_channel_Undefined,deposit_type_No Deposit,deposit_type_Non Refund,deposit_type_Refundable,customer_type_Contract,customer_type_Group,customer_type_Transient,customer_type_Transient-Party
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,1,0,257,27,0,2,1,0.0,0,0,0,1,0,75.052227,0,0,2015-07-01,0,0,0,0,0,1,0,0,0,0,0,0,0.0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0.0,1.0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0
2,1,0,257,27,0,2,2,0.0,0,0,0,0,0,74.546401,0,0,2015-07-01,0,0,0,0,0,1,0,0,0,0,0,0,0.0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0.0,2.0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0
3,1,0,257,27,0,2,2,0.0,0,0,0,0,0,76.376288,0,0,2015-07-01,0,0,0,0,0,1,0,0,0,0,0,0,0.0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0.0,2.0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0
4,1,0,257,27,0,2,2,0.0,0,0,0,0,0,49.411647,0,0,2015-07-01,0,0,0,0,0,1,0,0,0,0,0,0,0.0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0.0,2.0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0
5,1,0,257,27,0,2,2,0.0,0,0,0,0,0,92.832887,0,0,2015-07-01,0,0,0,0,0,1,0,0,0,0,0,0,0.0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0.0,2.0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0


In [5]:
testdf = pd.read_csv('data/test.csv', index_col="ID")
testCancelLabel = pd.read_csv("data/test_iscancel.csv")
testdf["is_canceled"] = testCancelLabel.values
testdf = testdf[testdf["is_canceled"] == 0]

In [6]:
testdf = feature_transform(scaler, testdf)
testdf = pd.get_dummies(testdf)
testdf = fill_missing_col(traindf, testdf)

In [7]:
traindf.to_csv("data/train_for_predict_adr.csv", index=False)
testdf.to_csv("data/test_for_predict_adr.csv", index=False)