In [1]:
%matplotlib inline

import pandas as pd
from datetime import timedelta
import matplotlib.pyplot as plt

In [2]:
# Load the training CSV; every later cell in this notebook derives from `df`.
train_csv_path = "../data/train.csv"
df = pd.read_csv(train_csv_path)

In [3]:
# Columns that none of the later cells shown in this notebook reference.
# They are removed from `df` in place.
unused_columns = [
    "ID",
    "lead_time",
    "arrival_date_week_number",
    "country",
    "market_segment",
    "distribution_channel",
    "is_repeated_guest",
    "previous_cancellations",
    "previous_bookings_not_canceled",
    "reserved_room_type",
    "assigned_room_type",
    "booking_changes",
    "deposit_type",
    "agent",
    "company",
    "days_in_waiting_list",
    "customer_type",
    "required_car_parking_spaces",
    "total_of_special_requests",
]
df.drop(columns=unused_columns, inplace=True)

In [4]:
def create_datetime(year: pd.Series, month: pd.Series, date: pd.Series) -> pd.Series:
    """Combine separate year / month-name / day-of-month series into datetimes.

    Args:
        year: Series of years; cast to ``str`` before concatenation.
        month: Series of month names, parsed with the ``%B`` directive
            (full month name), so values are expected to be strings.
        date: Series of day-of-month values; cast to ``str`` before
            concatenation.

    Returns:
        Series of datetimes parsed from the concatenated
        ``<year><month><day>`` text using the format ``"%Y%B%d"``.
    """
    year_text = year.astype(str)
    day_text = date.astype(str)
    # Element-wise string concatenation, e.g. "2015" + "July" + "1".
    stamp_text = year_text + month + day_text
    return pd.to_datetime(stamp_text, format="%Y%B%d")

In [5]:
# Collapse the three arrival_date_* parts into a single datetime column named
# "date", then remove the parts. reservation_status_date is kept because a
# later cell still reads it.
arrival_part_columns = ["arrival_date_year", "arrival_date_month", "arrival_date_day_of_month"]
df["date"] = create_datetime(
    year=df["arrival_date_year"],
    month=df["arrival_date_month"],
    date=df["arrival_date_day_of_month"],
)
df.drop(columns=arrival_part_columns, inplace=True)

In [6]:
# Parse the status date and move it back by one day. The inclusive date range
# built from it in a later cell therefore stops the day before the status date.
parsed_status_dates = pd.to_datetime(df['reservation_status_date'], format="%Y-%m-%d")
df['reservation_status_date'] = parsed_status_dates - timedelta(days=1)

In [7]:
%%time

# Replace each row's arrival date with the daily range running from that date
# to the row's (already shifted) reservation_status_date, both ends inclusive.
# The next cell explodes these ranges into one row per day.
range_starts = pd.to_datetime(df['date'])
range_ends = pd.to_datetime(df['reservation_status_date'])
df['date'] = [
    pd.date_range(first_day, last_day, freq='d')
    for first_day, last_day in zip(range_starts, range_ends)
]

Wall time: 12.1 s


In [8]:
# One row per (booking, day): expand the per-row date ranges, discard the
# no-longer-needed status date, and renumber the rows from zero.
df = (
    df
    .explode("date")
    .drop(columns=["reservation_status_date"])
    .reset_index(drop=True)
)

In [9]:
# Total nights per row = weekend nights + week nights.
weekend_nights = df["stays_in_weekend_nights"]
week_nights = df["stays_in_week_nights"]
df["stay_nights"] = weekend_nights + week_nights

In [10]:
# Build the training frame: keep rows with a non-zero stay, remove the night
# counters and the is_canceled flag, discard any row that still has a missing
# value, and order the result chronologically.
has_nights = df["stay_nights"] != 0
traindf = (
    df[has_nights]
    .drop(columns=["stays_in_week_nights", "stays_in_weekend_nights", "is_canceled"])
    .dropna()
    .sort_values(by=['date'])
)

In [11]:
# Load the per-date labels and parse arrival_date (YYYY-MM-DD) as datetime.
label_csv_path = "../data/train_label.csv"
targetdf = pd.read_csv(label_csv_path)
parsed_arrival_dates = pd.to_datetime(targetdf['arrival_date'], format="%Y-%m-%d")
targetdf["arrival_date"] = parsed_arrival_dates

In [12]:
# Give the label frame the same key column name ("date") as the feature frame
# so the two can be merged on it.
targetdf.rename({"arrival_date": "date"}, axis="columns", inplace=True)

In [13]:
# reservation_status and the helper stay_nights column are not used as
# features; remove them from traindf in place.
non_feature_columns = ["reservation_status", "stay_nights"]
traindf.drop(columns=non_feature_columns, inplace=True)

In [14]:
# One-hot encode the remaining categorical columns. The "hotel_Resort Hotel"
# indicator is dropped, leaving "hotel_City Hotel" as the single hotel flag
# that the later grouping and split cells rely on.
traindf = pd.get_dummies(traindf).drop(columns=["hotel_Resort Hotel"])

In [15]:
# Sum every feature column per (date, hotel flag) pair, then turn the group
# keys back into ordinary columns.
group_keys = ["date", "hotel_City Hotel"]
traindf = (
    traindf
    .groupby(group_keys)
    .agg("sum")
    .reset_index()
)

In [16]:
# Split the daily aggregates by hotel using the one-hot flag:
# 0 -> resort rows, 1 -> city rows.
hotel_flag = traindf["hotel_City Hotel"]
resort = traindf[hotel_flag == 0]
city = traindf[hotel_flag == 1]

In [17]:
# Put both hotels' daily aggregates on one row per date. The inner join keeps
# only dates present for both hotels; overlapping column names receive the
# _city / _resort suffixes, and the now-constant hotel flags are dropped.
hotel_flag_columns = ["hotel_City Hotel_city", "hotel_City Hotel_resort"]
traindf = (
    city
    .merge(resort, how="inner", on="date", suffixes=["_city", "_resort"])
    .drop(columns=hotel_flag_columns)
)

In [18]:
# Display only: shows a date-sorted copy of traindf. The result is not
# assigned, so traindf itself is left unchanged by this cell.
traindf.sort_values(by=["date"])

Unnamed: 0,date,adults_city,children_city,babies_city,adr_city,meal_BB_city,meal_FB_city,meal_HB_city,meal_SC_city,meal_Undefined_city,adults_resort,children_resort,babies_resort,adr_resort,meal_BB_resort,meal_FB_resort,meal_HB_resort,meal_SC_resort,meal_Undefined_resort
0,2015-07-01,116,0.0,0,4606.847778,2,0,63,0,0,64,2.0,0,2524.922903,31,1,3,0,0
1,2015-07-02,117,0.0,0,4667.619385,2,0,64,0,0,125,4.0,0,4982.666867,51,2,10,0,0
2,2015-07-03,24,0.0,0,944.763400,2,0,10,1,0,161,5.0,0,6477.237311,64,1,15,0,0
3,2015-07-04,40,0.0,0,1433.803422,11,0,10,1,0,216,10.0,2,8460.044558,83,2,22,0,0
4,2015-07-05,23,0.0,0,705.340863,11,0,0,1,0,239,13.0,2,9936.871386,91,1,29,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
646,2017-04-07,5,1.0,0,295.232383,3,0,0,0,0,16,2.0,0,74.843755,5,0,2,2,0
647,2017-04-08,1,0.0,0,117.921205,1,0,0,0,0,14,2.0,0,100.678591,5,0,2,1,0
648,2017-04-09,1,0.0,0,117.921205,1,0,0,0,0,10,2.0,0,138.750222,3,0,2,1,0
649,2017-04-10,1,0.0,0,117.921205,1,0,0,0,0,5,0.0,0,-50.090363,2,0,0,1,0


In [19]:
# Display the label frame (columns: date, label) for a visual check.
targetdf

Unnamed: 0,date,label
0,2015-07-01,2.0
1,2015-07-02,1.0
2,2015-07-03,1.0
3,2015-07-04,1.0
4,2015-07-05,1.0
...,...,...
635,2017-03-27,2.0
636,2017-03-28,1.0
637,2017-03-29,2.0
638,2017-03-30,3.0


In [20]:
# Attach the label to each day's features. The inner join on "date" keeps only
# dates that appear in both frames. Note that this rebinds `df`.
df = traindf.merge(targetdf, how="inner", on="date")

In [21]:
# Keep only the calendar month (1-12) of each row as a feature, then remove the
# full date column. The earlier intermediate conversion of "date" to Python
# date objects was removed: the column is dropped immediately afterwards, so
# that conversion had no effect on the result.
df["month"] = pd.to_datetime(df["date"]).dt.month
df.drop(columns=["date"], inplace=True)

In [22]:
# One-hot encode the month into month_<n> indicator columns, then split the
# frame into features (every column except "label") and the target.
# NOTE(review): the following cell reassigns X_train and y_train.
df = pd.get_dummies(df, columns=["month"])
X_train = df.drop(columns=["label"])
y_train = df["label"]

In [23]:
# Restrict the feature matrix to the one-hot month indicators only.
# The columns are selected by their "month_" prefix instead of a positional
# slice (previously columns[19:]). A positional slice silently picks the wrong
# columns, and could even include "label", if the number of upstream feature
# columns ever changes.
# NOTE(review): this discards the per-hotel booking features built above;
# confirm that a month-only model is intended.
month_columns = [name for name in df.columns if str(name).startswith("month_")]
X_train, y_train = df[month_columns], df["label"]
X_train

Unnamed: 0,month_1,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
0,0,0,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
635,0,0,1,0,0,0,0,0,0,0,0,0
636,0,0,1,0,0,0,0,0,0,0,0,0
637,0,0,1,0,0,0,0,0,0,0,0,0
638,0,0,1,0,0,0,0,0,0,0,0,0


In [24]:
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

# Score a Lasso regressor with its default parameters using 5-fold
# cross-validation. The scorer is the negated mean absolute error, so each
# fold's score is <= 0 and values closer to zero are better.
model = Lasso()
scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring="neg_mean_absolute_error",
    cv=5,
)

In [25]:
# Average of the per-fold scores (negated MAE, so closer to zero is better).
scores.mean()

-1.23775634765625