In [45]:
%matplotlib inline

import pickle
import random as rnd
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sn
import seaborn as sns
import sklearn
import statsmodels.api as sm
import xgboost as xgb
from azureml.core.authentication import InteractiveLoginAuthentication
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.widgets import RunDetails
from matplotlib import pyplot
from matplotlib.font_manager import FontProperties
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import (LinearDiscriminantAnalysis,
                                           QuadraticDiscriminantAnalysis)
from sklearn.ensemble import (AdaBoostClassifier, BaggingClassifier,
                              ExtraTreesClassifier, GradientBoostingClassifier,
                              RandomForestClassifier, VotingClassifier)
from sklearn.feature_selection import (RFE, SelectFromModel, SelectKBest,
                                       VarianceThreshold, chi2)
from sklearn.inspection import permutation_importance
from sklearn.linear_model import (LinearRegression, LogisticRegression, PassiveAggressiveClassifier,
                                  Perceptron, RidgeClassifier, SGDClassifier)
from sklearn.metrics import (accuracy_score, balanced_accuracy_score,
                             classification_report, confusion_matrix, log_loss,
                             precision_score, recall_score, roc_auc_score)
from sklearn.model_selection import (GridSearchCV, KFold, RandomizedSearchCV,
                                     ShuffleSplit, StratifiedKFold,
                                     cross_val_score, train_test_split)
from sklearn.naive_bayes import (BernoulliNB, ComplementNB, GaussianNB,
                                 MultinomialNB)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (Binarizer, LabelEncoder, MinMaxScaler,
                                   Normalizer, OneHotEncoder,
                                   PolynomialFeatures, StandardScaler,
                                   normalize)
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Raise pandas' display limits so wide/long frames are shown instead of being
# truncated: both the row limit and the column limit are set to 500.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, 500)

In [2]:
# Load the raw training table from a path relative to the notebook's directory.
df = pd.read_csv(filepath_or_buffer="../data/train.csv")

In [3]:
# Remove the columns that no later visible cell reads. Re-assigning the result
# (rather than mutating with inplace=True) keeps the step a plain expression
# and avoids pandas' discouraged in-place API.
# NOTE: re-running this cell on an already-trimmed frame raises KeyError,
# because the listed columns are then gone; re-run the load cell first.
df = df.drop(columns=[
    "ID", "lead_time", "arrival_date_week_number", "country", "market_segment", "distribution_channel",
    "is_repeated_guest", "previous_cancellations", "previous_bookings_not_canceled", "reserved_room_type",
    "assigned_room_type", "booking_changes", "deposit_type", "agent", "company", "days_in_waiting_list",
    "customer_type", "required_car_parking_spaces", "total_of_special_requests",
])

In [4]:
def create_datetime(year: pd.Series, month: pd.Series, date: pd.Series) -> pd.Series:
    """Build a datetime Series from separate year, month-name and day Series.

    Parameters
    ----------
    year : pd.Series
        Year values; converted to text with ``astype(str)``.
    month : pd.Series
        Month values, concatenated as-is, so they must already be strings that
        match ``%B`` (full month name, e.g. ``"July"``).
    date : pd.Series
        Day-of-month values; converted to text with ``astype(str)``.

    Returns
    -------
    pd.Series
        The element-wise concatenation ``<year><month><day>`` parsed with the
        format ``"%Y%B%d"``.
    """
    year_text = year.astype(str)
    day_text = date.astype(str)
    # e.g. "2015" + "July" + "1" -> "2015July1"
    stamp_text = year_text + month + day_text
    return pd.to_datetime(stamp_text, format="%Y%B%d")

In [5]:
# Collapse the three arrival_date_* columns into a single datetime column
# named "date", then drop the now-redundant parts. "reservation_status_date"
# is kept on purpose: later cells read it to build the per-booking date range.
arrival_part_columns = ["arrival_date_year", "arrival_date_month", "arrival_date_day_of_month"]
df["date"] = create_datetime(df["arrival_date_year"], df["arrival_date_month"], df["arrival_date_day_of_month"])
df = df.drop(columns=arrival_part_columns)

In [6]:
# Parse the status date and move it back by one day.
# pd.Timedelta is used because `timedelta` is never imported in this notebook,
# so the previous expression raised NameError on a fresh kernel.
# NOTE: this cell is not idempotent - every re-run subtracts another day.
# Presumably the shift turns a check-out day into the last night stayed;
# confirm against the dataset description.
df['reservation_status_date'] = (
    pd.to_datetime(df['reservation_status_date'], format="%Y-%m-%d") - pd.Timedelta(days=1)
)

In [7]:
%%time

df['date'] = [pd.date_range(s, e, freq='d') for s, e in
              zip(pd.to_datetime(df['date']), pd.to_datetime(df['reservation_status_date']))]
# df = df.explode('date').drop(['reservation_status_date', 'date'], axis=1)

CPU times: user 11.3 s, sys: 125 ms, total: 11.4 s
Wall time: 11.4 s


In [8]:
# Turn every element of the per-row date range into its own row, discard the
# range-end column that is no longer needed, and renumber the index from 0.
df = (
    df.explode("date")
      .drop(columns=["reservation_status_date"])
      .reset_index(drop=True)
)

In [9]:
# Total nights = weekend nights + week nights (element-wise sum of the two columns).
df["stay_nights"] = df["stays_in_weekend_nights"].add(df["stays_in_week_nights"])

In [10]:
# Keep only rows whose stay_nights is non-zero, drop the two night-count
# columns and is_canceled, discard any row containing a missing value,
# and order the result by date.
has_nights = df["stay_nights"].ne(0)
traindf = (
    df[has_nights]
    .drop(columns=["stays_in_week_nights", "stays_in_weekend_nights", "is_canceled"])
    .dropna()
    .sort_values(by=['date'])
)

In [11]:
# Preview the first five rows of the cleaned training frame.
traindf.head(5)

Unnamed: 0,hotel,adults,children,babies,meal,adr,reservation_status,date,stay_nights
1,City Hotel,1,0.0,0,HB,75.052227,Check-Out,2015-07-01,2
216,Resort Hotel,2,0.0,0,BB,73.741907,Check-Out,2015-07-01,4
129,Resort Hotel,2,0.0,0,BB,32.849116,Check-Out,2015-07-01,4
220,Resort Hotel,2,0.0,0,HB,119.570912,Check-Out,2015-07-01,4
124,Resort Hotel,3,0.0,0,BB,42.308216,Check-Out,2015-07-01,5


In [12]:
# Load the label table from a path relative to the notebook's directory.
targetdf = pd.read_csv(filepath_or_buffer="../data/train_label.csv")

In [13]:
# Convert the ISO-formatted (YYYY-MM-DD) arrival_date column to datetime so it
# can be matched against the datetime "date" column of the training frame.
arrival_date_raw = targetdf["arrival_date"]
targetdf["arrival_date"] = pd.to_datetime(arrival_date_raw, format='%Y-%m-%d')

In [38]:
# Attach the label rows to the training rows: inner join of traindf.date with
# targetdf.arrival_date, then drop the reservation_status column.
data = (
    traindf.merge(targetdf, how="inner", left_on="date", right_on="arrival_date")
           .drop(columns=["reservation_status"])
)
data.head()

Unnamed: 0,hotel,adults,children,babies,meal,adr,date,stay_nights,arrival_date,label
0,City Hotel,1,0.0,0,HB,75.052227,2015-07-01,2,2015-07-01,2.0
1,Resort Hotel,2,0.0,0,BB,73.741907,2015-07-01,4,2015-07-01,2.0
2,Resort Hotel,2,0.0,0,BB,32.849116,2015-07-01,4,2015-07-01,2.0
3,Resort Hotel,2,0.0,0,HB,119.570912,2015-07-01,4,2015-07-01,2.0
4,Resort Hotel,3,0.0,0,BB,42.308216,2015-07-01,5,2015-07-01,2.0


In [39]:
# Calendar features derived from the "date" column.
calendar_date = data["date"]
data["month"] = pd.Categorical(calendar_date.dt.month_name())
data["day"] = pd.Categorical(calendar_date.dt.day)
# dayofweek counts Monday as 0, so values 5 and 6 are Saturday and Sunday.
data["isWeekend"] = (calendar_date.dt.dayofweek >= 5).astype(int)
data = data.drop(columns=["arrival_date", "date"])

# Party-size features: "family" counts everyone, "kids" counts children and
# babies; the three source columns are dropped afterwards.
data["family"] = data["adults"] + data["children"] + data["babies"]
data["kids"] = data["children"] + data["babies"]
data = data.drop(columns=["adults", "children", "babies"])

In [40]:
# Show five randomly chosen rows (unseeded, so the selection changes per run).
data.sample(n=5)

Unnamed: 0,hotel,meal,adr,stay_nights,label,month,day,isWeekend,family,kids
138529,City Hotel,HB,139.037292,2,3.0,September,30,0,2.0,0.0
170746,City Hotel,SC,74.349627,7,1.0,January,15,1,2.0,0.0
70732,Resort Hotel,BB,29.773701,4,2.0,April,6,0,2.0,0.0
126653,City Hotel,BB,140.540672,5,3.0,August,31,0,3.0,1.0
122890,Resort Hotel,BB,198.733507,8,7.0,August,22,0,2.0,0.0


In [53]:
# One-hot encode "month" (the only non-numeric column selected here) and split
# the result into a feature matrix and a label vector.
# The dummy dtype is pinned: pandas >= 2.0 defaults to bool dummies, and mixing
# bool with int/float columns makes `.values` fall back to an object array.
# Requesting float64 explicitly guarantees a numeric matrix on every version.
# NOTE: this rebinds `df`, which earlier cells used for the raw bookings frame.
df = pd.get_dummies(data[["month", "isWeekend", "adr", "label"]], dtype=np.uint8)
new_train_x = df.drop(columns="label").to_numpy(dtype=np.float64)
new_train_y = df["label"].to_numpy()

In [55]:
# Show five randomly chosen rows of the encoded frame (unseeded sample).
df.sample(n=5)

Unnamed: 0,isWeekend,adr,label,month_April,month_August,month_December,month_February,month_January,month_July,month_June,month_March,month_May,month_November,month_October,month_September
130158,0,35.163454,3.0,0,0,0,0,0,0,0,0,0,0,0,1
148978,0,108.369482,2.0,0,0,0,0,0,0,0,0,0,0,1,0
121591,0,123.922144,4.0,0,1,0,0,0,0,0,0,0,0,0,0
173354,0,80.106901,1.0,0,0,0,0,1,0,0,0,0,0,0,0
33833,0,78.391426,2.0,0,0,0,0,0,0,0,0,0,0,1,0


In [57]:
from time import time

# Benchmark a set of default-configured classifiers with stratified 5-fold
# cross-validation, reporting mean balanced accuracy and wall time for each.

CV_SEED = 42   # arbitrary fixed seed so the fold assignment is reproducible
N_FOLDS = 5

classifiers = [DecisionTreeClassifier(),
               XGBClassifier(),
               ExtraTreesClassifier(),
               RandomForestClassifier(),
               RidgeClassifier(),
               QuadraticDiscriminantAnalysis(),
               ComplementNB(),
               BernoulliNB(),
               GaussianNB(),
               MultinomialNB(),
               LinearDiscriminantAnalysis(),
               SGDClassifier(),
               AdaBoostClassifier(),
               BaggingClassifier(),
               LinearSVC(),
               KNeighborsClassifier()
               ]

# A single seeded splitter is shared by every model. Previously a fresh,
# unseeded splitter was created per classifier, so each model was scored on
# different random folds and the scores were neither comparable nor repeatable.
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=CV_SEED)

for clf in classifiers:

    before = time()
    print("="*30)
    print(clf.__class__.__name__)

    crossValScore = cross_val_score(clf, new_train_x, new_train_y, scoring='balanced_accuracy', cv=skf)
    crossValScore_mean = np.mean(crossValScore)
    print("Stratified Cross Validation Score, balanced_accuracy: {:.4}".format(crossValScore_mean))
    after = time()
    # The printed prefix is Chinese for "took"; it is runtime output and is kept as-is.
    print(f"花費{(after - before):.1f}s")

print("="*30)

DecisionTreeClassifier
Stratified Cross Validation Score, balanced_accuracy: 0.1998
花費3.1s
XGBClassifier


KeyboardInterrupt: 