In [220]:
import pickle
import random as rnd
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sn
import seaborn as sns
import sklearn
import statsmodels.api as sm
import xgboost as xgb
from matplotlib import pyplot
from matplotlib.font_manager import FontProperties
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import (LinearDiscriminantAnalysis,
                                           QuadraticDiscriminantAnalysis)
from sklearn.ensemble import (AdaBoostClassifier, BaggingClassifier,
                              ExtraTreesClassifier, GradientBoostingClassifier,
                              RandomForestClassifier, VotingClassifier)
from sklearn.feature_selection import (RFE, SelectFromModel, SelectKBest,
                                       VarianceThreshold, chi2)
from sklearn.inspection import permutation_importance
from sklearn.linear_model import (LinearRegression, LogisticRegression, PassiveAggressiveClassifier,
                                  Perceptron, RidgeClassifier, SGDClassifier)
from sklearn.metrics import (accuracy_score, balanced_accuracy_score,
                             classification_report, confusion_matrix, log_loss,
                             precision_score, recall_score, roc_auc_score)
from sklearn.model_selection import (GridSearchCV, KFold, RandomizedSearchCV,
                                     ShuffleSplit, StratifiedKFold,
                                     cross_val_score, train_test_split)
from sklearn.naive_bayes import (BernoulliNB, ComplementNB, GaussianNB,
                                 MultinomialNB)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (Binarizer, LabelEncoder, MinMaxScaler,
                                   Normalizer, OneHotEncoder,
                                   PolynomialFeatures, StandardScaler,
                                   normalize)
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Let pandas display up to 500 rows and 500 columns before truncating output.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, 500)

In [234]:
def _read_bookings(csv_path):
    """Read a booking-level CSV and return it with ``ID`` as a regular first column."""
    indexed = pd.read_csv(csv_path, index_col="ID")
    return indexed.reset_index()


# Booking-level tables (one row per ID) and the per-date label tables.
traindf_x = _read_bookings("../data/train.csv")
traindf_y = pd.read_csv("../data/train_label.csv")
testdf_x = _read_bookings("../data/test.csv")
testdf_y = pd.read_csv("../data/test_nolabel.csv")

# Side files providing the `adr` and `is_canceled` columns for the test bookings.
adr = pd.read_csv("../data/TestAdr.csv")
is_canceled = pd.read_csv("../data/TestIsCanceled.csv")

In [237]:
# Attach the `adr` and `is_canceled` columns from the side files to the test bookings.
# Column assignment from a Series aligns on the index, so a row-count mismatch would
# silently produce NaNs or drop values instead of failing -- check it explicitly.
# NOTE(review): this presumes the side files list bookings in the same order as
# test.csv -- confirm against how those files were produced.
assert len(adr) == len(testdf_x), "TestAdr.csv row count differs from test.csv"
assert len(is_canceled) == len(testdf_x), "TestIsCanceled.csv row count differs from test.csv"

testdf_x["adr"] = adr["adr"]
testdf_x["is_canceled"] = is_canceled["is_canceled"]

In [238]:
# Spot-check two randomly chosen training bookings.
traindf_x.sample(n=2)

Unnamed: 0,ID,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
21770,21770,City Hotel,0,41,2015,December,53,30,0,3,2,0.0,0,BB,FRA,Direct,Direct,0,0,0,D,D,0,No Deposit,,,0,Transient,117.159334,0,0,Check-Out,2016-01-02
31850,31850,City Hotel,1,161,2016,March,13,25,1,2,2,0.0,0,HB,PRT,Offline TA/TO,TA/TO,0,0,0,A,A,0,Non Refund,12.0,,101,Transient,116.977717,0,0,Canceled,2016-02-10


In [239]:
def create_datetime(year: pd.Series, month: pd.Series, date: pd.Series) -> pd.Series:
    """Combine year, month-name and day-of-month Series into one datetime Series.

    The three parts are concatenated element-wise without separators and parsed
    with the ``%Y%B%d`` format (four-digit year, full month name, day of month).

    Args:
        year: Year values; converted to ``str`` before concatenation.
        month: Month names as strings (used as-is).
        date: Day-of-month values; converted to ``str`` before concatenation.

    Returns:
        The parsed datetimes as returned by ``pd.to_datetime``.
    """
    year_text = year.astype(str)
    day_text = date.astype(str)
    stamp_text = year_text + month + day_text
    return pd.to_datetime(stamp_text, format="%Y%B%d")

def year_month_to_date(df):
    """Replace the three arrival-date part columns with a single ``date`` column.

    The frame is modified in place: ``date`` is added (built by
    ``create_datetime``) and ``arrival_date_year``, ``arrival_date_month`` and
    ``arrival_date_day_of_month`` are dropped.

    Args:
        df: DataFrame containing the three arrival-date part columns.

    Returns:
        The same ``df`` object, after modification.
    """
    date_parts = ["arrival_date_year", "arrival_date_month", "arrival_date_day_of_month"]
    year_col, month_col, day_col = date_parts
    df["date"] = create_datetime(df[year_col], df[month_col], df[day_col])
    df.drop(columns=date_parts, inplace=True)
    return df

In [240]:
# Collapse the year / month / day columns of both booking tables into `date`.
traindf_x, testdf_x = year_month_to_date(traindf_x), year_month_to_date(testdf_x)

# Parse the label tables' `arrival_date` column into datetimes.
traindf_y = traindf_y.assign(arrival_date=lambda labels: pd.to_datetime(labels["arrival_date"]))
testdf_y = testdf_y.assign(arrival_date=lambda labels: pd.to_datetime(labels["arrival_date"]))

In [241]:
# Spot-check two randomly chosen test bookings after the date conversion.
testdf_x.sample(n=2)

Unnamed: 0,ID,hotel,lead_time,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,required_car_parking_spaces,total_of_special_requests,adr,is_canceled,date
24423,115954,City Hotel,180,32,0,1,2,0.0,0,BB,ESP,Online TA,TA/TO,0,0,0,D,D,0,No Deposit,9.0,,0,Transient,0,0,5.25722,1,2017-08-10
16378,107909,Resort Hotel,123,25,2,2,1,0.0,0,BB,PRT,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,240.0,,0,Transient,0,1,43.316644,1,2017-06-24


In [242]:
def stay_nights(df):
    """Return the element-wise sum of weekend and week nights for each row.

    Args:
        df: Frame with ``stays_in_weekend_nights`` and ``stays_in_week_nights`` columns.

    Returns:
        ``stays_in_weekend_nights + stays_in_week_nights``.
    """
    weekend_nights = df["stays_in_weekend_nights"]
    weekday_nights = df["stays_in_week_nights"]
    return weekend_nights + weekday_nights

# Add the total-nights column, then keep only bookings whose total is not zero.
traindf_x = (
    traindf_x
    .assign(stay_nights=stay_nights)
    .loc[lambda bookings: bookings["stay_nights"].ne(0)]
)
testdf_x = (
    testdf_x
    .assign(stay_nights=stay_nights)
    .loc[lambda bookings: bookings["stay_nights"].ne(0)]
)

In [243]:
def revenue(df):
    """Return ``adr`` multiplied by ``stay_nights`` for each row.

    Args:
        df: Frame with ``adr`` and ``stay_nights`` columns.

    Returns:
        The element-wise product ``adr * stay_nights``.
    """
    nightly_rate = df["adr"]
    nights = df["stay_nights"]
    return nightly_rate * nights

# Per-booking revenue, computed from `adr` as it stands at this point in the notebook.
# NOTE(review): no later visible cell reads `revenue`; the daily aggregate sums
# `adr` instead -- confirm that is intended.
traindf_x = traindf_x.assign(revenue=revenue)
testdf_x = testdf_x.assign(revenue=revenue)

In [244]:
def true_adr(df):
    """Return ``adr`` scaled by ``1 - is_canceled`` for each row.

    With a 0/1 ``is_canceled`` flag this keeps ``adr`` where the flag is 0 and
    yields 0 where the flag is 1.

    Args:
        df: Frame with ``adr`` and ``is_canceled`` columns.

    Returns:
        The element-wise product ``adr * (1 - is_canceled)``.
    """
    kept_fraction = 1 - df["is_canceled"]
    return df["adr"] * kept_fraction

# Overwrite `adr` with its cancellation-adjusted value (adr * (1 - is_canceled)).
traindf_x = traindf_x.assign(adr=true_adr)
testdf_x = testdf_x.assign(adr=true_adr)

In [245]:
# Sum `adr` over all bookings that share a `date`, then left-join the daily totals
# onto the label tables by arrival date.  `adr` is selected *before* aggregating so
# that only this one column is summed; summing the whole frame would also aggregate
# every other column (including the text ones) just to discard the result.
traindf = traindf_y.merge(
    traindf_x.groupby("date")["adr"].sum().reset_index(),
    how="left",
    left_on="arrival_date",
    right_on="date",
)
testdf = testdf_y.merge(
    testdf_x.groupby("date")["adr"].sum().reset_index(),
    how="left",
    left_on="arrival_date",
    right_on="date",
)

In [264]:
# Eyeball how the daily `adr` total relates to `label` on 30 random dates.
# Only `label` and `adr` are averaged: selecting them explicitly keeps the
# displayed columns independent of how the installed pandas treats the
# non-numeric `arrival_date` column under a frame-wide mean.
traindf.groupby("date")[["label", "adr"]].mean().sample(30).sort_values(["label", "adr"])

Unnamed: 0_level_0,label,adr
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-12-09,0.0,2896.581008
2015-11-21,0.0,3159.983495
2017-02-01,0.0,3270.636421
2016-01-16,0.0,3355.682151
2015-12-23,1.0,2869.541468
2016-02-04,1.0,4432.964446
2015-11-06,1.0,5152.259409
2015-10-13,1.0,6236.984084
2017-01-14,1.0,7297.197693
2015-07-16,2.0,4642.508637


In [247]:
# Single-feature design matrix: the daily `adr` total.
new_train_x = traindf.loc[:, ["adr"]]

# Target: `label`, with the values 6, 7, 8 and 9 merged into class 6, as int64.
new_train_y = (
    traindf["label"]
    .mask(traindf["label"].isin([6, 7, 8, 9]), 6)
    .astype("int64")
)

Unnamed: 0,adr
0,7189.494350
1,3098.361671
2,3270.453785
3,3383.036737
4,3614.545744
...,...
635,9455.623924
636,4502.192254
637,8861.204290
638,9342.154704


In [248]:
def model_Selection(X=None, y=None, n_splits=5, random_state=0):
    """Cross-validate a fixed list of classifiers and print each one's mean absolute error.

    Every candidate is scored with ``cross_val_score`` using the
    ``neg_mean_absolute_error`` scorer; the absolute value of the mean fold
    score is printed together with the elapsed wall time.

    Args:
        X: Feature matrix. Defaults to the notebook-level ``new_train_x``.
        y: Target labels. Defaults to the notebook-level ``new_train_y``.
        n_splits: Number of stratified folds.
        random_state: Seed for the shuffled fold assignment, so repeated runs
            produce the same folds.

    Returns:
        None. Results are printed.
    """
    from time import perf_counter

    if X is None:
        X = new_train_x
    if y is None:
        y = new_train_y

    classifiers = [XGBClassifier(),
                   ExtraTreesClassifier(),
                   DecisionTreeClassifier(),
                   RandomForestClassifier(),
                   RidgeClassifier(),
                   QuadraticDiscriminantAnalysis(),
                   ComplementNB(),
                   BernoulliNB(),
                   GaussianNB(),
                   MultinomialNB(),
                   LinearDiscriminantAnalysis(),
                   SGDClassifier(),
                   AdaBoostClassifier(),
                   BaggingClassifier(),
                   LogisticRegression(),
                   LinearSVC(),
                   KNeighborsClassifier()
                   ]

    # One seeded splitter shared by all candidates: every classifier is scored
    # on the same folds and the comparison is repeatable between runs.
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    for clf in classifiers:

        before = perf_counter()
        print("="*30)
        print(clf.__class__.__name__)

        crossValScore = cross_val_score(clf, X, y, scoring='neg_mean_absolute_error', cv=skf)
        # The scorer returns negated errors; report the positive mean absolute error.
        crossValScore_mean = abs(np.mean(crossValScore))
        print("Stratified Cross Validation Score, mean_absolute_error: {:.4}".format(crossValScore_mean))
        after = perf_counter()
        # "花費" means "time spent".
        print(f"花費{(after - before):.1f}s")

    print("="*30)
    
# model_Selection()  # uncomment to run the classifier comparison

In [250]:
# Fit linear discriminant analysis on the single `adr` feature, then predict a
# label for every row of the merged test table.
clf = LinearDiscriminantAnalysis().fit(new_train_x, new_train_y)
y_pred = clf.predict(testdf.loc[:, ["adr"]])

In [251]:
# Display the labels predicted for the rows of `testdf`.
y_pred

array([1, 1, 2, 0, 2, 1, 1, 2, 2, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1,
       1, 1, 0, 1, 1, 3, 3, 1, 2, 1, 1, 1, 2, 2, 1, 2, 2, 1, 2, 1, 1, 2,
       2, 1, 1, 1, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1, 1, 2, 2, 2, 1, 2,
       1, 1, 2, 2, 1, 3, 2, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 1, 2,
       2, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 2, 2, 1, 1, 1, 1, 3, 1, 2, 1, 0,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1,
       1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 0, 1, 1, 1, 1, 2, 1, 1, 0, 1],
      dtype=int64)

In [252]:
# Place the predictions next to the test label table (column-wise concat, aligned
# on the index) and write them out under the column names arrival_date / label.
# NOTE(review): the two-entry header assumes `testdf_y` contributes exactly one
# column -- confirm against test_nolabel.csv.
answer_label = pd.DataFrame(y_pred)
submission = pd.concat([testdf_y, answer_label], axis=1)
submission.to_csv("../data/answer.csv", header=["arrival_date", "label"], index=False)