In [3]:
import warnings

warnings.filterwarnings('ignore')

In [4]:
%matplotlib inline

import pickle
import random as rnd
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sn
import seaborn as sns
import sklearn
import statsmodels.api as sm
import xgboost as xgb
from azureml.core.authentication import InteractiveLoginAuthentication
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.widgets import RunDetails
from matplotlib import pyplot
from matplotlib.font_manager import FontProperties
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import (LinearDiscriminantAnalysis,
                                           QuadraticDiscriminantAnalysis)
from sklearn.ensemble import (AdaBoostClassifier, BaggingClassifier,
                              ExtraTreesClassifier, GradientBoostingClassifier,
                              RandomForestClassifier, VotingClassifier)
from sklearn.feature_selection import (RFE, SelectFromModel, SelectKBest,
                                       VarianceThreshold, chi2)
from sklearn.inspection import permutation_importance
from sklearn.linear_model import (LinearRegression, LogisticRegression, PassiveAggressiveClassifier,
                                  Perceptron, RidgeClassifier, SGDClassifier)
from sklearn.metrics import (accuracy_score, balanced_accuracy_score,
                             classification_report, confusion_matrix, log_loss,
                             precision_score, recall_score, roc_auc_score)
from sklearn.model_selection import (GridSearchCV, KFold, RandomizedSearchCV,
                                     ShuffleSplit, StratifiedKFold,
                                     cross_val_score, train_test_split)
from sklearn.naive_bayes import (BernoulliNB, ComplementNB, GaussianNB,
                                 MultinomialNB)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (Binarizer, LabelEncoder, MinMaxScaler,
                                   Normalizer, OneHotEncoder,
                                   PolynomialFeatures, StandardScaler,
                                   normalize)
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from lightgbm import LGBMClassifier
from math import exp, log

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [39]:
traindf = pd.read_csv("data/train_for_predict_adr.csv")
train_label = pd.read_csv("data/train_label.csv")

In [40]:
traindf["revenue"] = traindf["adr"] * traindf["total_nights"]
revenue = traindf.groupby("date")["revenue"].agg("sum")

In [41]:
train_label = train_label.merge(revenue, how="left", left_on="arrival_date", right_on="date")
# train_label["lnrevenue"] = train_label["revenue"].apply(lambda x: log(x))

In [42]:
train_label.groupby(["label"])["revenue"].agg("mean")

label
0.0     6819.151760
1.0    15403.070536
2.0    24462.025740
3.0    34960.375126
4.0    44536.673602
5.0    54781.168893
6.0    64363.154735
7.0    78092.052308
8.0    86142.880574
9.0    95449.120935
Name: revenue, dtype: float64

In [43]:
new_train_x = train_label.drop(["arrival_date", "label"], axis=1)
new_train_y = train_label["label"]

In [44]:
classifiers = [
               DecisionTreeClassifier(),
               XGBClassifier(),
               ExtraTreesClassifier(),
               RandomForestClassifier(),
               RidgeClassifier(),
               QuadraticDiscriminantAnalysis(),
               ComplementNB(),
               BernoulliNB(),
               GaussianNB(),
               MultinomialNB(),
               LinearDiscriminantAnalysis(),
               SGDClassifier(),
               AdaBoostClassifier(),
               BaggingClassifier(),
               LinearSVC(),
               KNeighborsClassifier(),
               XGBClassifier(),
               XGBRFClassifier(),
               LGBMClassifier()
               ]

from time import time

for clf in classifiers:
    
    before = time()
    print("="*30)
    print(clf.__class__.__name__)
    
    skf = StratifiedKFold(n_splits=5, shuffle=True)
    crossValScore = cross_val_score(clf, new_train_x, new_train_y, scoring='neg_mean_absolute_error', cv=skf)  
    crossValScore_mean = abs(np.mean(crossValScore)) 
    print("mean_absolute_error: {:.4}".format(crossValScore_mean))
    after = time()
    print(f"花費{(after - before):.1f}s")
    
print("="*30)

DecisionTreeClassifier
mean_absolute_error: 0.03438
花費0.0s
XGBClassifier
mean_absolute_error: 0.02656
花費0.4s
ExtraTreesClassifier
mean_absolute_error: 0.02969
花費0.5s
RandomForestClassifier
mean_absolute_error: 0.03125
花費0.6s
RidgeClassifier
mean_absolute_error: 0.4891
花費0.0s
QuadraticDiscriminantAnalysis
mean_absolute_error: nan
花費0.0s
ComplementNB
mean_absolute_error: 2.023
花費0.0s
BernoulliNB
mean_absolute_error: 1.086
花費0.0s
GaussianNB
mean_absolute_error: 0.04375
花費0.0s
MultinomialNB
mean_absolute_error: 1.086
花費0.0s
LinearDiscriminantAnalysis
mean_absolute_error: 0.04531
花費0.0s
SGDClassifier
mean_absolute_error: 2.752
花費0.1s
AdaBoostClassifier
mean_absolute_error: 0.4766
花費0.4s
BaggingClassifier
mean_absolute_error: 0.02969
花費0.1s
LinearSVC
mean_absolute_error: 2.148
花費0.4s
KNeighborsClassifier
mean_absolute_error: 0.03438
花費0.0s
XGBClassifier
mean_absolute_error: 0.02969
花費0.4s
XGBRFClassifier
mean_absolute_error: 0.02969
花費0.3s
LGBMClassifier
mean_absolute_error: 0.05156
花費1.5s


In [45]:
X_train, X_test, y_train, y_test = train_test_split(new_train_x, new_train_y, test_size=0.3)
model = XGBRFClassifier()
model = model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f"Mean absolute error: {metrics.mean_absolute_error(y_test, y_pred):.2f}")

Mean absolute error: 0.03


In [60]:
testdf = pd.read_csv("data/test_for_predict_adr.csv")
test_label = pd.read_csv("data/test_nolabel.csv")
test_adr = pd.read_csv("data/test_adr.csv")

In [61]:
testdf["adr"] = test_adr.values
testdf["revenue"] = testdf["adr"] * testdf["total_nights"]
revenue = testdf.groupby("date")["revenue"].agg("sum")
test_label = test_label.merge(revenue, how="left", left_on="arrival_date", right_on="date")

In [63]:
test_x = test_label.drop(["arrival_date"], axis=1)

In [66]:
test_label["label"] = model.predict(test_x)

In [76]:
test_label[["arrival_date", "label"]].to_csv("answer/answerv2.csv", index=False)