In [1]:
import warnings

warnings.filterwarnings('ignore')

In [2]:
%matplotlib inline

import pickle
import random as rnd
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sn
import seaborn as sns
import sklearn
import statsmodels.api as sm
import xgboost as xgb
from azureml.core.authentication import InteractiveLoginAuthentication
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.widgets import RunDetails
from matplotlib import pyplot
from matplotlib.font_manager import FontProperties
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import (LinearDiscriminantAnalysis,
                                           QuadraticDiscriminantAnalysis)
from sklearn.ensemble import (AdaBoostRegressor, BaggingClassifier,
                              BaggingRegressor, ExtraTreesRegressor,
                              GradientBoostingRegressor, RandomForestRegressor,
                              VotingRegressor)
from sklearn.feature_selection import (RFE, SelectFromModel, SelectKBest,
                                       VarianceThreshold, chi2)
from sklearn.inspection import permutation_importance
from sklearn.linear_model import (ElasticNet, HuberRegressor, Lasso,
                                  LinearRegression, RANSACRegressor, Ridge,
                                  SGDRegressor, TheilSenRegressor)
from sklearn.metrics import (accuracy_score, balanced_accuracy_score, mean_absolute_error,
                             classification_report, confusion_matrix, log_loss,
                             precision_score, recall_score, roc_auc_score)
from sklearn.model_selection import (GridSearchCV, KFold, RandomizedSearchCV,
                                     ShuffleSplit, StratifiedKFold,
                                     cross_val_score, cross_validate,
                                     train_test_split)
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (Binarizer, LabelEncoder, MinMaxScaler,
                                   Normalizer, OneHotEncoder,
                                   PolynomialFeatures, StandardScaler,
                                   normalize)
from sklearn.svm import SVR, LinearSVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [3]:
df = pd.read_csv("../data/train_label.csv")

In [4]:
def transform(df):
    df["arrival_date"] = pd.to_datetime(df["arrival_date"], format="%Y-%m-%d")
    df["month"] = pd.Categorical(df["arrival_date"].dt.month_name())
    df["day"] = pd.Categorical(df["arrival_date"].dt.day)
    df["isWeekend"] = ((pd.DatetimeIndex(df["arrival_date"]).dayofweek) // 5 == 1).astype(float)
    df = df.drop(["arrival_date"], axis=1)
    df = pd.get_dummies(df, columns=["month"])
    
    return df

In [5]:
df = transform(df)

new_train_x = df.drop("label", axis=1).values
new_train_y = df["label"].values

scaler = StandardScaler()
scaler = scaler.fit(new_train_x)
new_train_x = scaler.transform(new_train_x)

In [6]:
df.head()

Unnamed: 0,label,day,isWeekend,month_April,month_August,month_December,month_February,month_January,month_July,month_June,month_March,month_May,month_November,month_October,month_September
0,2.0,1,0.0,0,0,0,0,0,1,0,0,0,0,0,0
1,1.0,2,0.0,0,0,0,0,0,1,0,0,0,0,0,0
2,1.0,3,0.0,0,0,0,0,0,1,0,0,0,0,0,0
3,1.0,4,1.0,0,0,0,0,0,1,0,0,0,0,0,0
4,1.0,5,1.0,0,0,0,0,0,1,0,0,0,0,0,0


In [7]:
classifiers = [
#                LinearRegression(),
#                ElasticNet(),
#                Lasso(),
               Ridge(),
#                SVR(),
#                LinearSVR(),
#                HuberRegressor(),
#                AdaBoostRegressor(),
               BaggingRegressor(),
               GradientBoostingRegressor(loss="lad", n_estimators=1000),
#                RandomForestRegressor(),
#                RandomForestRegressor(),
#                KNeighborsRegressor(),
               XGBRegressor()
               ]

from time import time

for clf in classifiers:
    before = time()
    print("="*30)
    print(clf.__class__.__name__)
    
    crossValScore = cross_validate(clf, new_train_x, new_train_y, scoring=('r2', 'neg_mean_absolute_error'), cv=5) 
    for metric, value in crossValScore.items():
        print(f"{metric}: {abs(np.mean(value)):.2f}")
    after = time()
    print(f"花費{(after - before):.1f}s")
    
print("="*30)

Ridge
fit_time: 0.00
score_time: 0.00
test_r2: 0.40
test_neg_mean_absolute_error: 1.09
花費0.0s
BaggingRegressor
fit_time: 0.02
score_time: 0.00
test_r2: 0.62
test_neg_mean_absolute_error: 1.20
花費0.1s
GradientBoostingRegressor
fit_time: 1.19
score_time: 0.00
test_r2: 0.41
test_neg_mean_absolute_error: 1.06
花費6.0s
XGBRegressor
fit_time: 0.03
score_time: 0.00
test_r2: 0.39
test_neg_mean_absolute_error: 1.10
花費0.1s


In [8]:
X_train, X_test, y_train, y_test = train_test_split(new_train_x, new_train_y, test_size=0.3, random_state=1126)
r1 = Ridge()
r2 = GradientBoostingRegressor(loss="lad", n_estimators=1000)
r3 = SVR()
r4 = LinearRegression()
r5 = BaggingRegressor()

clf = VotingRegressor([('ri', r1), ('gb', r2), ('xg', r3), ('r4', r4), ('r5', r5)])
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

mean_absolute_error(y_test, y_pred)

0.8556569156588442

In [9]:
clf = Ridge()
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
mean_absolute_error(y_test, y_pred)

0.8394654257159296

In [10]:
testdf = pd.read_csv("../data/test_nolabel.csv")

In [11]:
testdf = transform(testdf)

In [12]:
for col in df.columns:
    if col not in testdf.columns:
        testdf[col] = 0

testdf = testdf[df.drop("label", axis=1).columns]

In [13]:
testdf.head()

Unnamed: 0,day,isWeekend,month_April,month_August,month_December,month_February,month_January,month_July,month_June,month_March,month_May,month_November,month_October,month_September
0,1,1.0,1,0,0,0,0,0,0,0,0,0,0,0
1,2,1.0,1,0,0,0,0,0,0,0,0,0,0,0
2,3,0.0,1,0,0,0,0,0,0,0,0,0,0,0
3,4,0.0,1,0,0,0,0,0,0,0,0,0,0,0
4,5,0.0,1,0,0,0,0,0,0,0,0,0,0,0


In [14]:
new_train_x = testdf.values

scaler = StandardScaler()
scaler = scaler.fit(new_train_x)
new_train_x = scaler.transform(new_train_x)

In [16]:
np.rint(clf.predict(testdf))

array([2., 2., 2., 2., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 4., 4., 3.,
       4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 5., 5., 2., 2., 2., 3.,
       3., 3., 3., 3., 3., 3., 3., 3., 3., 4., 3., 3., 4., 4., 4., 4., 4.,
       4., 4., 4., 4., 4., 5., 5., 5., 5., 5., 2., 2., 3., 3., 3., 3., 3.,
       3., 3., 3., 3., 3., 3., 3., 3., 3., 4., 4., 4., 4., 4., 4., 4., 4.,
       4., 4., 4., 4., 5., 5., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
       3., 3., 3., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 5.,
       5., 5., 5., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 4., 4., 4.,
       4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 5., 5., 5., 5., 5., 5., 5.])

In [22]:
answer = pd.read_csv("../data/test_nolabel.csv")
answer["label"] = np.rint(clf.predict(testdf)).astype(int)

In [24]:
answer.to_csv("answer v1.csv", index=False)

In [32]:
answer["arrival_date"] = pd.to_datetime(answer["arrival_date"], format="%Y-%m-%d")
answer["arrival_date"].dt.weekday

0      5
1      6
2      0
3      1
4      2
5      3
6      4
7      5
8      6
9      0
10     1
11     2
12     3
13     4
14     5
15     6
16     0
17     1
18     2
19     3
20     4
21     5
22     6
23     0
24     1
25     2
26     3
27     4
28     5
29     6
30     0
31     1
32     2
33     3
34     4
35     5
36     6
37     0
38     1
39     2
40     3
41     4
42     5
43     6
44     0
45     1
46     2
47     3
48     4
49     5
50     6
51     0
52     1
53     2
54     3
55     4
56     5
57     6
58     0
59     1
60     2
61     3
62     4
63     5
64     6
65     0
66     1
67     2
68     3
69     4
70     5
71     6
72     0
73     1
74     2
75     3
76     4
77     5
78     6
79     0
80     1
81     2
82     3
83     4
84     5
85     6
86     0
87     1
88     2
89     3
90     4
91     5
92     6
93     0
94     1
95     2
96     3
97     4
98     5
99     6
100    0
101    1
102    2
103    3
104    4
105    5
106    6
107    0
108    1
109    2
110    3
1