In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import ExtraTreesRegressor
import xgboost as xgb

import sys
sys.path.append('../')

from src.functions import Functions
from src.modeling import Modeling

import warnings
warnings.filterwarnings('ignore')

In [2]:
features = pd.read_csv('../data/raw_data/features.csv')
sales_train = pd.read_csv('../data/raw_data/train.csv')
stores = pd.read_csv('../data/raw_data/stores.csv')
sales_test = pd.read_csv('../data/raw_data/test.csv')

features['IsBeforeHoliday'] = features['IsHoliday'].shift(-1)
features['IsBeforeHoliday'].iloc[-1] = False

#### 欠損値の穴埋め

In [3]:
features = Functions.rollingInterpolation(features)

#### train, testのマージ

In [4]:
df_all_train = sales_train.merge(features, how="left", on=["Store","Date","IsHoliday"]).merge(stores, how="left", on=["Store"])
df_all_test = sales_test.merge(features, how="left", on=["Store","Date","IsHoliday"]).merge(stores, how="left", on=["Store"])

df_all_train['train_or_test'] = 'train'
df_all_test['train_or_test'] = 'test'
merged_df = pd.concat([df_all_train, df_all_test], sort=False)

merged_df = merged_df.sort_values(by=['Store','Dept','Date'], axis=0).reset_index().drop(columns='index')

#### カテゴリカルエンコーディング

In [5]:
merged_df = pd.get_dummies(merged_df, columns=["Type"])

LE = LabelEncoder()
merged_df['IsHoliday'] = LE.fit_transform(merged_df['IsHoliday'])

#### 休日の追加

In [6]:
merged_df = Functions.appendHolidayFlag(merged_df)

#### 時間に関する特徴量の追加

In [7]:
merged_df = Functions.addDateFeature(merged_df)

#### ラグ特徴量の追加

In [8]:
merged_df = Functions.createLagFeatures(merged_df)

#### 時間ごとの統計量に関する特徴量の追加

In [9]:
merged_df =  Functions.createDateStatsFeatures(merged_df, 'Monthly_Sales_', ['Store','Dept', 'Month'])
merged_df =  Functions.createDateStatsFeatures(merged_df, 'Daily_Sales_', ['Store','Dept', 'Day'])
merged_df =  Functions.createDateStatsFeatures(merged_df, 'WeekofMonth_Sales_', ['Store','Dept', 'WeekofMonth'])
merged_df =  Functions.createDateStatsFeatures(merged_df, 'WeekofYear_Sales_', ['Store','Dept', 'WeekofYear'])

In [10]:
merged_df = merged_df.fillna(-9999)

In [11]:
from sklearn.cluster import KMeans
import pandas as pd
 
num_clusters = [3, 5, 10, 15, 20]
remove_cols = ['Date', 'Store', 'Dept', 'train_or_test']
cols = list(merged_df.columns[merged_df.isnull().any() == False])

for remove_col in remove_cols:
    cols.remove(remove_col)

for num_cluster in num_clusters:
 
    KM = KMeans(n_clusters = num_cluster, random_state = 0, n_jobs = -1)
    KM.fit(merged_df.loc[:, cols])
    km_pred = KM.predict(merged_df.loc[:, cols])
    km_distance = KM.transform(merged_df.loc[:, cols])
    
    for clst in range(0, num_cluster):
        merged_df['Labels_{nums}_{clst}_distance'.format(nums=num_cluster, clst=clst)] = km_distance[:, clst]

    merged_df['Labels_{}'.format(num_cluster)] = km_pred
    merged_df =  Functions.createDateStatsFeatures(merged_df, 'Labels_{}_Sales_'.format(num_cluster), ['Store','Dept', 'Labels_{}'.format(num_cluster)])

    unique, count = np.unique(km_pred, return_counts=True)
    print('-'*10, num_cluster, ' cluster', '-'*10)
    for i in range(0, len(count)):
        print(unique[i], count[i])

---------- 3  cluster ----------
0 410439
1 22619
2 103576
---------- 5  cluster ----------
0 322081
1 22855
2 56239
3 5372
4 130087
---------- 10  cluster ----------
0 193413
1 17073
2 77698
3 11129
4 115888
5 24686
6 1790
7 34763
8 55463
9 4731
---------- 15  cluster ----------
0 50915
1 9514
2 112625
3 16610
4 32185
5 1469
6 23927
7 2394
8 12741
9 94787
10 5210
11 98164
12 67310
13 4880
14 3903
---------- 20  cluster ----------
0 89804
1 13367
2 1183
3 23390
4 5499
5 47590
6 109043
7 1408
8 3441
9 2975
10 2995
11 17711
12 65234
13 33666
14 7637
15 9926
16 5844
17 10104
18 84882
19 935


In [12]:
train = merged_df.loc[merged_df.train_or_test=='train', :]
test = merged_df.loc[merged_df.train_or_test=='test', :]

#### 学習

In [13]:
xgb_params =  {'n_estimators': [100],     
              'min_child_weight': np.arange(1, 10, 1),
              'max_depth': np.arange(3, 9, 1),
              'subsample': np.arange(0.6, 0.95, 0.05),
              'colsample_bytree': np.arange(0.6, 0.95, 0.05)}

In [None]:
from sklearn.model_selection import RandomizedSearchCV
rgr = xgb.XGBRegressor(random_state=2021)

train = train.drop(['Date', 'train_or_test'], axis=1)
test = test.drop(['Date', 'train_or_test'], axis=1)

pred = []
store_dept_list = test[['Store','Dept']].drop_duplicates().values.tolist()

for tempStore, tempDept in store_dept_list:
    print ('Store : ', tempStore, ' Dept :', tempDept)
    x_train = train[(train['Store'] == tempStore) & (train['Dept'] == tempDept)].drop(['Weekly_Sales'], axis=1)
    y_train = train[(train['Store'] == tempStore) & (train['Dept'] == tempDept)]['Weekly_Sales']
    x_test = test[(test['Store'] == tempStore) & (test['Dept'] == tempDept)].drop(['Weekly_Sales'], axis=1)
 
    if len(x_train) >= 5:

        rgr = RandomizedSearchCV(estimator=rgr, param_distributions = xgb_params, scoring='neg_root_mean_squared_error', 
                                 n_jobs=2, n_iter = 10, cv=5, verbose=5)
        rgr = rgr.fit(x_train, y_train)
        rgr = rgr.best_estimator_.fit(x_train, y_train)
        predict_test = rgr.predict(x_test)
        
        #fig, ax = plt.subplots(1, 1, figsize=(7, 25))
        #xgb.plot_importance(rgr,
        #                    importance_type='gain',
        #                    show_values=True,
        #                   ax=ax)
#
        #plt.show()

    elif len(x_train) >= 1:
        rgr = rgr.fit(x_train, y_train)
        predict_test = rgr.predict(x_test)
        
    else:
        predict_test = np.repeat(np.average(sales_train[sales_train['Dept'] == tempDept]['Weekly_Sales']), len(x_test))

    pred.extend(predict_test)

Store :  1  Dept : 1
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Store :  1  Dept : 2
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Store :  1  Dept : 3
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Store :  1  Dept : 4
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Store :  1  Dept : 5
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Store :  1  Dept : 6
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Store :  1  Dept : 7
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Store :  1  Dept : 8
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Store :  1  Dept : 9
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Store :  1  Dept : 10
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Store :  1  Dept : 11
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Store :  1  Dept : 12
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Store :  1  D

In [None]:
ss = pd.read_csv('../data/raw_data/sampleSubmission.csv')
ss.loc[:, 'Weekly_Sales'] = pred
ss.to_csv('../files/submissions/XGB_rolling_52_104_0613_several_KM.csv', index=False)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(7, 25))
xgb.plot_importance(rgr,
                    importance_type='gain',
                    show_values=True,
                   ax=ax)

plt.show()

In [None]:
from sklearn.cluster import KMeans
import pandas as pd
 
num_cluster=10 # cluster数
 
clusters = KMeans(n_clusters = num_cluster, random_state = 0, n_jobs = -1)
clusters.fit(train.loc[:, train.columns[train.isnull().any() == False]])
centers = clusters.cluster_centers_