### 使用するライブラリ

In [1]:
from  datetime import datetime, timedelta
import gc
import numpy as np, pandas as pd
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

### 作成済みDataFrameの読み込み

In [2]:
# Input locations, all relative to the notebook's working directory.
# The three pickles are loaded with pd.read_pickle in the next cell; the
# sample submission CSV is read just before the submission file is built.
DF_FILE_PATH = '../input/pickle/df_baseline_ver1.pickle.gz'
EVENT_FILE_PATH = '../input/pickle/df_event.pickle.gz'
ITEM_STARTEND_FILE_PATH = '../input/pickle/df_id_d_item_startend.pickle.gz'
SAMPLE_SUBMIT_PATH = '../input/csv/sample_submission.csv.gz'

In [3]:
# Load the pre-built frames: `df` is the main table, `event` is merged onto it
# by 'date', and `item` is merged onto it by ['id', 'd'] in later cells.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
df = pd.read_pickle(DF_FILE_PATH)
event = pd.read_pickle(EVENT_FILE_PATH)
item = pd.read_pickle(ITEM_STARTEND_FILE_PATH)

In [4]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to the narrowest fitting dtype.

    Integer columns are cast to the smallest of int8/int16/int32/int64 whose
    range contains the column's min and max (bounds inclusive, so a column
    holding exactly 127 still fits int8). Float columns are cast to float16
    or float32 when their range fits, otherwise to float64.

    Note that the float check only looks at the value *range*; float16 and
    float32 have fewer significant digits, so values may be rounded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the resulting memory footprint and the reduction.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    # Only columns whose dtype is one of these are considered; int8, unsigned,
    # object, category and datetime columns are left untouched.
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Pick the first (narrowest) integer type that can hold the range.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range does not fit float32, or min/max is NaN/inf so every
                # comparison above was false: keep full precision.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-sized baseline.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [5]:
# Downcast the numeric columns of all three frames (same order as before, so
# the printed report lines keep their order), then release freed memory.
df, event, item = (reduce_mem_usage(frame) for frame in (df, event, item))
gc.collect()

Mem. usage decreased to 4290.31 Mb (0.0% reduction)
Mem. usage decreased to  0.09 Mb (39.5% reduction)
Mem. usage decreased to 636.20 Mb (0.0% reduction)


0

In [6]:
# Parse the event calendar's 'date' column into datetimes before it is used
# as the merge key.
event = event.assign(date=pd.to_datetime(event['date']))

In [7]:
# Attach the event columns to every row of the main table by calendar date.
# A left join keeps rows whose date has no entry in `event`.
df = pd.merge(df, event, on='date', how='left')
del event
gc.collect()

40

In [8]:
# Remove the "_validation" marker from `id` so it matches the `id` key of the
# `item` frame merged in the next cell.
df = df.assign(id=lambda frame: frame['id'].str.replace("_validation", ""))

In [9]:
# Join the per-(id, d) item frame. The inner join drops any row of `df` that
# has no matching (id, d) pair in `item`.
df = pd.merge(df, item, on=['id', 'd'], how='inner')
del item
gc.collect()

53

In [10]:
# Preview the merged table (first five rows).
df.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,demand,part,date,...,PresidentsDay,Purim End,Ramadan starts,StPatricksDay,SuperBowl,Thanksgiving,ValentinesDay,VeteransDay,days_from_release,days_from_finalsales
0,HOUSEHOLD_1_514_WI_3,2506,5,2,9,2,902,16,train,2013-07-18,...,0,0,70,0,0,0,0,0,328,0
1,FOODS_3_169_WI_3,781,2,0,9,2,902,0,train,2013-07-18,...,0,0,70,0,0,0,0,0,782,0
2,FOODS_3_168_WI_3,780,2,0,9,2,902,0,train,2013-07-18,...,0,0,70,0,0,0,0,0,891,0
3,FOODS_3_165_WI_3,777,2,0,9,2,902,1,train,2013-07-18,...,0,0,70,0,0,0,0,0,879,0
4,FOODS_3_163_WI_3,775,2,0,9,2,902,0,train,2013-07-18,...,0,0,70,0,0,0,0,0,901,0


In [11]:
# Summary statistics of the numeric columns, as a sanity check on the merges.
df.describe()

Unnamed: 0,item_id,dept_id,cat_id,store_id,state_id,d,demand,wm_yr_wk,event_name_1,event_type_1,...,PresidentsDay,Purim End,Ramadan starts,StPatricksDay,SuperBowl,Thanksgiving,ValentinesDay,VeteransDay,days_from_release,days_from_finalsales
count,29926990.0,29926990.0,29926990.0,29926990.0,29926990.0,29926990.0,29926990.0,29926990.0,2426986.0,2426986.0,...,29926990.0,29926990.0,29926990.0,29926990.0,29926990.0,29926990.0,29926990.0,29926990.0,29926990.0,29926990.0
mean,1529.958,3.17072,0.877179,4.52437,0.9071364,1439.923,1.262374,11470.07,,,...,8.9197,8.980325,6.967679,8.985097,8.886959,8.773019,8.913274,8.759878,1089.006,1.159946
std,879.9425,1.917944,0.8923588,2.869064,0.8301622,296.2202,3.82438,84.18057,0.0,0.0,...,22.633,22.70092,19.73341,22.70557,22.59887,22.47518,22.62605,22.46288,487.5438,15.27298
min,0.0,0.0,0.0,0.0,0.0,902.0,0.0,11325.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,775.0,2.0,0.0,2.0,0.0,1189.0,0.0,11413.0,6.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,744.0,0.0
50%,1533.0,3.0,1.0,5.0,1.0,1448.0,0.0,11450.0,15.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1121.0,0.0
75%,2291.0,5.0,2.0,7.0,2.0,1696.0,1.0,11534.0,22.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1469.0,0.0
max,3048.0,6.0,2.0,9.0,2.0,1941.0,763.0,11617.0,29.0,3.0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,1940.0,1690.0


### カテゴリ型カラムと未使用カラムを指定

In [12]:
# Columns handed to LightGBM as categorical features.
cat_feats = [
    'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id',
    'event_type_1', 'event_type_2',
    'quarter', 'month', 'week', 'dayofweek',
]
# Columns excluded from the feature matrix (identifiers, the target, and
# columns not used as model inputs).
useless_cols = [
    'id', 'date', 'demand', 'd', 'part', 'wm_yr_wk', 'weekday', 'day',
    'event_name_1', 'event_name_2',
]
train_cols = df.columns[~df.columns.isin(useless_cols)]

# Rows are routed by the 'part' column: 'train' rows feed the model,
# 'validation' rows are the ones to predict.
X_train = df.loc[df['part'] == 'train', train_cols]
y_train = df.loc[df['part'] == 'train', 'demand']
X_test = df.loc[df['part'] == 'validation', train_cols]
# `test` keeps the keys of the prediction rows; its index is reset so that it
# lines up positionally with the prediction array produced later.
test = df.loc[df['part'] == 'validation', ['id', 'd']].reset_index(drop=True)

In [13]:
# Reclaim memory from the temporaries created while slicing `df`.
gc.collect()

40

### 不用データ削除

In [14]:
# Drop training rows that have no 7-day rolling mean.
# The previous form, df['rolling_mean_t7'].dropna(inplace=True), operated on
# an extracted Series and could not remove rows from `df`; X_train / y_train
# had also already been built, so the training data was never filtered.
# Filter the training matrices directly. X_test is left untouched so every
# row to predict is kept.
train_has_lag = X_train['rolling_mean_t7'].notna()
X_train = X_train[train_has_lag]
y_train = y_train[train_has_lag]
del train_has_lag

### 学習時のvalid_dataをランダムチョイスで作成

In [15]:
%%time

np.random.seed(777)

fake_valid_inds = np.random.choice(X_train.index.values, 2_000_000, replace = False)
train_inds = np.setdiff1d(X_train.index.values, fake_valid_inds)
train_data = lgb.Dataset(X_train.loc[train_inds] , label = y_train.loc[train_inds], 
                         categorical_feature=cat_feats, free_raw_data=False)
fake_valid_data = lgb.Dataset(X_train.loc[fake_valid_inds], label = y_train.loc[fake_valid_inds],
                              categorical_feature=cat_feats,
                 free_raw_data=False)# This is a random sample, we're not gonna apply any time series train-test-split tricks here!

Wall time: 39.9 s


In [16]:
# Disabled alternative: train on all of X_train and validate on X_test.
# NOTE(review): `y_test` is not defined in any visible cell, so this cannot be
# re-enabled as written.
# train_data = lgb.Dataset(X_train , label = y_train, 
#                          categorical_feature=cat_feats, free_raw_data=False)
# fake_valid_data = lgb.Dataset(X_test, label = y_test,
#                               categorical_feature=cat_feats, free_raw_data=False)

In [17]:
# The lgb.Dataset objects were built from these; drop the originals before
# training. X_test and test are kept for the prediction cells.
del df
del X_train, y_train
del fake_valid_inds, train_inds
gc.collect()

0

In [18]:
# LightGBM training parameters.
# "metric" used to appear twice in this literal ("mse" and ["mse"]); Python
# silently keeps the last value, so only that one is declared now. The
# resulting dict is unchanged.
params = {
    "objective": "poisson",
    "metric": ["mse"],          # reported as "l2" in the training log
    "force_row_wise": True,
    "learning_rate": 0.075,
    # "sub_feature": 0.8,
    "sub_row": 0.75,            # LightGBM alias of bagging_fraction
    "bagging_freq": 1,
    "lambda_l2": 0.1,
    "nthread": 3,               # LightGBM alias of num_threads
    "verbosity": 1,
    "num_iterations": 3000,     # upper bound on boosting rounds
    "num_leaves": 128,
    "min_data_in_leaf": 100,
}

In [19]:
%%time
# Train the booster, logging the validation metric every 10 rounds and
# stopping if it has not improved for 100 rounds.
# The captured log shows early stopping never triggered: training ran to the
# 3000-round cap in params while the validation l2 was still decreasing.
# NOTE(review): `verbose_eval` / `early_stopping_rounds` keyword arguments are,
# to my knowledge, replaced by callbacks in newer LightGBM releases -- confirm
# against the installed version before upgrading.
m_lgb = lgb.train(params, 
                  train_data, 
                  valid_sets = [fake_valid_data], 
                  verbose_eval=10, 
                  early_stopping_rounds=100,
                 ) 

Training until validation scores don't improve for 100 rounds
[10]	valid_0's l2: 9.98733
[20]	valid_0's l2: 8.04861
[30]	valid_0's l2: 7.13838
[40]	valid_0's l2: 6.69726
[50]	valid_0's l2: 6.49247
[60]	valid_0's l2: 6.37624
[70]	valid_0's l2: 6.31256
[80]	valid_0's l2: 6.27487
[90]	valid_0's l2: 6.22999
[100]	valid_0's l2: 6.18023
[110]	valid_0's l2: 6.13952
[120]	valid_0's l2: 6.09599
[130]	valid_0's l2: 6.05706
[140]	valid_0's l2: 6.01643
[150]	valid_0's l2: 5.97322
[160]	valid_0's l2: 5.93322
[170]	valid_0's l2: 5.90436
[180]	valid_0's l2: 5.8736
[190]	valid_0's l2: 5.83839
[200]	valid_0's l2: 5.80985
[210]	valid_0's l2: 5.79268
[220]	valid_0's l2: 5.76927
[230]	valid_0's l2: 5.75105
[240]	valid_0's l2: 5.73138
[250]	valid_0's l2: 5.71119
[260]	valid_0's l2: 5.68858
[270]	valid_0's l2: 5.6632
[280]	valid_0's l2: 5.63493
[290]	valid_0's l2: 5.61063
[300]	valid_0's l2: 5.58142
[310]	valid_0's l2: 5.55915
[320]	valid_0's l2: 5.54881
[330]	valid_0's l2: 5.53641
[340]	valid_0's l2: 5.516

[2870]	valid_0's l2: 4.62425
[2880]	valid_0's l2: 4.62342
[2890]	valid_0's l2: 4.62247
[2900]	valid_0's l2: 4.62185
[2910]	valid_0's l2: 4.62107
[2920]	valid_0's l2: 4.62023
[2930]	valid_0's l2: 4.61603
[2940]	valid_0's l2: 4.61559
[2950]	valid_0's l2: 4.61513
[2960]	valid_0's l2: 4.61316
[2970]	valid_0's l2: 4.61189
[2980]	valid_0's l2: 4.61136
[2990]	valid_0's l2: 4.61126
[3000]	valid_0's l2: 4.61096
Did not meet early stopping. Best iteration is:
[3000]	valid_0's l2: 4.61096
Wall time: 3h 56min 41s


In [20]:
# Persist the trained booster to the working directory so prediction can be
# rerun without repeating the multi-hour training cell.
m_lgb.save_model("model_v1.1.lgb")

<lightgbm.basic.Booster at 0x21326eae448>

In [21]:
# Uncomment to reload the saved booster instead of retraining.
#m_lgb = lgb.Booster(model_file="model_v1.1.lgb")

### 学習ここまで

### 予測ここから

In [35]:
# Shift days_from_finalsales back by 28 days for the rows to predict, and
# floor the result at 0.
# clip() replaces the former chained assignment X_test[col][mask] = 0, which
# pandas flags with SettingWithCopyWarning and which does not write through
# when copy-on-write is enabled.
# NOTE: this cell is not idempotent -- each run subtracts another 28, so run
# it exactly once per freshly built X_test.
X_test = X_test.assign(
    days_from_finalsales=(X_test['days_from_finalsales'] - 28).clip(lower=0)
)

In [38]:
# Predict demand for every row of X_test; the result is a 1-D array in the
# same row order as X_test (and therefore as `test`).
y_pred = m_lgb.predict(X_test)

In [39]:
# Remove a 'demand' column left over from an earlier run of the next cell.
# `test` is built with only ['id', 'd'], so on a fresh kernel the column does
# not exist; errors='ignore' keeps Restart & Run All from raising KeyError.
test = test.drop(columns=['demand'], errors='ignore')

In [40]:
# Attach the predictions as the 'demand' column. `test` has a fresh 0..n-1
# index and the same row order as X_test, so positional assignment is correct.
# Unlike pd.concat(axis=1), assign() overwrites an existing 'demand' column
# instead of duplicating it on re-run, and raises on a length mismatch instead
# of silently padding with NaN.
test = test.assign(demand=y_pred)

In [24]:
# Give `id` its submission suffix and derive the forecast column label F1..F28.
# Rows with d <= 1941 are tagged "validation", later days "evaluation"; F is
# the day offset from 1913, minus a further 28 for evaluation rows.
# Any suffix already present is stripped first, so re-running this cell does
# not produce ids like "..._validation_validation".
test = test.assign(
    id=test.id.str.replace(r"_(validation|evaluation)$", "", regex=True)
    + "_"
    + np.where(test.d <= 1941, "validation", "evaluation"),
    F="F" + (test.d - 1913 - 28 * (test.d > 1941)).astype("str"),
)

In [41]:
# The sample submission supplies the expected column set and row (id) order
# for the output file.
sample_submission = pd.read_csv(SAMPLE_SUBMIT_PATH)

In [42]:
# Reshape the long predictions (one row per id and F label) into one row per
# id with the F columns, in the sample submission's column order.
submission = (
    test.pivot(index="id", columns="F", values="demand")
        .reset_index()
        .loc[:, sample_submission.columns]
)
# Re-order rows to match the sample submission. Ids missing from `test` stay
# in the result with NaN forecasts.
# NOTE(review): confirm whether such rows need a fill value before submitting.
submission = pd.merge(sample_submission[["id"]], submission, how="left", on="id")
submission.head()

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0.997448,0.91668,0.899416,0.846732,0.931522,1.179975,1.12972,0.881976,0.865523,...,0.896212,1.085358,1.102187,0.826357,0.686177,0.688445,0.680428,0.755022,1.045711,0.963125
1,HOBBIES_1_002_CA_1_validation,0.389787,0.410011,0.41346,0.405841,0.498579,0.582792,0.479432,0.387982,0.35543,...,0.400672,0.520611,0.473675,0.355127,0.281796,0.321569,0.00018,0.000203,0.000196,0.000165
2,HOBBIES_1_003_CA_1_validation,0.740462,0.709946,0.723436,0.738702,0.950864,1.154118,0.821787,0.43127,0.452925,...,0.6179,0.987712,0.741451,0.683089,0.577314,0.565565,0.537958,0.652265,0.85824,0.769553
3,HOBBIES_1_004_CA_1_validation,1.999201,1.635594,1.647376,1.55964,1.925496,2.653507,2.5831,1.802693,1.693628,...,1.932855,2.326761,2.573399,2.024312,1.786658,1.657911,1.59102,1.966279,2.636385,2.796115
4,HOBBIES_1_005_CA_1_validation,1.232907,1.167896,1.09256,1.089619,1.29736,1.484925,1.650136,1.185971,1.158438,...,1.363683,1.618935,1.534525,1.187529,0.938552,0.914494,0.908115,1.077579,1.347277,1.556279


In [45]:
# Write the submission file to the working directory, without the index column.
submission.to_csv('team_sun_submission3_lightgbm_model_v1.1.csv', index=False)

In [44]:
# Spot-check the feature rows that were fed to the model.
X_test.head()

Unnamed: 0,item_id,dept_id,cat_id,store_id,state_id,event_type_1,event_type_2,snap_CA,snap_TX,snap_WI,...,PresidentsDay,Purim End,Ramadan starts,StPatricksDay,SuperBowl,Thanksgiving,ValentinesDay,VeteransDay,days_from_release,days_from_finalsales
29073274,419,1,0,6,1,,,0,0,0,...,0,0,0,0,0,0,0,0,1543,0
29073275,420,1,0,6,1,,,0,0,0,...,0,0,0,0,0,0,0,0,1505,0
29073276,421,1,0,6,1,,,0,0,0,...,0,0,0,0,0,0,0,0,1905,0
29073277,422,1,0,6,1,,,0,0,0,...,0,0,0,0,0,0,0,0,1905,0
29073278,428,1,0,6,1,,,0,0,0,...,0,0,0,0,0,0,0,0,768,47


In [46]:
# Save the first five feature rows to CSV for inspection outside the notebook.
X_test.head().to_csv('X_test_head.csv', index=False)

In [47]:
# Display the raw prediction array (NumPy abbreviates long arrays).
y_pred

array([5.49144283e-01, 9.61937666e-01, 2.31223553e-01, ...,
       4.00877669e-01, 1.34338799e-04, 2.90673271e-04])