In [40]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

# 最大表示列数の指定（ここでは50列を指定）
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)

import warnings
warnings.filterwarnings("ignore")

In [41]:
path = "../input/pickle/"

train = pd.read_pickle(path + "train.zip")

train = train[train["item_category_big"]==5].copy().reset_index(drop=True)
train.drop("item_category_big", axis=1, inplace=True)

train_x = train.drop(["y"], axis=1)
train_y = train["y"]
test_x = pd.read_pickle(path + "test.zip")

test_x = test_x[test_x["item_category_big"]==5].copy()

test_x.drop("item_category_big", axis=1, inplace=True)

In [42]:
train_x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198555 entries, 0 to 198554
Data columns (total 10 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   year                   198555 non-null  int64  
 1   month                  198555 non-null  int64  
 2   shopID                 198555 non-null  int64  
 3   itemID                 198555 non-null  int64  
 4   item_categoryID        198555 non-null  int64  
 5   release_time           198555 non-null  float64
 6   sale_last_month        198555 non-null  float64
 7   item_price_last_month  198555 non-null  float64
 8   item_price_difference  198555 non-null  int64  
 9   item_price             198555 non-null  float64
dtypes: float64(4), int64(6)
memory usage: 15.1 MB


In [43]:
test_x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 486 entries, 0 to 2033
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   itemID                 486 non-null    int64  
 1   shopID                 486 non-null    int64  
 2   item_categoryID        486 non-null    int64  
 3   month                  486 non-null    int64  
 4   year                   486 non-null    int64  
 5   release_time           486 non-null    int64  
 6   sale_last_month        486 non-null    float64
 7   item_price_last_month  486 non-null    float64
 8   item_price_difference  486 non-null    float64
 9   item_price             486 non-null    float64
dtypes: float64(4), int64(6)
memory usage: 41.8 KB


In [44]:
train_x.index

RangeIndex(start=0, stop=198555, step=1)

In [45]:
years = [2018] * 6 + [2019] * 9
months = list(range(7, 13)) + list(range(1, 10))

In [46]:
month_list = list(range(1, 7))
year_list = [2017] + [2018]

scores = []

# custom function to run light gbm model
def run_lgb(train_X, train_y, val_X, val_y, x_test):
    params = {"objective" : "regression", 
              "metric" : "rmse", 
              "n_estimators":20000, 
              "early_stopping_rounds":200,
              "num_leaves" : 31, 
              "learning_rate" : 0.01, 
              "bagging_fraction" : 0.7,
              "bagging_seed" : 0, 
              "num_threads" : 4,
              "colsample_bytree" : 0.7,
              'max_depth': 5
             }
    
    lgtrain = lgb.Dataset(tr_x, tr_y)
    lgval = lgb.Dataset(va_x, va_y)
    model = lgb.train(params, lgtrain, 
                      valid_sets=[lgtrain, lgval], 
                      verbose_eval=1000)
    
    pred_test_y = model.predict(x_test, num_iteration=model.best_iteration)
    return pred_test_y, model

for year, month in zip(years, months):
    if year == 2019 and len(year_list) == 2:
        year_list.append(year)
    
    month_list.append(month)
    
    if month == 12:
        year += 1
        month = 1
    
    tr_x = train_x[(train_x["year"].isin(year_list))&(train_x["month"].isin(month_list))]
    va_x = train_x[(train_x["year"] == year)&(train_x["month"] == month)]
    tr_y, va_y = train_y.iloc[tr_x.index], train_y.iloc[va_x.index]
    
    # Training the model
    pred_test, model = run_lgb(tr_x, tr_y, va_x, va_y, test_x)
    
    # バリデーションデータでのスコアの確認
    va_pred = model.predict(va_x)
    score = np.sqrt(mean_squared_error(va_y, va_pred))
    
    scores.append(score)

Training until validation scores don't improve for 200 rounds
[1000]	training's rmse: 2.86481	valid_1's rmse: 1.90296
[2000]	training's rmse: 2.65052	valid_1's rmse: 1.80192
[3000]	training's rmse: 2.50558	valid_1's rmse: 1.7507
[4000]	training's rmse: 2.41838	valid_1's rmse: 1.71336
[5000]	training's rmse: 2.34222	valid_1's rmse: 1.67541
[6000]	training's rmse: 2.29936	valid_1's rmse: 1.64728
[7000]	training's rmse: 2.24604	valid_1's rmse: 1.61155
[8000]	training's rmse: 2.18798	valid_1's rmse: 1.58187
[9000]	training's rmse: 2.14764	valid_1's rmse: 1.55517
[10000]	training's rmse: 2.10834	valid_1's rmse: 1.53784
[11000]	training's rmse: 2.05891	valid_1's rmse: 1.50626
[12000]	training's rmse: 2.03203	valid_1's rmse: 1.48671
[13000]	training's rmse: 2.0064	valid_1's rmse: 1.47081
[14000]	training's rmse: 1.9852	valid_1's rmse: 1.45535
[15000]	training's rmse: 1.96132	valid_1's rmse: 1.44454
[16000]	training's rmse: 1.93413	valid_1's rmse: 1.43628
[17000]	training's rmse: 1.90931	valid

In [47]:
scores

[1.401137315904138,
 1.564433759329293,
 2.062524196437948,
 2.392346590063118,
 2.083094576797465,
 2.826230213194018,
 1.9315048685632035,
 2.0318657126340187,
 2.7116981409275893,
 2.1641459991275713,
 1.4714126683049564,
 1.5621185327766287,
 1.3556794881094052,
 1.7194028579160034,
 1.609813722733601]

In [48]:
# 予測
test_x["y"] = model.predict(test_x)

In [49]:
test_x.to_pickle('../input/pickle/test_move.zip')

In [50]:
# path = "../submission/"

# submission = pd.read_csv(path + "sample_submission.csv", header=None)

# submission.iloc[:,[1]] = pred
# submission.head()

In [51]:
# submission.to_csv(path + "submission.csv", index=False, header=None)