In [27]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

# 最大表示列数の指定（ここでは50列を指定）
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)

import warnings
warnings.filterwarnings("ignore")

In [28]:
path = "../input/pickle/"

train = pd.read_pickle(path + "train.zip")

train = train[train["item_category_big"]==2].copy().reset_index(drop=True)

train_x = train.drop(["y"], axis=1)
train_y = train["y"]
test_x = pd.read_pickle(path + "test.zip")

train.drop("item_category_big", axis=1, inplace=True)

test_x = test_x[test_x["item_category_big"]==2].copy()

test_x.drop("item_category_big", axis=1, inplace=True)

In [29]:
train_x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60764 entries, 0 to 60763
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   year                   60764 non-null  int64  
 1   month                  60764 non-null  int64  
 2   shopID                 60764 non-null  int64  
 3   itemID                 60764 non-null  int64  
 4   item_categoryID        60764 non-null  int64  
 5   item_category_big      60764 non-null  int64  
 6   release_time           60764 non-null  float64
 7   sale_last_month        60764 non-null  float64
 8   item_price_last_month  60764 non-null  float64
 9   item_price_difference  60764 non-null  int64  
 10  item_price             60764 non-null  float64
dtypes: float64(4), int64(7)
memory usage: 5.1 MB


In [30]:
test_x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1170 entries, 1206 to 2879
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   itemID                 1170 non-null   int64  
 1   shopID                 1170 non-null   int64  
 2   item_categoryID        1170 non-null   int64  
 3   month                  1170 non-null   int64  
 4   year                   1170 non-null   int64  
 5   release_time           1170 non-null   int64  
 6   sale_last_month        1170 non-null   float64
 7   item_price_last_month  1170 non-null   float64
 8   item_price_difference  1170 non-null   float64
 9   item_price             1170 non-null   float64
dtypes: float64(4), int64(6)
memory usage: 100.5 KB


In [31]:
train_x.index

RangeIndex(start=0, stop=60764, step=1)

In [32]:
years = [2018] * 6 + [2019] * 9
months = list(range(7, 13)) + list(range(1, 10))

In [33]:
month_list = list(range(1, 7))
year_list = [2017] + [2018]

scores = []

# custom function to run light gbm model
def run_lgb(train_X, train_y, val_X, val_y, x_test):
    params = {"objective" : "regression", 
              "metric" : "rmse", 
              "n_estimators":20000, 
              "early_stopping_rounds":200,
              "num_leaves" : 31, 
              "learning_rate" : 0.01, 
              "bagging_fraction" : 0.7,
              "bagging_seed" : 0, 
              "num_threads" : 4,
              "colsample_bytree" : 0.7,
              'max_depth': 5
             }
    
    lgtrain = lgb.Dataset(tr_x, tr_y)
    lgval = lgb.Dataset(va_x, va_y)
    model = lgb.train(params, lgtrain, 
                      valid_sets=[lgtrain, lgval], 
                      verbose_eval=1000)
    
    pred_test_y = model.predict(x_test, num_iteration=model.best_iteration)
    return pred_test_y, model

for year, month in zip(years, months):
    if year == 2019 and len(year_list) == 2:
        year_list.append(year)
    
    month_list.append(month)
    
    if month == 12:
        year += 1
        month = 1
    
    tr_x = train_x[(train_x["year"].isin(year_list))&(train_x["month"].isin(month_list))]
    va_x = train_x[(train_x["year"] == year)&(train_x["month"] == month)]
    tr_y, va_y = train_y.iloc[tr_x.index], train_y.iloc[va_x.index]
    
    # Training the model
    pred_test, model = run_lgb(tr_x, tr_y, va_x, va_y, test_x)
    
    # バリデーションデータでのスコアの確認
    va_pred = model.predict(va_x)
    score = np.sqrt(mean_squared_error(va_y, va_pred))
    
    scores.append(score)

Training until validation scores don't improve for 200 rounds
[1000]	training's rmse: 2.12765	valid_1's rmse: 1.99925
[2000]	training's rmse: 1.98032	valid_1's rmse: 1.86629
[3000]	training's rmse: 1.8877	valid_1's rmse: 1.7619
[4000]	training's rmse: 1.83029	valid_1's rmse: 1.70514
[5000]	training's rmse: 1.78484	valid_1's rmse: 1.6634
[6000]	training's rmse: 1.7458	valid_1's rmse: 1.61237
[7000]	training's rmse: 1.71089	valid_1's rmse: 1.5668
[8000]	training's rmse: 1.6799	valid_1's rmse: 1.53329
[9000]	training's rmse: 1.65274	valid_1's rmse: 1.50269
[10000]	training's rmse: 1.62623	valid_1's rmse: 1.46899
[11000]	training's rmse: 1.60121	valid_1's rmse: 1.44086
[12000]	training's rmse: 1.57863	valid_1's rmse: 1.41403
[13000]	training's rmse: 1.55888	valid_1's rmse: 1.39146
[14000]	training's rmse: 1.53802	valid_1's rmse: 1.37089
[15000]	training's rmse: 1.51816	valid_1's rmse: 1.34842
[16000]	training's rmse: 1.50097	valid_1's rmse: 1.32764
[17000]	training's rmse: 1.48399	valid_1'

In [34]:
scores

[1.2483931988258186,
 1.392086182507728,
 1.3425670288177705,
 2.3819119423916324,
 1.5187477013706838,
 2.912041355940584,
 1.8081668500008106,
 1.6243504720837676,
 1.6134501429587633,
 1.6731014509705766,
 1.4885300233052425,
 1.3551959718612103,
 1.4213307926474623,
 1.7501217022055127,
 1.839562202145546]

In [35]:
# 予測
test_x["y"] = model.predict(test_x)

In [36]:
test_x.to_pickle('../input/pickle/test_goods.zip')

In [37]:
# path = "../submission/"

# submission = pd.read_csv(path + "sample_submission.csv", header=None)

# submission.iloc[:,[1]] = pred
# submission.head()

In [38]:
# submission.to_csv(path + "submission.csv", index=False, header=None)