In [27]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

# 最大表示列数の指定（ここでは50列を指定）
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)

import warnings
warnings.filterwarnings("ignore")

In [28]:
path = "../input/pickle/"

train = pd.read_pickle(path + "train.zip")

train = train[train["item_category_big"].isin([0,1])].copy().reset_index(drop=True)

train_x = train.drop(["y"], axis=1)
train_y = train["y"]
test_x = pd.read_pickle(path + "test.zip")

test_x = test_x[test_x["item_category_big"].isin([0,1])].copy()

test_x.drop("item_category_big", axis=1, inplace=True)

In [29]:
train_x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42474 entries, 0 to 42473
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   year                   42474 non-null  int64  
 1   month                  42474 non-null  int64  
 2   shopID                 42474 non-null  int64  
 3   itemID                 42474 non-null  int64  
 4   item_categoryID        42474 non-null  int64  
 5   item_category_big      42474 non-null  int64  
 6   release_time           42474 non-null  float64
 7   sale_last_month        42474 non-null  float64
 8   item_price_last_month  42474 non-null  float64
 9   item_price_difference  42474 non-null  int64  
 10  item_price             42474 non-null  float64
dtypes: float64(4), int64(7)
memory usage: 3.6 MB


In [30]:
test_x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 72 entries, 270 to 1763
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   itemID                 72 non-null     int64  
 1   shopID                 72 non-null     int64  
 2   item_categoryID        72 non-null     int64  
 3   month                  72 non-null     int64  
 4   year                   72 non-null     int64  
 5   release_time           72 non-null     int64  
 6   sale_last_month        72 non-null     float64
 7   item_price_last_month  72 non-null     float64
 8   item_price_difference  72 non-null     float64
 9   item_price             72 non-null     float64
dtypes: float64(4), int64(6)
memory usage: 6.2 KB


In [31]:
train_x.index

RangeIndex(start=0, stop=42474, step=1)

In [32]:
years = [2018] * 6 + [2019] * 9
months = list(range(7, 13)) + list(range(1, 10))

In [33]:
month_list = list(range(1, 7))
year_list = [2017] + [2018]

scores = []

# custom function to run light gbm model
def run_lgb(train_X, train_y, val_X, val_y, x_test):
    params = {"objective" : "regression", 
              "metric" : "rmse", 
              "n_estimators":20000, 
              "early_stopping_rounds":200,
              "num_leaves" : 31, 
              "learning_rate" : 0.01, 
              "bagging_fraction" : 0.7,
              "bagging_seed" : 0, 
              "num_threads" : 4,
              "colsample_bytree" : 0.7,
              'max_depth': 5
             }
    
    lgtrain = lgb.Dataset(tr_x, tr_y)
    lgval = lgb.Dataset(va_x, va_y)
    model = lgb.train(params, lgtrain, 
                      valid_sets=[lgtrain, lgval], 
                      verbose_eval=1000)
    
    pred_test_y = model.predict(x_test, num_iteration=model.best_iteration)
    return pred_test_y, model

for year, month in zip(years, months):
    if year == 2019 and len(year_list) == 2:
        year_list.append(year)
    
    month_list.append(month)
    
    if month == 12:
        year += 1
        month = 1
    
    tr_x = train_x[(train_x["year"].isin(year_list))&(train_x["month"].isin(month_list))]
    va_x = train_x[(train_x["year"] == year)&(train_x["month"] == month)]
    tr_y, va_y = train_y.iloc[tr_x.index], train_y.iloc[va_x.index]
    
    # Training the model
    pred_test, model = run_lgb(tr_x, tr_y, va_x, va_y, test_x)
    
    # バリデーションデータでのスコアの確認
    va_pred = model.predict(va_x)
    score = np.sqrt(mean_squared_error(va_y, va_pred))
    
    scores.append(score)

Training until validation scores don't improve for 200 rounds
[1000]	training's rmse: 5.19523	valid_1's rmse: 4.33309
[2000]	training's rmse: 4.41896	valid_1's rmse: 3.54778
[3000]	training's rmse: 4.02905	valid_1's rmse: 3.16096
[4000]	training's rmse: 3.82238	valid_1's rmse: 2.98206
[5000]	training's rmse: 3.64267	valid_1's rmse: 2.81925
[6000]	training's rmse: 3.50066	valid_1's rmse: 2.72192
[7000]	training's rmse: 3.35581	valid_1's rmse: 2.63861
[8000]	training's rmse: 3.22608	valid_1's rmse: 2.57083
[9000]	training's rmse: 3.12894	valid_1's rmse: 2.50479
[10000]	training's rmse: 3.03403	valid_1's rmse: 2.4538
[11000]	training's rmse: 2.92685	valid_1's rmse: 2.37192
[12000]	training's rmse: 2.84491	valid_1's rmse: 2.30287
[13000]	training's rmse: 2.7814	valid_1's rmse: 2.25458
[14000]	training's rmse: 2.71063	valid_1's rmse: 2.21495
[15000]	training's rmse: 2.64613	valid_1's rmse: 2.17586
[16000]	training's rmse: 2.59666	valid_1's rmse: 2.13109
[17000]	training's rmse: 2.54124	vali

In [34]:
scores

[1.9900157367240736,
 2.1242259998796262,
 2.1911500062385088,
 3.301957230731283,
 3.684408824981244,
 5.441621988303744,
 2.4922983473723,
 2.1154927858879096,
 2.3670736414452858,
 2.285132371705558,
 2.4996815176548046,
 3.8088752852236922,
 2.3435109174274116,
 2.2276171140520393,
 2.3953156213835456]

In [35]:
# 予測
test_x["y"] = model.predict(test_x)

In [36]:
test_x.to_pickle('../input/pickle/test_pc_game.zip')

In [37]:
# path = "../submission/"

# submission = pd.read_csv(path + "sample_submission.csv", header=None)

# submission.iloc[:,[1]] = pred
# submission.head()

In [38]:
# submission.to_csv(path + "submission.csv", index=False, header=None)