In [28]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

# 最大表示列数の指定（ここでは50列を指定）
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)

import warnings
warnings.filterwarnings("ignore")

In [29]:
path = "../input/pickle/"

train = pd.read_pickle(path + "train.zip")

train = train[train["item_category_big"]==4].copy().reset_index(drop=True)

train_x = train.drop(["y"], axis=1)
train_y = train["y"]
test_x = pd.read_pickle(path + "test.zip")

train.drop("item_category_big", axis=1, inplace=True)

test_x = test_x[test_x["item_category_big"]==4].copy()

test.drop("item_category_big", axis=1, inplace=True)

In [19]:
train_x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5887 entries, 0 to 5886
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   year               5887 non-null   int64  
 1   month              5887 non-null   int64  
 2   shopID             5887 non-null   int64  
 3   itemID             5887 non-null   int64  
 4   item_categoryID    5887 non-null   int64  
 5   item_category_big  5887 non-null   int64  
 6   item_price         5887 non-null   int64  
 7   release_time       5887 non-null   float64
dtypes: float64(1), int64(7)
memory usage: 368.1 KB


In [20]:
test_x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90 entries, 0 to 89
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   itemID             90 non-null     int64  
 1   shopID             90 non-null     int64  
 2   item_categoryID    90 non-null     int64  
 3   item_category_big  90 non-null     int64  
 4   month              90 non-null     int64  
 5   year               90 non-null     int64  
 6   item_price         90 non-null     float64
 7   release_time       90 non-null     int64  
dtypes: float64(1), int64(7)
memory usage: 5.8 KB


In [21]:
train_x.index

RangeIndex(start=0, stop=5887, step=1)

In [22]:
years = [2018] * 6 + [2019] * 9
months = list(range(7, 13)) + list(range(1, 10))

In [23]:
month_list = list(range(1, 7))
year_list = [2017] + [2018]

scores = []

# custom function to run light gbm model
def run_lgb(train_X, train_y, val_X, val_y, x_test):
    params = {"objective" : "regression", 
              "metric" : "rmse", 
              "n_estimators":20000, 
              "early_stopping_rounds":200,
              "num_leaves" : 31, 
              "learning_rate" : 0.01, 
              "bagging_fraction" : 0.7,
              "bagging_seed" : 0, 
              "num_threads" : 4,
              "colsample_bytree" : 0.7,
              'max_depth': 5
             }
    
    lgtrain = lgb.Dataset(tr_x, tr_y)
    lgval = lgb.Dataset(va_x, va_y)
    model = lgb.train(params, lgtrain, 
                      valid_sets=[lgtrain, lgval], 
                      verbose_eval=1000)
    
    pred_test_y = model.predict(x_test, num_iteration=model.best_iteration)
    return pred_test_y, model

for year, month in zip(years, months):
    if year == 2019 and len(year_list) == 1:
        year_list = year_list.append(year)
    
    month_list.append(month)
    
    if month == 12:
        year += 1
        month = 1
    
    tr_x = train_x[(train_x["year"].isin(year_list))&(train_x["month"].isin(month_list))]
    va_x = train_x[(train_x["year"] == year)&(train_x["month"] == month)]
    tr_y, va_y = train_y.iloc[tr_x.index], train_y.iloc[va_x.index]
    
    # Training the model
    pred_test, model = run_lgb(tr_x, tr_y, va_x, va_y, test_x)
    
    # バリデーションデータでのスコアの確認
    va_pred = model.predict(va_x)
    score = np.sqrt(mean_squared_error(va_y, va_pred))
    
    scores.append(score)

Training until validation scores don't improve for 200 rounds
[1000]	training's rmse: 1.42343	valid_1's rmse: 1.63051
[2000]	training's rmse: 1.29261	valid_1's rmse: 1.42618
[3000]	training's rmse: 1.20871	valid_1's rmse: 1.30625
[4000]	training's rmse: 1.13628	valid_1's rmse: 1.21192
[5000]	training's rmse: 1.08185	valid_1's rmse: 1.14764
[6000]	training's rmse: 1.04089	valid_1's rmse: 1.09301
[7000]	training's rmse: 1.00105	valid_1's rmse: 1.0437
[8000]	training's rmse: 0.965444	valid_1's rmse: 1.00241
[9000]	training's rmse: 0.936372	valid_1's rmse: 0.957988
[10000]	training's rmse: 0.912181	valid_1's rmse: 0.924949
[11000]	training's rmse: 0.886471	valid_1's rmse: 0.895264
[12000]	training's rmse: 0.861446	valid_1's rmse: 0.865939
[13000]	training's rmse: 0.832887	valid_1's rmse: 0.841915
[14000]	training's rmse: 0.814362	valid_1's rmse: 0.822988
[15000]	training's rmse: 0.798911	valid_1's rmse: 0.798733
[16000]	training's rmse: 0.781295	valid_1's rmse: 0.774173
[17000]	training's 

In [24]:
scores

[0.694944823486755,
 0.4910364092794752,
 0.5758286614769232,
 1.043867531762851,
 0.6407284882876945,
 2.806485223328105,
 2.806485223328105,
 2.1635543094742546,
 1.777591060614494,
 1.6910521235899707,
 1.7452791203501687,
 1.9134084684488084,
 1.8895994309434612,
 2.08197349466302,
 1.5371162937765075]

In [25]:
# 予測
test_x[y] = model.predict(test_x)

In [65]:
path = "../submission/"

submission = pd.read_csv(path + "sample_submission.csv", header=None)

submission.iloc[:,[1]] = pred
submission.head()

Unnamed: 0,0,1
0,0,1.598766
1,1,1.602905
2,2,1.598766
3,3,1.588539
4,4,1.588539


In [66]:
submission.to_csv(path + "submission.csv", index=False, header=None)