In [27]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

# 最大表示列数の指定（ここでは50列を指定）
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)

import warnings
warnings.filterwarnings("ignore")

In [28]:
path = "../input/pickle/"

train = pd.read_pickle(path + "train.zip")

train = train[train["item_category_big"]==7].copy().reset_index(drop=True)

train_x = train.drop(["y"], axis=1)
train_y = train["y"]
test_x = pd.read_pickle(path + "test.zip")

train.drop("item_category_big", axis=1, inplace=True)

test_x = test_x[test_x["item_category_big"]==7].copy()

test_x.drop("item_category_big", axis=1, inplace=True)

In [29]:
train_x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109380 entries, 0 to 109379
Data columns (total 11 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   year                   109380 non-null  int64  
 1   month                  109380 non-null  int64  
 2   shopID                 109380 non-null  int64  
 3   itemID                 109380 non-null  int64  
 4   item_categoryID        109380 non-null  int64  
 5   item_category_big      109380 non-null  int64  
 6   release_time           109380 non-null  float64
 7   sale_last_month        109380 non-null  float64
 8   item_price_last_month  109380 non-null  float64
 9   item_price_difference  109380 non-null  int64  
 10  item_price             109380 non-null  float64
dtypes: float64(4), int64(7)
memory usage: 9.2 MB


In [30]:
test_x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 936 entries, 306 to 2987
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   itemID                 936 non-null    int64  
 1   shopID                 936 non-null    int64  
 2   item_categoryID        936 non-null    int64  
 3   month                  936 non-null    int64  
 4   year                   936 non-null    int64  
 5   release_time           936 non-null    int64  
 6   sale_last_month        936 non-null    float64
 7   item_price_last_month  936 non-null    float64
 8   item_price_difference  936 non-null    float64
 9   item_price             936 non-null    float64
dtypes: float64(4), int64(6)
memory usage: 80.4 KB


In [31]:
train_x.index

RangeIndex(start=0, stop=109380, step=1)

In [32]:
years = [2018] * 6 + [2019] * 9
months = list(range(7, 13)) + list(range(1, 10))

In [33]:
month_list = list(range(1, 7))
year_list = [2017] + [2018]

scores = []

# custom function to run light gbm model
def run_lgb(train_X, train_y, val_X, val_y, x_test):
    params = {"objective" : "regression", 
              "metric" : "rmse", 
              "n_estimators":20000, 
              "early_stopping_rounds":200,
              "num_leaves" : 31, 
              "learning_rate" : 0.01, 
              "bagging_fraction" : 0.7,
              "bagging_seed" : 0, 
              "num_threads" : 4,
              "colsample_bytree" : 0.7,
              'max_depth': 5
             }
    
    lgtrain = lgb.Dataset(tr_x, tr_y)
    lgval = lgb.Dataset(va_x, va_y)
    model = lgb.train(params, lgtrain, 
                      valid_sets=[lgtrain, lgval], 
                      verbose_eval=1000)
    
    pred_test_y = model.predict(x_test, num_iteration=model.best_iteration)
    return pred_test_y, model

for year, month in zip(years, months):
    if year == 2019 and len(year_list) == 2:
        year_list.append(year)
    
    month_list.append(month)
    
    if month == 12:
        year += 1
        month = 1
    
    tr_x = train_x[(train_x["year"].isin(year_list))&(train_x["month"].isin(month_list))]
    va_x = train_x[(train_x["year"] == year)&(train_x["month"] == month)]
    tr_y, va_y = train_y.iloc[tr_x.index], train_y.iloc[va_x.index]
    
    # Training the model
    pred_test, model = run_lgb(tr_x, tr_y, va_x, va_y, test_x)
    
    # バリデーションデータでのスコアの確認
    va_pred = model.predict(va_x)
    score = np.sqrt(mean_squared_error(va_y, va_pred))
    
    scores.append(score)

Training until validation scores don't improve for 200 rounds
[1000]	training's rmse: 1.25352	valid_1's rmse: 1.18575
[2000]	training's rmse: 1.15403	valid_1's rmse: 1.1311
[3000]	training's rmse: 1.1054	valid_1's rmse: 1.09718
[4000]	training's rmse: 1.07742	valid_1's rmse: 1.06874
[5000]	training's rmse: 1.04639	valid_1's rmse: 1.03741
[6000]	training's rmse: 1.02562	valid_1's rmse: 1.01636
[7000]	training's rmse: 1.01026	valid_1's rmse: 1.00288
[8000]	training's rmse: 0.997235	valid_1's rmse: 0.98838
[9000]	training's rmse: 0.98451	valid_1's rmse: 0.976594
[10000]	training's rmse: 0.974278	valid_1's rmse: 0.967264
[11000]	training's rmse: 0.964618	valid_1's rmse: 0.957876
[12000]	training's rmse: 0.955941	valid_1's rmse: 0.949311
[13000]	training's rmse: 0.949551	valid_1's rmse: 0.941634
[14000]	training's rmse: 0.943476	valid_1's rmse: 0.933799
[15000]	training's rmse: 0.938864	valid_1's rmse: 0.927575
[16000]	training's rmse: 0.930095	valid_1's rmse: 0.92112
[17000]	training's rms

In [34]:
scores

[0.8994019074470937,
 0.8536623699337109,
 0.7690119801532859,
 0.9786591668252524,
 1.0216943239205603,
 1.2019471335578538,
 0.898469274424511,
 0.7804761129905402,
 1.0268891354666343,
 1.0012170611590323,
 1.1620344789136503,
 1.3359671308445502,
 1.412193421255369,
 1.1394696701551368,
 0.8760122529992501]

In [35]:
# 予測
test_x["y"] = model.predict(test_x)

In [36]:
test_x.to_pickle('../input/pickle/test_music.zip')

In [37]:
# path = "../submission/"

# submission = pd.read_csv(path + "sample_submission.csv", header=None)

# submission.iloc[:,[1]] = pred
# submission.head()

In [38]:
# submission.to_csv(path + "submission.csv", index=False, header=None)