In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

import lightgbm as lgb
from optuna.integration import lightgbm as lgb_optuna

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV,StratifiedKFold,cross_validate
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer


In [None]:
# Shell escape: list the current working directory (the CSV files below are read from it).
!dir

In [None]:
# Load the raw training and test tables from files in the working directory.
train = pd.read_csv('train_20200924.csv')
test = pd.read_csv('test_20200924.csv')

In [None]:
# Show the training frame for a first visual inspection.
train

In [None]:
# Column names, dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Distribution of the non-negative `koutou` values: histogram, box plot,
# then summary statistics as the cell output.
train.loc[train['koutou'] >= 0, 'koutou'].plot(kind='hist')
plt.show()

train.loc[train['koutou'] >= 0, 'koutou'].plot(kind='box')
plt.show()

train.loc[train['koutou'] >= 0, 'koutou'].describe()

In [None]:
def decide_y(x):
    """Bucket a `koutou` value into one of five class labels.

    Mapping:
        x < 0              -> 0
        0 <= x < 15.0      -> 4
        15.0 <= x < 53.84  -> 1
        53.84 <= x < 125.0 -> 2
        x >= 125.0         -> 3

    A NaN input fails every comparison and therefore falls through to 4.

    Args:
        x: Numeric value to classify.

    Returns:
        int: Class label in the range 0..4.
    """
    if x < 0:
        return 0
    # Thresholds must be tested from highest to lowest: testing `x >= 15.0`
    # first would also swallow every value above 53.84 and 125.0, so labels
    # 2 and 3 could never be produced.
    if x >= 125.0:
        return 3
    if x >= 53.84:
        return 2
    if x >= 15.0:
        return 1
    return 4

In [None]:
train['y'] = train['koutou'].apply(decide_y)
train[['koutou', 'y']]

In [None]:
all_df = pd.concat([train, test], axis=0).reset_index(drop=True)

In [None]:
all_df

In [None]:
all_df = all_df.drop('koutou', axis=1)

In [None]:
all_df['y'] = all_df['y'].fillna(-999)

In [None]:
# Break the slash-separated `date` string into its first three fields and
# store them as integer `year`, `month` and `day` columns.
all_df['year'] = all_df['date'].str.split('/').str[0].astype('int')
all_df['month'] = all_df['date'].str.split('/').str[1].astype('int')
all_df['day'] = all_df['date'].str.split('/').str[2].astype('int')
all_df

In [None]:
all_df.hist( figsize=(14, 10), bins=20)

In [None]:
all_df.describe()

In [None]:
all_df.corr()['y'].abs().sort_values(ascending=False)[1:]

In [None]:
all_df.info()

In [None]:
categorical_features = ['market_name', 'market_key', 'kessan', 'gyosyu', 'v_tani']

In [None]:
all_df = all_df.drop(['name', 'date', 'code', 'p_kari', 'p_uri', 'bb', 'content'], axis=1)

In [None]:
for col in categorical_features:
    lbl = LabelEncoder()
    lbl.fit(all_df[col])
    lbl.transform(all_df[col])
    all_df[col] = lbl.transform(all_df[col])

In [None]:
all_df

In [None]:
#訓練データ、テストデータの分割
train_df = all_df[all_df["y"] != -999.0]
test_df = all_df[all_df["y"] == -999.0]


In [None]:
y_train = train_df['y']
X_train = train_df.drop(['y'], axis=1)
X_test = test_df.drop(['y'], axis=1)

In [None]:
y_train.shape, X_train.shape, X_test.shape

In [None]:
# 訓練データからデータを分割
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.3, random_state=0, stratify=y_train)

In [None]:
# 使用モデルはLGB（パラメータチューニング無）
lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=categorical_features)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train, categorical_feature=categorical_features)

params = {
#     'objective': 'binary'
    'objective': 'multiclass', 
    'num_class': 5,
}

model = lgb.train(
    params, lgb_train,
    valid_sets=[lgb_train, lgb_eval],
    verbose_eval=10,
    num_boost_round=1000,
    early_stopping_rounds=10
)

y_pred = model.predict(X_test, num_iteration=model.best_iteration)

In [None]:
y_pred

In [None]:
X_test

In [None]:
def build():
    """Tune LightGBM hyper-parameters with Optuna's cross-validated tuner.

    Reads the notebook-level ``X_train``, ``y_train`` and
    ``categorical_features``. Runs ``LightGBMTunerCV`` with a 5-fold
    stratified split and prints the best score and parameters.

    Returns:
        The ``LightGBMTunerCV`` object after ``run()`` has completed; its
        ``best_params`` attribute is used to train the final model.
    """
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

    # Declare the categorical columns exactly as the training cells do, so
    # the parameters are tuned for the same model that is trained with them.
    lgb_train = lgb_optuna.Dataset(
        X_train,
        y_train,
        categorical_feature=categorical_features,
    )

    lgbm_params = {
        'objective': 'multiclass',
        'metric': 'multi_logloss',
        'num_class': 5,
        'random_state':0,
        'verbosity': 0
    }

    # NOTE(review): `early_stopping_rounds` / `verbose_eval` are accepted by
    # older Optuna/LightGBM releases; confirm against the installed versions.
    tunecv = lgb_optuna.LightGBMTunerCV(
        lgbm_params,
        lgb_train,
        num_boost_round=100,
        early_stopping_rounds=20,
        seed = 0,
        verbose_eval=20,
        folds=kf
    )

    tunecv.run()

    print( 'Best score = ',tunecv.best_score)
    print( 'Best params= ',tunecv.best_params)

    return tunecv

tunecv = build()

In [None]:
# train_data = lgb.Dataset( X_train, y_train )
# eval_data = lgb.Dataset(X_valid, label=X_valid, reference= train_data)

lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=categorical_features)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train, categorical_feature=categorical_features)

clf = lgb.train( tunecv.best_params, 
                lgb_train,
                valid_sets=lgb_eval,
                num_boost_round=50,
                verbose_eval=0
               )
y_pred = clf.predict( X_valid )
# print('AUC: ', roc_auc_score(y_valid, y_pred))

In [None]:
clf.predict( X_test )

In [None]:
test