In [1]:
import os
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

import lightgbm as lgb

In [2]:
# Load the train and test tables and discard the Name, Description and PetID
# columns, which are not used by any later cell in this notebook.
train = pd.read_csv('./train/train.csv').drop(
    columns=['Name', 'Description', 'PetID']
)
test = pd.read_csv('./test/test.csv').drop(
    columns=['Name', 'Description', 'PetID']
)

In [3]:
# Carve a 30% hold-out out of the labelled `train` frame. Note that `df_test`
# is this hold-out (validation) split, not the `test` frame read from disk.
# The fixed random_state keeps the split reproducible.
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(
    train,
    test_size=0.3,
    random_state=0,
)
(df_train.shape, df_test.shape)

((10495, 21), (4498, 21))

In [4]:
# Separate the 'AdoptionSpeed' target from the remaining feature columns,
# once for the training split and once for the hold-out split.
train_X, train_y = df_train.drop(columns=['AdoptionSpeed']), df_train['AdoptionSpeed']
test_X, test_y = df_test.drop(columns=['AdoptionSpeed']), df_test['AdoptionSpeed']

In [5]:
# Categorical variables: every feature column that is not listed as numeric
# is treated as categorical.
numeric_cols = ['Age', 'Quantity', 'Fee', 'VideoAmt', 'PhotoAmt']
# Walk the columns in frame order rather than through a set difference so the
# resulting list is deterministic from run to run.
cat_cols = [col for col in train_X.columns if col not in numeric_cols]

# Rebind the frames through astype() instead of `frame.loc[:, col] = ...`.
# The .loc form writes the values into the existing column, and on
# pandas >= 2.0 that keeps the column's previous dtype, which silently throws
# away the category conversion (and leaves cat_features below empty).
train_X = train_X.astype(dict.fromkeys(cat_cols, 'category'))
test_X = test_X.astype(dict.fromkeys(cat_cols, 'category'))
print(train_X.shape)
print(test_X.shape)

# Positional indices of the category-typed columns; later cells pass this list
# to lgb.train as `categorical_feature`.
foo = train_X.dtypes
cat_feature_names = foo[foo == "category"]
# Iterate the Series index explicitly: it holds the column names.
cat_features = [train_X.columns.get_loc(c) for c in cat_feature_names.index]

# Set the dtype of the targets as well.
# NOTE(review): these Series are later used as LightGBM labels; plain integer
# labels may be sufficient -- confirm whether the category cast is needed.
train_y = train_y.astype('category')
test_y = test_y.astype('category')

(10495, 20)
(4498, 20)


In [6]:
# Wrap the pandas feature frames and their labels in LightGBM Datasets.
# The hold-out Dataset names the training Dataset as its reference, as the
# LightGBM documentation asks for validation data, so both are tied together
# explicitly instead of relying on lgb.train to link them.
d_train = lgb.Dataset(train_X, label=train_y)
d_valid = lgb.Dataset(test_X, label=test_y, reference=d_train)

In [7]:
# LightGBM parameters for the classification run: 5 classes, evaluated with
# multiclass log-loss.
params_c = {
    'application': 'multiclass',
    'num_class': 5,
    'metric': 'multi_logloss',
}

# LightGBM parameters for the regression run, evaluated with RMSE.
params_r = {
    'application': 'regression',
    'metric': 'rmse',
}

# Classification

In [8]:
# Fit the multiclass booster and log the metric after every round for both
# the training and the hold-out Dataset.
# With only 5 boosting rounds, the 100-round early-stopping patience can
# never trigger; the last round is reported as the best iteration.
# NOTE(review): `verbose_eval` and `early_stopping_rounds` are keyword
# arguments of older lgb.train releases; newer LightGBM versions use
# callbacks instead -- confirm the installed version before upgrading.
model = lgb.train(
    params_c,
    d_train,
    num_boost_round=5,
    valid_sets=[d_train, d_valid],
    categorical_feature=cat_features,
    early_stopping_rounds=100,
    verbose_eval=1,
)

[1]	training's multi_logloss: 1.44205	valid_1's multi_logloss: 1.44707
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 1.42033	valid_1's multi_logloss: 1.43204
[3]	training's multi_logloss: 1.40071	valid_1's multi_logloss: 1.41882
[4]	training's multi_logloss: 1.38357	valid_1's multi_logloss: 1.40799
[5]	training's multi_logloss: 1.36761	valid_1's multi_logloss: 1.39787
Did not meet early stopping. Best iteration is:
[5]	training's multi_logloss: 1.36761	valid_1's multi_logloss: 1.39787


In [9]:
# Predict on the hold-out features using the booster's best iteration; for the
# multiclass model the result has one column per class.
pred_test_y = model.predict(
    test_X,
    num_iteration=model.best_iteration,
)
pred_test_y

array([[0.02779198, 0.19949399, 0.25291441, 0.21834876, 0.30145086],
       [0.02470831, 0.20102968, 0.2547208 , 0.20481533, 0.31472587],
       [0.03096301, 0.23691988, 0.27373753, 0.20767597, 0.2507036 ],
       ...,
       [0.03346714, 0.21767494, 0.2443351 , 0.21851824, 0.28600459],
       [0.03176305, 0.23556865, 0.27336912, 0.205858  , 0.25344119],
       [0.02464185, 0.20210166, 0.26672457, 0.20217954, 0.30435238]])

In [10]:
# Pick, for each row, the index of the column with the highest predicted value.
pred_test_y.argmax(axis=1)

array([4, 4, 2, ..., 4, 2, 4], dtype=int64)

# Regression

In [11]:
# Fit the regression booster on the same Datasets. This rebinds `model`, so
# the classification booster is no longer reachable after this cell.
# As above, 5 boosting rounds cannot exhaust the 100-round early-stopping
# patience.
model = lgb.train(
    params_r,
    d_train,
    num_boost_round=5,  # number of boosting iterations
    valid_sets=[d_train, d_valid],
    categorical_feature=cat_features,
    early_stopping_rounds=100,
    verbose_eval=1,  # print the evaluation result every n rounds
)

[1]	training's rmse: 1.15927	valid_1's rmse: 1.15637
Training until validation scores don't improve for 100 rounds.
[2]	training's rmse: 1.14247	valid_1's rmse: 1.14263
[3]	training's rmse: 1.1282	valid_1's rmse: 1.13159
[4]	training's rmse: 1.11573	valid_1's rmse: 1.12198
[5]	training's rmse: 1.10404	valid_1's rmse: 1.11376
Did not meet early stopping. Best iteration is:
[5]	training's rmse: 1.10404	valid_1's rmse: 1.11376


In [12]:
# Predict on the hold-out features with the regression booster's best
# iteration. This overwrites the `pred_test_y` produced by the classifier.
pred_test_y = model.predict(
    test_X,
    num_iteration=model.best_iteration,
)
pred_test_y

array([2.59686384, 2.61883125, 2.28940823, ..., 2.40792863, 2.34586271,
       2.46956264])