In [1]:
import numpy as np
import pandas as pd
import warnings
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
import lightgbm as lgb
from sklearn.decomposition import PCA


warnings.filterwarnings(action='ignore') # 경고 문구 생략

from google.colab import drive
drive.mount('/content/drive')



Mounted at /content/drive


In [2]:
# 데이터 경로
data_path = '/content/drive/MyDrive/RoboAdviser_Project/kaggle_timeseries/'

sales_train = pd.read_csv(data_path + 'sales_train.csv')
shops = pd.read_csv(data_path + 'shops.csv')
items = pd.read_csv(data_path + 'items.csv')
item_categories = pd.read_csv(data_path + 'item_categories.csv')
test = pd.read_csv(data_path + 'test.csv')
submission = pd.read_csv(data_path + 'sample_submission.csv')


In [3]:
sales_train = sales_train.rename(columns={'date': '날짜',
                                          'date_block_num': '월ID',
                                          'shop_id': '상점ID',
                                          'item_id': '상품ID',
                                          'item_price': '판매가',
                                          'item_cnt_day': '판매량'})

sales_train.head()

Unnamed: 0,날짜,월ID,상점ID,상품ID,판매가,판매량
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [4]:
shops = shops.rename(columns={'shop_name': '상점명',
                              'shop_id': '상점ID'})

shops.head()

Unnamed: 0,상점명,상점ID
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4


In [5]:
items = items.rename(columns={'item_name': '상품명',
                              'item_id': '상품ID',
                              'item_category_id': '상품분류ID'})

items.head()

Unnamed: 0,상품명,상품ID,상품분류ID
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40
4,***КОРОБКА (СТЕКЛО) D,4,40


In [6]:
item_categories = item_categories.rename(columns=
                                         {'item_category_name': '상품분류명',
                                          'item_category_id': '상품분류ID'})

item_categories.head()

Unnamed: 0,상품분류명,상품분류ID
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2
3,Аксессуары - PS4,3
4,Аксессуары - PSP,4


In [7]:
test = test.rename(columns={'shop_id': '상점ID',
                            'item_id': '상품ID'})

test.head()

Unnamed: 0,ID,상점ID,상품ID
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [8]:
def downcast(df, verbose=True):
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        dtype_name = df[col].dtype.name
        if dtype_name == 'object':
            pass
        elif dtype_name == 'bool':
            df[col] = df[col].astype('int8')
        elif dtype_name.startswith('int') or (df[col].round() == df[col]).all():
            df[col] = pd.to_numeric(df[col], downcast='integer')
        else:
            df[col] = pd.to_numeric(df[col], downcast='float')
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('{:.1f}% 압축됨'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [9]:
all_df = [sales_train, shops, items, item_categories, test]
for df in all_df:
    df = downcast(df)

54.2% 압축됨
38.6% 압축됨
54.2% 압축됨
39.9% 압축됨
70.8% 압축됨


In [10]:
from itertools import product

train = []
# 월ID, 상점ID, 상품ID 조합 생성
for i in sales_train['월ID'].unique():
    all_shop = sales_train.loc[sales_train['월ID']==i, '상점ID'].unique()
    all_item = sales_train.loc[sales_train['월ID']==i, '상품ID'].unique()
    train.append(np.array(list(product([i], all_shop, all_item))))

idx_features = ['월ID', '상점ID', '상품ID'] # 기준 피처
# 리스트 타입인 train을 DataFrame 타입으로 변환
train = pd.DataFrame(np.vstack(train), columns=idx_features)

train

Unnamed: 0,월ID,상점ID,상품ID
0,0,59,22154
1,0,59,2552
2,0,59,2554
3,0,59,2555
4,0,59,2564
...,...,...,...
10913845,33,21,7635
10913846,33,21,7638
10913847,33,21,7640
10913848,33,21,7632


In [11]:
# idx_features를 기준으로 그룹화해 판매량 합 구하기
group = sales_train.groupby(idx_features).agg({'판매량': 'sum'})
# 인덱스 재설정
group = group.reset_index()
# 피처명을 '판매량'에서 '월간 판매량'으로 변경
group = group.rename(columns={'판매량': '월간 판매량'})

group

Unnamed: 0,월ID,상점ID,상품ID,월간 판매량
0,0,0,32,6
1,0,0,33,3
2,0,0,35,1
3,0,0,43,1
4,0,0,51,2
...,...,...,...,...
1609119,33,59,22087,6
1609120,33,59,22088,2
1609121,33,59,22091,1
1609122,33,59,22100,1


In [12]:
# train과 group 병합하기
train = train.merge(group, on=idx_features, how='left')

train

Unnamed: 0,월ID,상점ID,상품ID,월간 판매량
0,0,59,22154,1.0
1,0,59,2552,
2,0,59,2554,
3,0,59,2555,
4,0,59,2564,
...,...,...,...,...
10913845,33,21,7635,
10913846,33,21,7638,
10913847,33,21,7640,
10913848,33,21,7632,


In [13]:
import gc # 가비지 컬렉터 불러오기

del group # 더는 사용하지 않는 변수 지정
gc.collect(); # 가비지 컬렉션 수행

In [14]:
test['월ID'] = 34

In [15]:
# train과 test 이어붙이기
all_data = pd.concat([train, test.drop('ID', axis=1)],
                     ignore_index=True, # 기존 인덱스 무시(0부터 새로 시작)
                     keys=idx_features) # 이어붙이는 기준이 되는 피처

In [16]:
# 결측값을 0으로 대체
all_data = all_data.fillna(0)

all_data

Unnamed: 0,월ID,상점ID,상품ID,월간 판매량
0,0,59,22154,1.0
1,0,59,2552,0.0
2,0,59,2554,0.0
3,0,59,2555,0.0
4,0,59,2564,0.0
...,...,...,...,...
11128045,34,45,18454,0.0
11128046,34,45,16188,0.0
11128047,34,45,15757,0.0
11128048,34,45,19648,0.0


In [17]:
# 나머지 데이터 병합
all_data = all_data.merge(shops, on='상점ID', how='left')
all_data = all_data.merge(items, on='상품ID', how='left')
all_data = all_data.merge(item_categories, on='상품분류ID', how='left')

# 데이터 다운캐스팅
all_data = downcast(all_data)

# 가비지 컬렉션
del shops, items, item_categories
gc.collect();

26.4% 압축됨


In [18]:
all_data.head()

Unnamed: 0,월ID,상점ID,상품ID,월간 판매량,상점명,상품명,상품분류ID,상품분류명
0,0,59,22154,1,"Ярославль ТЦ ""Альтаир""",ЯВЛЕНИЕ 2012 (BD),37,Кино - Blu-Ray
1,0,59,2552,0,"Ярославль ТЦ ""Альтаир""",DEEP PURPLE The House Of Blue Light LP,58,Музыка - Винил
2,0,59,2554,0,"Ярославль ТЦ ""Альтаир""",DEEP PURPLE Who Do You Think We Are LP,58,Музыка - Винил
3,0,59,2555,0,"Ярославль ТЦ ""Альтаир""",DEEP PURPLE 30 Very Best Of 2CD (Фирм.),56,Музыка - CD фирменного производства
4,0,59,2564,0,"Ярославль ТЦ ""Альтаир""",DEEP PURPLE Perihelion: Live In Concert DVD (К...,59,Музыка - Музыкальное видео


In [19]:
all_data = all_data.drop(['상점명', '상품명', '상품분류명'], axis=1)

In [20]:
all_data.head()

Unnamed: 0,월ID,상점ID,상품ID,월간 판매량,상품분류ID
0,0,59,22154,1,37
1,0,59,2552,0,58
2,0,59,2554,0,58
3,0,59,2555,0,56
4,0,59,2564,0,59


In [21]:
# 훈련 데이터 (피처)
X_train = all_data[all_data['월ID'] <= 33]
X_train = X_train.drop(['월간 판매량'], axis=1)
# # 검증 데이터 (피처)
# X_valid = all_data[all_data['월ID'] == 33]
# X_valid = X_valid.drop(['월간 판매량'], axis=1)
# 테스트 데이터 (피처)
X_test = all_data[all_data['월ID'] == 34]
X_test = X_test.drop(['월간 판매량'], axis=1)

# 훈련 데이터 (타깃값)
y_train = all_data[all_data['월ID'] <= 33]['월간 판매량']
y_train = y_train.clip(0, 20) # 타깃값을 0 ~ 20로 제한

# 데이터 (타깃값)
y_test = all_data[all_data['월ID'] == 34]['월간 판매량']
y_test = y_test.clip(0,20)
# # 검증 데이터 (타깃값)
# y_valid = all_data[all_data['월ID'] == 33]['월간 판매량']
# y_valid = y_valid.clip(0, 20)

In [None]:

print(len(X_test))

214200


# Autoencoding

In [None]:
# Autoencoder 모델 정의
input_layer = Input(shape=(4,))
encoded = Dense(3, activation='relu')(input_layer)
decoded = Dense(4, activation='sigmoid')(encoded)

autoencoder = Model(input_layer, decoded)

# Encoder 모델 정의
encoder = Model(input_layer, encoded)

# Decoder 모델 정의
encoded_input = Input(shape=(3,))
decoder_layer = autoencoder.layers[-1]
decoder = Model(encoded_input, decoder_layer(encoded_input))

# Autoencoder 컴파일 및 학습
autoencoder.compile(optimizer='adam', loss='mean_squared_error')
autoencoder.fit(X_train, X_train, epochs=30, batch_size=256, shuffle=True, validation_split=0.2)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7e33562ab370>

In [None]:
# x_train을 encoder를 통과시켜 인코딩된 피쳐를 얻음
encoded_features_train = encoder.predict(X_train)





NameError: ignored

In [None]:
all_data['상점ID'] = all_data['상점ID'].astype('category')
all_data['상품분류ID'] = all_data['상품분류ID'].astype('category')


In [None]:
# LightGBM 하이퍼파라미터
params = {'metric': 'rmse', # 평가지표 = rmse
          'num_leaves': 255,
          'learning_rate': 0.01,
          'force_col_wise': True,
          'random_state': 10}

# 범주형 피처 설정
# cat_features = ['상점ID', '상품분류ID']

# LightGBM 훈련 및 검증 데이터셋
dtrain = lgb.Dataset(encoded_features_train, y_train)
# dvalid = lgb.Dataset(X_valid, y_valid)

# LightGBM 모델 훈련
lgb_model = lgb.train(params=params,
                      train_set = dtrain,
                      num_boost_round=500,
                      # valid_sets=(dtrain, dvalid),
                      # categorical_feature=cat_features
                      )

[LightGBM] [Info] Total Bins 267
[LightGBM] [Info] Number of data points in the train set: 10913850, number of used features: 3
[LightGBM] [Info] Start training from score 0.298240


In [None]:
encoded_features_test = encoder.predict(X_test)

preds = lgb_model.predict(encoded_features_test).clip(0, 20)



In [None]:
rmse = np.sqrt(np.mean((y_test - preds) ** 2))
rmse

0.3467258039298026

# PCA

In [None]:
# PCA 객체 생성. 여기에서는 2개의 주성분으로 데이터를 축소합니다.
pca2 = PCA(n_components=2)

# 데이터에 PCA 적용
X_pca2 = pca2.fit_transform(X_train)

In [None]:
# PCA 객체 생성. 여기에서는 2개의 주성분으로 데이터를 축소합니다.
pca3 = PCA(n_components=3)

# 데이터에 PCA 적용
X_pca3 = pca3.fit_transform(X_train)

In [None]:
# PCA 객체 생성. 여기에서는 2개의 주성분으로 데이터를 축소합니다.
pca4 = PCA(n_components=4)

# 데이터에 PCA 적용
X_pca4 = pca4.fit_transform(X_train)

In [None]:
print("Explained variance ratio:", pca2.explained_variance_ratio_)
print("Explained variance ratio:", pca3.explained_variance_ratio_)
print("Explained variance ratio:", pca4.explained_variance_ratio_)

Explained variance ratio: [9.99984014e-01 7.81342852e-06]
Explained variance ratio: [9.99984014e-01 7.81342852e-06 5.84339143e-06]
Explained variance ratio: [9.99984014e-01 7.81342852e-06 5.84339143e-06 2.32910474e-06]


In [None]:
X_train_pca2 = pca2.transform(X_train)
X_test_pca2 = pca2.transform(X_test)

In [None]:
params = {'metric': 'rmse', # 평가지표 = rmse
          'num_leaves': 255,
          'learning_rate': 0.01,
          'force_col_wise': True,
          'random_state': 10}

dtrain = lgb.Dataset(X_train_pca2, y_train)
# dvalid = lgb.Dataset(X_valid, y_valid)

# LightGBM 모델 훈련
lgb_model = lgb.train(params=params,
                      train_set = dtrain,
                      num_boost_round=500,
                      # valid_sets=(dtrain, dvalid),
                      # categorical_feature=cat_features
                      )

[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 10913850, number of used features: 2
[LightGBM] [Info] Start training from score 0.298240


In [None]:
preds = lgb_model.predict(X_test_pca2).clip(0, 20)

rmse = np.sqrt(np.mean((y_test - preds) ** 2))
rmse

0.4053635235301112

===============================================================================================================================================

In [None]:
dtrain = lgb.Dataset(X_train, y_train)
# dvalid = lgb.Dataset(X_valid, y_valid)

# LightGBM 모델 훈련
lgb_model = lgb.train(params=params,
                      train_set = dtrain,
                      num_boost_round=500,
                      # valid_sets=(dtrain, dvalid),
                      # categorical_feature=cat_features
                      )

[LightGBM] [Info] Total Bins 430
[LightGBM] [Info] Number of data points in the train set: 10913850, number of used features: 4
[LightGBM] [Info] Start training from score 0.298240


In [None]:
preds = lgb_model.predict(X_test).clip(0, 20)

rmse = np.sqrt(np.mean((y_test - preds) ** 2))
rmse

0.5523917093095065

=======================================================================================================

In [None]:
X_train_pca4 = pca4.transform(X_train)
X_test_pca4 = pca4.transform(X_test)

In [None]:
params = {'metric': 'rmse', # 평가지표 = rmse
          'num_leaves': 255,
          'learning_rate': 0.01,
          'force_col_wise': True,
          'random_state': 10}

dtrain = lgb.Dataset(X_train_pca4, y_train)
# dvalid = lgb.Dataset(X_valid, y_valid)

# LightGBM 모델 훈련
lgb_model = lgb.train(params=params,
                      train_set = dtrain,
                      num_boost_round=500,
                      # valid_sets=(dtrain, dvalid),
                      # categorical_feature=cat_features
                      )

[LightGBM] [Info] Total Bins 1020
[LightGBM] [Info] Number of data points in the train set: 10913850, number of used features: 4
[LightGBM] [Info] Start training from score 0.298240


In [None]:
preds = lgb_model.predict(X_test_pca4).clip(0, 20)

rmse = np.sqrt(np.mean((y_test - preds) ** 2))
rmse

0.5184726694506331