In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder


# 데이터 불러오기 및 간단한 확인

### - train데이터 불러오기 (날짜, 날짜 블럭, 상점 아이디, 물품 아이디, 가격, 판매량)

In [81]:
train = pd.read_csv('~/aiffel_code_master/k/sales_train.csv')
y = train['item_cnt_day']

train = train[train.item_price<100000]
train = train[train.item_cnt_day<1001]
del train['date']
del train['item_price']

train.head()



Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_day
0,0,59,22154,1.0
1,0,25,2552,1.0
2,0,25,2552,-1.0
3,0,25,2554,1.0
4,0,25,2555,1.0


### - 월 별로 cnt하기!

In [82]:
temp = train.groupby(["item_id","shop_id","date_block_num"]).sum().reset_index()
temp = temp.rename(index=str, columns = {"item_cnt_day":"item_cnt_month"})
temp = temp[["item_id","shop_id","date_block_num","item_cnt_month"]]
y = temp['item_cnt_month']
#del temp['item_cnt_month']

train = temp
train.head()

Unnamed: 0,item_id,shop_id,date_block_num,item_cnt_month
0,0,54,20,1.0
1,1,55,15,2.0
2,1,55,18,1.0
3,1,55,19,1.0
4,1,55,20,1.0


### - 아이템 카테고리 불러오기 (물품 카테고리 이름, 카테고리 아이디)

In [83]:
item_categories = pd.read_csv('~/aiffel_code_master/k/item_categories.csv')
item_categories.head()


#PS나 XBOX는 콘솔게임으로 치환하기


Unnamed: 0,item_category_name,item_category_id
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2
3,Аксессуары - PS4,3
4,Аксессуары - PSP,4


### - 아이템 데이터 불러오기 (물품 이름, 물품 아이디, 물품 카테고리 아이디)

In [84]:
items = pd.read_csv('~/aiffel_code_master/k/items.csv')
items.head()


Unnamed: 0,item_name,item_id,item_category_id
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40
4,***КОРОБКА (СТЕКЛО) D,4,40


In [85]:
temp = items.drop_duplicates(['item_category_id'])
temp.sort_values(by=['item_category_id'], axis=0).head()

Unnamed: 0,item_name,item_id,item_category_id
5441,PC: Гарнитура HyperX Cloud Core игровая стерео...,5441,0
5606,PS2: Карта памяти 8 Мб черная (Memory Card 8Mb...,5606,1
5570,PS Move Controller (Контроллер движений),5570,2
5652,PS4/PS3: Гарнитура Tritton Kunai стерео провод...,5652,3
1955,Black Horns PSP Slim Джинсовый чехол серый (BH...,1955,4


### - shops 데이터 불러오기 (상점 이름, 상점 아이디)

In [86]:
shops = pd.read_csv('~/aiffel_code_master/k/shops.csv')
shops.head()

Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4


### - 테스트 데이터 불러오기

In [87]:
test = pd.read_csv('~/aiffel_code_master/k/test.csv')
test.head()

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


### 데이터 정보

ID - an Id that represents a (Shop, Item) tuple within the test set

shop_id - unique identifier of a shop

item_id - unique identifier of a product

item_category_id - unique identifier of item category

item_cnt_day - number of products sold. You are predicting a monthly amount of this measure

item_price - current price of an item

date - date in format dd/mm/yyyy

date_block_num - a consecutive month number, used for convenience. January 2013 is 0, February 2013 is 1,..., October 2015 is 33

item_name - name of item

shop_name - name of shop

item_category_name - name of item category


In [88]:
train.describe()

Unnamed: 0,item_id,shop_id,date_block_num,item_cnt_month
count,1609123.0,1609123.0,1609123.0,1609123.0
mean,10680.99,32.80586,14.6648,2.265853
std,6238.884,16.53701,9.542325,8.466196
min,0.0,0.0,0.0,-22.0
25%,5045.0,21.0,6.0,1.0
50%,10497.0,31.0,14.0,1.0
75%,16060.0,47.0,23.0,2.0
max,22169.0,59.0,33.0,1644.0


In [89]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1609123 entries, 0 to 1609122
Data columns (total 4 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   item_id         1609123 non-null  int64  
 1   shop_id         1609123 non-null  int64  
 2   date_block_num  1609123 non-null  int64  
 3   item_cnt_month  1609123 non-null  float64
dtypes: float64(1), int64(3)
memory usage: 61.4+ MB


# 데이터 시각화



# EDA 시작

### - 결측치 확인

In [91]:
train.isnull().sum()

item_id           0
shop_id           0
date_block_num    0
item_cnt_month    0
dtype: int64

In [92]:
item_categories.isnull().sum()

item_category_name    0
item_category_id      0
dtype: int64

In [93]:
items.isnull().sum()

item_name           0
item_id             0
item_category_id    0
dtype: int64

In [94]:
shops.isnull().sum()

shop_name    0
shop_id      0
dtype: int64

## 가격대 별로 그룹 나누기

In [None]:
tmp = 

### - item_categories에서 비슷한 그룹 합치기


In [95]:
item_categories = pd.read_csv('~/aiffel_code_master/k/item_categories.csv')

In [96]:
item_categories['split'] = item_categories['item_category_name'].str.split('-')
item_categories['type'] = item_categories['split'].map(lambda x: x[0].strip())
del item_categories['split']


item_categories['category_code'] = LabelEncoder().fit_transform(item_categories['type'])
del item_categories['type']
del item_categories['item_category_name']
# if subtype is nan then type
#item_categories['subtype'] = item_categories['split'].map(lambda x: x[1].strip() if len(x) > 1 else x[0].strip())
#item_categories['subtype_code'] = LabelEncoder().fit_transform(item_categories['subtype'])
#item_categories = item_categories[['item_category_id','type_code', 'subtype_code']]
item_categories

Unnamed: 0,item_category_id,category_code
0,0,0
1,1,1
2,2,1
3,3,1
4,4,1
...,...,...
79,79,16
80,80,16
81,81,17
82,82,18


In [97]:
train = pd.merge(train, items, on='item_id')
del train['item_name']


train = pd.merge(train, item_categories, on='item_category_id')
train.head()

Unnamed: 0,item_id,shop_id,date_block_num,item_cnt_month,item_category_id,category_code
0,0,54,20,1.0,40,11
1,2,54,19,1.0,40,11
2,2,54,22,1.0,40,11
3,3,54,18,1.0,40,11
4,3,54,19,1.0,40,11


In [115]:
train_x = train[train['date_block_num'] <= 33]
val_x = train[train['date_block_num'] > 32]

train_y = train_x['item_cnt_month']
val_y = val_x['item_cnt_month']

del train_x['item_cnt_month']
del val_x['item_cnt_month']

### test data process

In [99]:
temp_test = test
test.head()

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [100]:

temp_test['date_block_num'] = 34
temp_test = pd.merge(temp_test, items, on='item_id')
del temp_test['item_name']
temp_test = pd.merge(temp_test, item_categories, on='item_category_id')

test_data = temp_test[['item_id', 'shop_id', 'date_block_num', 'item_category_id','category_code']]
test_data.head()

Unnamed: 0,item_id,shop_id,date_block_num,item_category_id,category_code
0,5037,5,34,19,5
1,5037,4,34,19,5
2,5037,6,34,19,5
3,5037,3,34,19,5
4,5037,2,34,19,5


In [101]:
sub_sample = pd.read_csv('~/aiffel_code_master/k/sample_submission.csv')
sub_sample.head()

Unnamed: 0,ID,item_cnt_month
0,0,0.5
1,1,0.5
2,2,0.5
3,3,0.5
4,4,0.5


# Training

In [123]:
import xgboost as xgb
import lightgbm as lgb
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score

In [125]:
model2 = LGBMRegressor(
    n_estimators=1000,
    min_child_weight=300, 
    colsample_bytree=0.8, 
    subsample=0.8, 
    eta=0.3,    
    seed=42)

model2.fit(
    train_x, 
    train_y, 
    eval_metric="rmse", 
    eval_set=[(train_x, train_y), (val_x, val_y)], 
    verbose=True, 
    early_stopping_rounds = 10)

[1]	training's rmse: 8.2292	training's l2: 67.7197	valid_1's rmse: 6.85739	valid_1's l2: 47.0237
Training until validation scores don't improve for 10 rounds
[2]	training's rmse: 8.03198	training's l2: 64.5128	valid_1's rmse: 6.62174	valid_1's l2: 43.8474
[3]	training's rmse: 7.95324	training's l2: 63.254	valid_1's rmse: 6.5926	valid_1's l2: 43.4624
[4]	training's rmse: 7.79811	training's l2: 60.8105	valid_1's rmse: 6.41	valid_1's l2: 41.088
[5]	training's rmse: 7.67339	training's l2: 58.8809	valid_1's rmse: 6.27469	valid_1's l2: 39.3718
[6]	training's rmse: 7.57816	training's l2: 57.4285	valid_1's rmse: 6.22915	valid_1's l2: 38.8023
[7]	training's rmse: 7.48633	training's l2: 56.0452	valid_1's rmse: 6.19091	valid_1's l2: 38.3274
[8]	training's rmse: 7.40816	training's l2: 54.8809	valid_1's rmse: 6.10854	valid_1's l2: 37.3143
[9]	training's rmse: 7.34358	training's l2: 53.9281	valid_1's rmse: 6.08985	valid_1's l2: 37.0862
[10]	training's rmse: 7.31715	training's l2: 53.5407	valid_1's r

[91]	training's rmse: 6.6462	training's l2: 44.1719	valid_1's rmse: 5.61742	valid_1's l2: 31.5554
[92]	training's rmse: 6.64496	training's l2: 44.1555	valid_1's rmse: 5.61622	valid_1's l2: 31.5419
[93]	training's rmse: 6.64302	training's l2: 44.1297	valid_1's rmse: 5.61805	valid_1's l2: 31.5625
[94]	training's rmse: 6.64191	training's l2: 44.1149	valid_1's rmse: 5.61805	valid_1's l2: 31.5624
[95]	training's rmse: 6.63675	training's l2: 44.0465	valid_1's rmse: 5.61241	valid_1's l2: 31.4991
[96]	training's rmse: 6.63454	training's l2: 44.0171	valid_1's rmse: 5.6124	valid_1's l2: 31.4991
[97]	training's rmse: 6.62979	training's l2: 43.9541	valid_1's rmse: 5.6104	valid_1's l2: 31.4766
[98]	training's rmse: 6.62716	training's l2: 43.9192	valid_1's rmse: 5.6105	valid_1's l2: 31.4777
[99]	training's rmse: 6.62143	training's l2: 43.8434	valid_1's rmse: 5.60593	valid_1's l2: 31.4265
[100]	training's rmse: 6.61799	training's l2: 43.7978	valid_1's rmse: 5.60237	valid_1's l2: 31.3866
[101]	trainin

[182]	training's rmse: 6.48469	training's l2: 42.0512	valid_1's rmse: 5.50342	valid_1's l2: 30.2877
[183]	training's rmse: 6.48355	training's l2: 42.0364	valid_1's rmse: 5.50369	valid_1's l2: 30.2906
Early stopping, best iteration is:
[173]	training's rmse: 6.49432	training's l2: 42.1762	valid_1's rmse: 5.5028	valid_1's l2: 30.2808


LGBMRegressor(colsample_bytree=0.8, eta=0.3, min_child_weight=300,
              n_estimators=1000, seed=42, subsample=0.8)

In [126]:
model = XGBRegressor(
    max_depth=8,
    n_estimators=1000,
    min_child_weight=300, 
    colsample_bytree=0.8, 
    subsample=0.8, 
    eta=0.3,    
    seed=42)

model.fit(
    train_x, 
    train_y, 
    eval_metric="rmse", 
    eval_set=[(train_x, train_y), (val_x, val_y)], 
    verbose=True, 
    early_stopping_rounds = 10)

[0]	validation_0-rmse:7.94563	validation_1-rmse:6.48401
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 10 rounds.
[1]	validation_0-rmse:7.53851	validation_1-rmse:6.06861
[2]	validation_0-rmse:7.34035	validation_1-rmse:5.88197
[3]	validation_0-rmse:7.20411	validation_1-rmse:5.86518
[4]	validation_0-rmse:7.11850	validation_1-rmse:5.78892
[5]	validation_0-rmse:7.06293	validation_1-rmse:5.78517
[6]	validation_0-rmse:7.02207	validation_1-rmse:5.74882
[7]	validation_0-rmse:6.98489	validation_1-rmse:5.74152
[8]	validation_0-rmse:6.93413	validation_1-rmse:5.68332
[9]	validation_0-rmse:6.89460	validation_1-rmse:5.67033
[10]	validation_0-rmse:6.86020	validation_1-rmse:5.67829
[11]	validation_0-rmse:6.83386	validation_1-rmse:5.67119
[12]	validation_0-rmse:6.81055	validation_1-rmse:5.68236
[13]	validation_0-rmse:6.75877	validation_1-rmse:5.61565
[14]	validation_0-rmse:6.72056	validation_1-rmse:5.57

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, eta=0.3, gamma=0,
             gpu_id=-1, importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=8,
             min_child_weight=300, missing=nan, monotone_constraints='()',
             n_estimators=1000, n_jobs=0, num_parallel_tree=1, random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42,
             subsample=0.8, tree_method='exact', validate_parameters=1,
             verbosity=None)

In [55]:
test.head()

Unnamed: 0,ID,shop_id,item_id,data_block_num,date_block_num
0,0,5,5037,34,34
1,1,5,5320,34,34
2,2,5,5233,34,34
3,3,5,5232,34,34
4,4,5,5268,34,34


In [49]:
train_data.head()

Unnamed: 0,item_id,shop_id,date_block_num,item_category_id,category_code
0,0,54,20,40,11
1,2,54,19,40,11
2,2,54,22,40,11
3,3,54,18,40,11
4,3,54,19,40,11


In [128]:
test_y = model.predict(test_data).clip(0, 20)
test_y2 = model2.predict(test_data).clip(0, 20)



In [130]:
print(len(test_y), len(test_y2))
a = np.mean()

214200 214200


numpy.ndarray

In [None]:
submission = pd.DataFrame({
    "ID": test.index, 
    "item_cnt_month": y_pred
})
submission.to_csv('xgb_submission.csv', index=False)