In [1]:
import pandas as pd

prior_order_df = pd.read_csv('./data/order_products__prior.csv')
prior_order_train_df = pd.read_csv('./data/order_products__train.csv')
order_df = pd.read_csv('./data/orders.csv')
product_df = pd.read_csv('./data/products.csv')
aisles_df = pd.read_csv('./data/aisles.csv')
department_df = pd.read_csv('./data/departments.csv')
sample_df = pd.read_csv('./data/sample_submission.csv') # 결과 도출하는 예시 파일

### 중요 컬럼만 빼서 확인하기
1. 이번 RF는 물품 재구매를 위해 user_id와 product_id 두개의 컬럼을 기준으로 그룹화하여 사용해보았음.
2. 중요 컬럼에 구매 주기나 시간과 카트 넣는 순서 그리고 recency, frequency를 추가하여 새로운 df 만듦.

In [4]:
df = pd.read_csv('./data/data.csv')

In [6]:
df['days_since_prior_order'].unique()

array([nan, 15., 21., 29., 28., 19., 20., 14.,  0., 30., 10.,  3.,  8.,
       13., 27.,  6.,  9., 12.,  7., 17., 11.,  4.,  5.,  2., 23., 26.,
       25., 16.,  1., 18., 24., 22.])

In [8]:
features_df = df[['user_id', 'product_id', 'order_dow', 'order_hour_of_day', 'days_since_prior_order', 'add_to_cart_order', 'reordered']]

In [10]:
# 가장 최근에 주문한 내역 
max_order_numbers = df.groupby(['user_id', 'product_id'])['order_number'].max().reset_index()
max_order_numbers.columns = ['user_id', 'product_id', 'max_order_number']
df_search_r = pd.merge(df, max_order_numbers, on=['user_id', 'product_id'], how='left')
RFR = df_search_r[df_search_r['order_number'] == df_search_r['max_order_number']].groupby(['user_id', 'product_id'])['days_since_prior_order'].min().reset_index()

In [12]:
RFR.columns = ['user_id', 'product_id','recency']

In [14]:
user_order_frequency = df.groupby(['user_id', 'product_id'])['order_id'].nunique().reset_index()
RFR = pd.merge(RFR, user_order_frequency, on=['user_id', 'product_id'], how='inner')
RFR.rename(columns = {'order_id':'frequency'}, inplace=True)
RFR

Unnamed: 0,user_id,product_id,recency,frequency
0,1,196,30.0,10
1,1,10258,30.0,9
2,1,10326,28.0,1
3,1,12427,30.0,10
4,1,13032,30.0,3
...,...,...,...,...
13307948,206209,43961,18.0,3
13307949,206209,44325,9.0,1
13307950,206209,48370,30.0,1
13307951,206209,48697,9.0,1


In [16]:
features_df = features_df.merge(RFR[['user_id', 'product_id', 'recency']], on=['user_id', 'product_id'], how='left')  # R 값
features_df = features_df.merge(RFR[['user_id', 'product_id', 'frequency']], on=['user_id', 'product_id'], how='left')  # F 값

In [18]:
features_df['recency'].unique()

array([30., nan, 15., 29., 28., 21.,  0.,  8.,  3.,  6., 27., 13., 14.,
       10.,  7., 11., 17.,  9., 12., 20., 19.,  5.,  4.,  2., 23., 26.,
       25., 16.,  1., 18., 22., 24.])

In [20]:
features_df

Unnamed: 0,user_id,product_id,order_dow,order_hour_of_day,days_since_prior_order,add_to_cart_order,reordered,recency,frequency
0,1,196,2,8,,1,0,30.0,10
1,1,14084,2,8,,2,0,,1
2,1,12427,2,8,,3,0,30.0,10
3,1,26088,2,8,,4,0,15.0,2
4,1,26405,2,8,,5,0,29.0,2
...,...,...,...,...,...,...,...,...,...
32434484,206209,14197,1,12,7.0,5,1,7.0,2
32434485,206209,38730,1,12,7.0,6,0,7.0,1
32434486,206209,31477,1,12,7.0,7,0,7.0,1
32434487,206209,6567,1,12,7.0,8,0,7.0,1


In [22]:
quarter_index = len(features_df) // 4
df1 = features_df.iloc[:quarter_index]          # 첫 번째 1/4
df2 = features_df.iloc[quarter_index:2*quarter_index]  # 두 번째 1/4
df3 = features_df.iloc[2*quarter_index:3*quarter_index]  # 세 번째 1/4
df4 = features_df.iloc[3*quarter_index:]         # 네 번째 1/4

### 분류 나무 이용하여 예측해보기

In [24]:
X = df1.drop(['reordered'], axis=1)
y = df1['reordered']

In [27]:
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

In [29]:
estimators = [
    ('scaler', StandardScaler()),
    ('clf', DecisionTreeClassifier())
]

pipe = Pipeline(estimators)

In [31]:
pipe.set_params(clf__max_depth=3)
pipe.set_params(clf__random_state=13)

In [28]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)
# pipe.fit(X_train, y_train)

In [36]:
from sklearn.metrics import accuracy_score

y_pred_test = pipe.predict(X_test)
accuracy_score(y_test, y_pred_test)

0.8731708129216769

예측 결과 정확도 87%라는 매우 좋은 효과를 볼 수 있었습니다. 다음은 랜덤포레스트로 적용 또는 하이퍼파라미터 튜닝을 해봐야 할 것 같습니다.

### 하이퍼파라미터 튜닝
- 결과 : 16, 20이 최대 91%정도로 확인 되나 max_depth=24 되니까 점점 정확도가 내려가는 것으로 확인

In [56]:
max_depth = [6, 8, 10, 12, 16, 20, 24]

for depth in max_depth:
    df_clf = DecisionTreeClassifier(max_depth=depth, random_state=13)
    df_clf.fit(X_train, y_train)
    
    pred = df_clf.predict(X_test)
    accuracy = accuracy_score(y_test, pred)
    print('Max_depth = ', depth, ', Accuracy =', accuracy)

Max_depth =  6 , Accuracy = 0.8750328739560881
Max_depth =  8 , Accuracy = 0.8754349151165934
Max_depth =  10 , Accuracy = 0.8880765506101683
Max_depth =  12 , Accuracy = 0.896621775153548
Max_depth =  16 , Accuracy = 0.9116529965478107
Max_depth =  20 , Accuracy = 0.9172626114978222
Max_depth =  24 , Accuracy = 0.9155169389128671


### 랜덤포레스트

In [30]:
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_jobs=-1, random_state=42)
scores = cross_validate(rf, X_train, y_train, return_train_score=True, n_jobs=-1)
np.mean(scores['train_score']), np.mean(scores['test_score'])

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "C:\Users\min99\anaconda3\Lib\site-packages\joblib\_utils.py", line 72, in __call__
    return self.func(**kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\min99\anaconda3\Lib\site-packages\joblib\parallel.py", line 598, in __call__
    return [func(*args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\min99\anaconda3\Lib\site-packages\sklearn\utils\parallel.py", line 129, in __call__
    return self.function(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\min99\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py", line 192, in _parallel_build_trees
    tree._fit(
  File "C:\Users\min99\anaconda3\Lib\site-packages\sklearn\tree\_classes.py", line 303, in _fit
    y_encoded = np.zeros(y.shape, dtype=int)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
numpy.core._exceptions._ArrayMemoryError: Unable to allocate 19.8 MiB for an array with shape (5189517, 1) and data type int32
"""

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "C:\Users\min99\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\min99\anaconda3\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\min99\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py", line 489, in fit
    trees = Parallel(
            ^^^^^^^^^
  File "C:\Users\min99\anaconda3\Lib\site-packages\sklearn\utils\parallel.py", line 67, in __call__
    return super().__call__(iterable_with_config)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\min99\anaconda3\Lib\site-packages\joblib\parallel.py", line 2007, in __call__
    return output if self.return_generator else list(output)
                                                ^^^^^^^^^^^^
  File "C:\Users\min99\anaconda3\Lib\site-packages\joblib\parallel.py", line 1650, in _get_outputs
    yield from self._retrieve()
  File "C:\Users\min99\anaconda3\Lib\site-packages\joblib\parallel.py", line 1754, in _retrieve
    self._raise_error_fast()
  File "C:\Users\min99\anaconda3\Lib\site-packages\joblib\parallel.py", line 1789, in _raise_error_fast
    error_job.get_result(self.timeout)
  File "C:\Users\min99\anaconda3\Lib\site-packages\joblib\parallel.py", line 745, in get_result
    return self._return_or_raise()
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\min99\anaconda3\Lib\site-packages\joblib\parallel.py", line 763, in _return_or_raise
    raise self._result
numpy.core._exceptions._ArrayMemoryError: Unable to allocate 19.8 MiB for an array with shape (5189517, 1) and data type int32

--------------------------------------------------------------------------------
1 fits failed with the following error:
joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "C:\Users\min99\anaconda3\Lib\site-packages\joblib\_utils.py", line 72, in __call__
    return self.func(**kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\min99\anaconda3\Lib\site-packages\joblib\parallel.py", line 598, in __call__
    return [func(*args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\min99\anaconda3\Lib\site-packages\sklearn\utils\parallel.py", line 129, in __call__
    return self.function(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\min99\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py", line 192, in _parallel_build_trees
    tree._fit(
  File "C:\Users\min99\anaconda3\Lib\site-packages\sklearn\tree\_classes.py", line 294, in _fit
    check_classification_targets(y)
  File "C:\Users\min99\anaconda3\Lib\site-packages\sklearn\utils\multiclass.py", line 213, in check_classification_targets
    y_type = type_of_target(y, input_name="y")
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\min99\anaconda3\Lib\site-packages\sklearn\utils\multiclass.py", line 388, in type_of_target
    if xp.any(data != xp.astype(data, int)):
                      ^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\min99\anaconda3\Lib\site-packages\sklearn\utils\_array_api.py", line 290, in astype
    return x.astype(dtype, copy=copy, casting=casting)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
numpy.core._exceptions._ArrayMemoryError: Unable to allocate 19.8 MiB for an array with shape (5189517, 1) and data type int32
"""

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "C:\Users\min99\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\min99\anaconda3\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\min99\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py", line 489, in fit
    trees = Parallel(
            ^^^^^^^^^
  File "C:\Users\min99\anaconda3\Lib\site-packages\sklearn\utils\parallel.py", line 67, in __call__
    return super().__call__(iterable_with_config)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\min99\anaconda3\Lib\site-packages\joblib\parallel.py", line 2007, in __call__
    return output if self.return_generator else list(output)
                                                ^^^^^^^^^^^^
  File "C:\Users\min99\anaconda3\Lib\site-packages\joblib\parallel.py", line 1650, in _get_outputs
    yield from self._retrieve()
  File "C:\Users\min99\anaconda3\Lib\site-packages\joblib\parallel.py", line 1754, in _retrieve
    self._raise_error_fast()
  File "C:\Users\min99\anaconda3\Lib\site-packages\joblib\parallel.py", line 1789, in _raise_error_fast
    error_job.get_result(self.timeout)
  File "C:\Users\min99\anaconda3\Lib\site-packages\joblib\parallel.py", line 745, in get_result
    return self._return_or_raise()
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\min99\anaconda3\Lib\site-packages\joblib\parallel.py", line 763, in _return_or_raise
    raise self._result
numpy.core._exceptions._ArrayMemoryError: Unable to allocate 19.8 MiB for an array with shape (5189517, 1) and data type int32

--------------------------------------------------------------------------------
2 fits failed with the following error:
joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "C:\Users\min99\anaconda3\Lib\site-packages\joblib\_utils.py", line 72, in __call__
    return self.func(**kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\min99\anaconda3\Lib\site-packages\joblib\parallel.py", line 598, in __call__
    return [func(*args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\min99\anaconda3\Lib\site-packages\sklearn\utils\parallel.py", line 129, in __call__
    return self.function(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\min99\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py", line 182, in _parallel_build_trees
    sample_counts = np.bincount(indices, minlength=n_samples)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
numpy.core._exceptions._ArrayMemoryError: Unable to allocate 39.6 MiB for an array with shape (5189518,) and data type int64
"""

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "C:\Users\min99\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\min99\anaconda3\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\min99\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py", line 489, in fit
    trees = Parallel(
            ^^^^^^^^^
  File "C:\Users\min99\anaconda3\Lib\site-packages\sklearn\utils\parallel.py", line 67, in __call__
    return super().__call__(iterable_with_config)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\min99\anaconda3\Lib\site-packages\joblib\parallel.py", line 2007, in __call__
    return output if self.return_generator else list(output)
                                                ^^^^^^^^^^^^
  File "C:\Users\min99\anaconda3\Lib\site-packages\joblib\parallel.py", line 1650, in _get_outputs
    yield from self._retrieve()
  File "C:\Users\min99\anaconda3\Lib\site-packages\joblib\parallel.py", line 1754, in _retrieve
    self._raise_error_fast()
  File "C:\Users\min99\anaconda3\Lib\site-packages\joblib\parallel.py", line 1789, in _raise_error_fast
    error_job.get_result(self.timeout)
  File "C:\Users\min99\anaconda3\Lib\site-packages\joblib\parallel.py", line 745, in get_result
    return self._return_or_raise()
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\min99\anaconda3\Lib\site-packages\joblib\parallel.py", line 763, in _return_or_raise
    raise self._result
numpy.core._exceptions._ArrayMemoryError: Unable to allocate 39.6 MiB for an array with shape (5189518,) and data type int64

--------------------------------------------------------------------------------
1 fits failed with the following error:
joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "C:\Users\min99\anaconda3\Lib\site-packages\joblib\_utils.py", line 72, in __call__
    return self.func(**kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\min99\anaconda3\Lib\site-packages\joblib\parallel.py", line 598, in __call__
    return [func(*args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\min99\anaconda3\Lib\site-packages\sklearn\utils\parallel.py", line 129, in __call__
    return self.function(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\min99\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py", line 192, in _parallel_build_trees
    tree._fit(
  File "C:\Users\min99\anaconda3\Lib\site-packages\sklearn\tree\_classes.py", line 303, in _fit
    y_encoded = np.zeros(y.shape, dtype=int)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
numpy.core._exceptions._ArrayMemoryError: Unable to allocate 19.8 MiB for an array with shape (5189518, 1) and data type int32
"""

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "C:\Users\min99\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\min99\anaconda3\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\min99\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py", line 489, in fit
    trees = Parallel(
            ^^^^^^^^^
  File "C:\Users\min99\anaconda3\Lib\site-packages\sklearn\utils\parallel.py", line 67, in __call__
    return super().__call__(iterable_with_config)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\min99\anaconda3\Lib\site-packages\joblib\parallel.py", line 2007, in __call__
    return output if self.return_generator else list(output)
                                                ^^^^^^^^^^^^
  File "C:\Users\min99\anaconda3\Lib\site-packages\joblib\parallel.py", line 1650, in _get_outputs
    yield from self._retrieve()
  File "C:\Users\min99\anaconda3\Lib\site-packages\joblib\parallel.py", line 1754, in _retrieve
    self._raise_error_fast()
  File "C:\Users\min99\anaconda3\Lib\site-packages\joblib\parallel.py", line 1789, in _raise_error_fast
    error_job.get_result(self.timeout)
  File "C:\Users\min99\anaconda3\Lib\site-packages\joblib\parallel.py", line 745, in get_result
    return self._return_or_raise()
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\min99\anaconda3\Lib\site-packages\joblib\parallel.py", line 763, in _return_or_raise
    raise self._result
numpy.core._exceptions._ArrayMemoryError: Unable to allocate 19.8 MiB for an array with shape (5189518, 1) and data type int32


In [32]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_validate
from xgboost import XGBClassifier

xgb = XGBClassifier(tree_method='hist', random_state=13)
scores = cross_validate(xgb, X_train, y_train, return_train_score=True)
np.mean(scores['train_score']), np.mean(scores['test_score'])

MemoryError: Unable to allocate 39.6 MiB for an array with shape (5189518,) and data type int64