In [1]:
# основные модули
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# собственные модулей
from src.utils import prefilter_items
from src.metrics import precision_at_k, recall_at_k, recall, precision, evaluate_pred

# модель для 1го уровня
from src.recommenders import MainRecommender

# модели для 2го уровня
from lightgbm import LGBMClassifier

# отключение предупреждений
import warnings
warnings.filterwarnings('ignore')

In [2]:
# создание датафреймов
train = pd.read_csv('retail_train.csv')
test = pd.read_csv('retail_test1.csv')

# зегрузка фичей
item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')

In [3]:
# Снижение регистров столбцов
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

# Переименование столбцов
item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

In [4]:
# объединение датафреймов
train_ = train.merge(item_features, on='item_id', how='left')
train_ = train.merge(user_features, on='user_id', how='left')

test_ = test.merge(item_features, on='item_id', how='left')
test_ = test.merge(user_features, on='user_id', how='left')

In [5]:
# префильтрация
train_, test_ = map(prefilter_items, (train_, test_))

In [6]:
# обучение одноуровневой модели
recommender_ = MainRecommender(train_)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))




In [7]:
# создание фрейма результатов одноуровневой модели
result_ = test_.groupby('user_id')['item_id'].unique().reset_index()
result_.columns = ['user_id', 'actual']

# удаление тех пользователей, на которых модель не обучалась
result_ = result_[result_['user_id'].isin(train_['user_id'])]
result_.head()

Unnamed: 0,user_id,actual
0,1,"[999999, 883616, 940947, 959219, 991024, 10049..."
1,2,"[999999, 866211, 885023, 899624, 940947, 95141..."
2,3,"[989069, 1130858]"
3,6,"[847738, 999999, 948650, 1082398, 1100159, 127..."
4,7,"[859987, 863407, 895454, 999999, 930918, 95467..."


In [8]:
result_['user_id'].unique()

array([   1,    2,    3, ..., 2498, 2499, 2500], dtype=int64)

In [9]:
# подбор числа кандидатов


NN = [100, 150, 200, 250, 300]

for N in NN:
  
    result_['als'] = result_['user_id'].apply(lambda x: recommender_.get_als_recommendations(user=x, N=N))# apply
    result_['own'] = result_['user_id'].apply(lambda x: recommender_.get_own_recommendations(user=x, N=N))# apply
    
    als_rec = result_.apply(lambda row: recall(row['als'], row['actual']), axis=1).mean()
    own_rec = result_.apply(lambda row: recall(row['own'], row['actual']), axis=1).mean()
    
    print('Recall:')
    print('N = {}'.format(N), 'als: {}'.format(als_rec), 'own: {}'.format(own_rec))

Recall:
N = 100 als: 0.17847952706423664 own: 0.2344595757013773
Recall:
N = 150 als: 0.2173434818792079 own: 0.2946341459670466
Recall:
N = 200 als: 0.2471506493324266 own: 0.3371108046069094
Recall:
N = 250 als: 0.2716618015886675 own: 0.36921270294981307
Recall:
N = 300 als: 0.294782523915888 own: 0.3927582881859291


### Обучение модели 1 уровня

In [10]:
# разбиение на тестовые и трейновые фреймы 1го и 2го уровня
weeks = 9

# 1й уровень
train_lvl1 = train_[train_['week_no'] < train_['week_no'].max() - weeks]
test_lvl1 = train_[train_['week_no'] >= train_['week_no'].max() - weeks]

# 2й уровень
train_lvl2 = test_lvl1.copy()

In [11]:
# обучение 1го уровня модели
recommender_lvl1 = MainRecommender(train_lvl1)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5000.0), HTML(value='')))




In [12]:
# создание фрейма результатов 1го уровня
result_lvl1 = test_lvl1.groupby('user_id')['item_id'].unique().reset_index()
result_lvl1.columns = ['user_id', 'actual']

# удаление тех пользователей, на которых модель не обучалась
result_lvl1 = result_lvl1[result_lvl1['user_id'].isin(train_lvl1['user_id'])]
result_lvl1.head()

Unnamed: 0,user_id,actual
0,1,"[865456, 999999, 878285, 922281, 940947, 98389..."
1,2,"[999999, 839656, 866211, 893867, 1011457, 1103..."
2,3,[999999]
3,4,"[883932, 970760, 1035676, 999999, 831063, 8914..."
4,6,"[1024306, 6548453, 1098844, 999999, 8357613, 9..."


In [13]:
result_lvl1['own'] = result_lvl1['user_id'].apply(
    lambda x: recommender_lvl1.get_own_recommendations(user=x, N=200)
)# apply

In [14]:
result_lvl1


Unnamed: 0,user_id,actual,own
0,1,"[865456, 999999, 878285, 922281, 940947, 98389...","[856942, 9297615, 5577022, 9655212, 888104, 10..."
1,2,"[999999, 839656, 866211, 893867, 1011457, 1103...","[911974, 1076580, 5567582, 1103898, 1056620, 9..."
2,3,[999999],"[1092937, 1008714, 12132312, 1075979, 998206, ..."
3,4,"[883932, 970760, 1035676, 999999, 831063, 8914...","[6391541, 1052294, 891423, 936470, 1137010, 83..."
4,6,"[1024306, 6548453, 1098844, 999999, 8357613, 9...","[13003092, 972416, 995598, 923600, 1138596, 10..."
...,...,...,...
2253,2496,"[831509, 999999, 820321, 839243, 865456, 98239...","[983665, 872826, 991546, 1134296, 12452939, 74..."
2254,2497,"[824759, 999999, 965430, 970202, 977867, 98128...","[870515, 1102207, 1117219, 1057168, 1135834, 1..."
2255,2498,"[865511, 962991, 999999, 5564901, 844991, 9407...","[1022066, 1076580, 1100379, 5565356, 931579, 8..."
2256,2499,"[999999, 882308, 865992, 869322, 899624, 90412...","[7168055, 1128395, 6904613, 5570048, 830202, 8..."


In [15]:
result_lvl1.apply(lambda row: recall(row['own'], row['actual']), axis=1).mean()


0.36659393538071816

In [16]:
result_lvl1.apply(lambda row: precision_at_k(row['own'], row['actual']), axis=1).mean()


0.20896184560780592

### Обучение 2го уровня модели

In [17]:
# формирование фрейма с результатами прогнозирования 1го уровня для 2го уровня
result_lvl2 = pd.DataFrame(train_lvl2['user_id'].unique())
result_lvl2.columns = ['user_id']

# отбор пользователей для горячего старта
train_users = train_lvl1['user_id'].unique()
result_lvl2 = result_lvl2[result_lvl2['user_id'].isin(train_users)]

# Добавление по 100 items которые отбирает 1 уровень модели
result_lvl2['candidates'] = result_lvl2['user_id'].apply(lambda x: recommender_lvl1.get_own_recommendations(x, N=200))

result_lvl2.head()

Unnamed: 0,user_id,candidates
0,2021,"[950935, 1119454, 835578, 863762, 1019142, 102..."
1,1753,"[967041, 963686, 948640, 1057168, 942475, 9553..."
2,2120,"[5707857, 1029743, 1106523, 5569230, 916122, 8..."
3,1346,"[1135983, 5569309, 1129982, 5574377, 5569993, ..."
4,2430,"[9392700, 9803545, 1020770, 967041, 831161, 97..."


In [18]:
# формирование фрейма для 2го уровня модели

# вытаскиваем всех пользователей
users_array = result_lvl2['user_id'].values

# вытаскиваем все items
candidates_lists = result_lvl2['candidates']
len_candidates= len(candidates_lists[0])
candidates_array = candidates_lists.values

# формируем фрейм с спрогнозированными результатами
df = pd.DataFrame({'user_id':users_array.repeat(len_candidates),
                   'item_id':np.concatenate(candidates_array)})
df.head()

Unnamed: 0,user_id,item_id
0,2021,950935
1,2021,1119454
2,2021,835578
3,2021,863762
4,2021,1019142


In [19]:
# Формируем фрейм рекоммендаций user_item

# фиксируем фактическое взаимодействие
targets_train_lvl2 = train_lvl2[['user_id', 'item_id']].copy()
# отмечаем их как 1
targets_train_lvl2['target'] = 1

# объединяем фрейм фактического взаимодействия с предсказанным по совпадению пользователь-товар
targets_train_lvl2 = df.merge(targets_train_lvl2, on=['user_id', 'item_id'], how='left')
# если появятся не зафиксированные взаимодействия, отметим их как 0 
targets_train_lvl2['target'].fillna(0, inplace=True)

targets_train_lvl2

Unnamed: 0,user_id,item_id,target
0,2021,950935,0.0
1,2021,1119454,0.0
2,2021,835578,0.0
3,2021,863762,0.0
4,2021,1019142,1.0
...,...,...,...
464091,832,1000237,0.0
464092,832,904129,0.0
464093,832,909268,0.0
464094,832,823721,0.0


In [20]:
# Добавляем фичи для user, items и user_items
targets_train_lvl2 = targets_train_lvl2.merge(item_features, on='item_id', how='left')
targets_train_lvl2 = targets_train_lvl2.merge(user_features, on='user_id', how='left')

targets_train_lvl2.head()

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2021,950935,0.0,2193,GROCERY,National,FRZN NOVELTIES/WTR ICE,WATER ICE,24 CT,,,,,,,
1,2021,1119454,0.0,910,GROCERY,National,BAKED BREAD/BUNS/ROLLS,HAMBURGER BUNS,8 OZ,,,,,,,
2,2021,835578,0.0,539,DRUG GM,National,CIGARETTES,CIGARETTES,971267 PK,,,,,,,
3,2021,863762,0.0,6046,DRUG GM,National,PREPAID WIRELESS&ACCESSORIES,PREPAID WIRELESS CARDS,,,,,,,,
4,2021,1019142,1.0,1007,MEAT-PCKGD,National,BREAKFAST SAUSAGE/SANDWICHES,ROLLS - FLAVORED/OTHER,1 LB,,,,,,,


In [21]:
X_train = targets_train_lvl2.drop('target', axis=1)
y_train = targets_train_lvl2[['target']]

# Отмечаем категориальные признаки
cat_feats = X_train.columns[2:].tolist()
X_train[cat_feats] = X_train[cat_feats].astype('category')

### Обучение LightGBM

In [22]:
lgb = LGBMClassifier(objective='binary', max_depth=7, categorical_column=cat_feats)
lgb.fit(X_train, y_train)

# прогнозирование train выборки
train_preds = lgb.predict(X_train)
train_preds

array([0., 0., 0., ..., 0., 0., 0.])

In [23]:
def eval_lgbm(targets, preds, k=5):
    targets['recommend'] = preds
    targets = targets[['user_id', 'item_id', 'target', 'recommend']]
    
    target = targets[targets['target'] == 1]
    target = target.groupby('user_id')['item_id'].unique().reset_index()
    target.columns = ['user_id', 'target']
    
    recommend = targets[targets['recommend'] == 1]
    recommend = recommend.groupby('user_id')['item_id'].unique().reset_index()
    recommend.columns = ['user_id', 'recommend']
    
    target_recommend = target.merge(recommend, on='user_id')
    
    result = evaluate_pred(data=target_recommend, true='target', k=k)
    
    return result

In [24]:
eval_lgbm(targets=targets_train_lvl2, preds=train_preds)


recommend : 0.5254981884057971


{'recommend': 0.5254981884057971}

In [25]:
targets_test_lvl2 = test[['user_id', 'item_id']].copy()
targets_test_lvl2['target'] = 1

targets_test_lvl2 = df.merge(targets_test_lvl2, on=['user_id', 'item_id'], how='left')

targets_test_lvl2['target'].fillna(0, inplace= True)

targets_test_lvl2

Unnamed: 0,user_id,item_id,target
0,2021,950935,0.0
1,2021,1119454,0.0
2,2021,835578,0.0
3,2021,863762,0.0
4,2021,1019142,0.0
...,...,...,...
453093,832,1000237,0.0
453094,832,904129,0.0
453095,832,909268,0.0
453096,832,823721,0.0


In [26]:

targets_test_lvl2 = targets_test_lvl2.merge(item_features, on='item_id', how='left')
targets_test_lvl2 = targets_test_lvl2.merge(user_features, on='user_id', how='left')

targets_test_lvl2.head()

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2021,950935,0.0,2193,GROCERY,National,FRZN NOVELTIES/WTR ICE,WATER ICE,24 CT,,,,,,,
1,2021,1119454,0.0,910,GROCERY,National,BAKED BREAD/BUNS/ROLLS,HAMBURGER BUNS,8 OZ,,,,,,,
2,2021,835578,0.0,539,DRUG GM,National,CIGARETTES,CIGARETTES,971267 PK,,,,,,,
3,2021,863762,0.0,6046,DRUG GM,National,PREPAID WIRELESS&ACCESSORIES,PREPAID WIRELESS CARDS,,,,,,,,
4,2021,1019142,0.0,1007,MEAT-PCKGD,National,BREAKFAST SAUSAGE/SANDWICHES,ROLLS - FLAVORED/OTHER,1 LB,,,,,,,


In [28]:
X_test = targets_test_lvl2.drop('target', axis=1)
y_test = targets_test_lvl2[['target']]

X_test[cat_feats] = X_test[cat_feats].astype('category')

In [29]:
test_preds = lgb.predict(X_test)
test_preds

array([0., 0., 0., ..., 0., 0., 0.])

In [30]:
eval_lgbm(targets=targets_test_lvl2, preds=test_preds)


recommend : 0.22539920159680646


{'recommend': 0.22539920159680646}