In [1]:
import sys
from pathlib import Path
# Make the notebook's working directory importable so the local recommender
# modules can be found. sys.path entries must be strings: a pathlib.Path
# object is ignored by the import machinery, so convert explicitly.
sys.path.append(str(Path().absolute()))

In [2]:
from item_based_recommender import ItemBasedRecommender, split
from user_based_recommender import UserBasedRecommender
from most_popular_recommender import MostPopularRecommender
from ensemble import Ensemble
import pandas as pd
import numpy as np
import pickle
import random

In [9]:
# --- Load the raw tables -----------------------------------------------------
orders = pd.read_csv('orders.csv')
order_products_train = pd.read_csv('order_products__train.csv')
order_products_prior = pd.read_csv('order_products__prior.csv')

# Stack both order/product tables, keep only the two linking columns and
# persist the result so the pipeline functions can re-read it from disk.
order_products = pd.concat(
    (order_products_train, order_products_prior)
).loc[:, ['order_id', 'product_id']]
order_products.to_csv('order_products.csv', index=False)

# split() receives the full orders table; the column projection happens after.
train_indices, validation_indices, test_indices = split(orders)
orders = orders.loc[:, ['user_id', 'order_id']]

# Hold-out frames.
validation_df = orders.loc[validation_indices]
test_df = orders.loc[test_indices]

# The "train" frame deliberately contains the train AND the validation orders.
train_df = pd.concat((orders.loc[train_indices], orders.loc[validation_indices]))

# Order/product rows that belong to the train+validation orders.
train_valid_order_products = order_products[
    order_products['order_id'].isin(train_df['order_id'])
]
train_valid_order_products.to_csv('train_valid_order_products.csv', index=False)

In [12]:
def pipeline(recommender, loop_num, user_num, predictions_num):
    """Fit ``recommender`` on freshly split data and pickle sampled predictions.

    Each of the ``loop_num`` rounds:

    1. re-reads ``orders.csv`` and ``order_products.csv`` and calls ``split``
       on the orders (presumably a new random split per call -- confirm
       against ``split``'s implementation);
    2. writes the order/product rows of the train+validation orders to
       ``train_valid_order_products.csv``;
    3. for ``MostPopularRecommender`` and ``ItemBasedRecommender`` replaces the
       recommender's data attributes with the new split (no attributes are
       set for ``UserBasedRecommender``, which is built from file paths);
    4. calls ``recommender.fit()``;
    5. draws a random sample of predicted products for user ids
       ``1..user_num`` and pickles the ``{user_id: [product, ...]}`` dict to
       ``'<predictions_num>predBy<ClassName><round>.pickle'``.

    Parameters
    ----------
    recommender : object
        Recommender instance; behaviour is dispatched on its class name.
        An unrecognised class is still fitted but gets empty lists.
    loop_num : int
        Number of split/fit/predict rounds.
    user_num : int
        Predictions are produced for user ids ``1`` to ``user_num`` inclusive.
    predictions_num : int
        Maximum number of products sampled per user. If the recommender
        returns fewer candidates, all of them are kept (in random order)
        instead of raising ``ValueError`` from ``random.sample``.
    """
    recommender_class_name = type(recommender).__name__
    for i in range(loop_num):
        # Re-read inside the loop: `orders` is narrowed below, and the
        # recommender keeps a reference to `order_products`.
        orders = pd.read_csv('orders.csv')
        order_products = pd.read_csv('order_products.csv')
        train_indices, validation_indices, test_indices = split(orders)

        orders = orders[['user_id', 'order_id']]
        # The training frame intentionally covers train + validation orders.
        train_df = orders.loc[np.concatenate([train_indices, validation_indices])]
        train_valid_order_products = order_products[order_products['order_id'].isin(train_df['order_id'])]
        train_valid_order_products.to_csv('train_valid_order_products.csv', index=False)
        validation_df = orders.loc[validation_indices]
        test_df = orders.loc[test_indices]

        if recommender_class_name == 'MostPopularRecommender':
            recommender.train = train_df
            recommender.validation = validation_df
            recommender.test = test_df
            recommender.order_products_df = order_products
        elif recommender_class_name == 'ItemBasedRecommender':
            recommender.train_data = pd.merge(train_df, order_products, on='order_id')
            recommender.validation_data = pd.merge(validation_df, order_products, on='order_id')
            recommender.test_data = pd.merge(test_df, order_products, on='order_id')

        recommender.fit()

        recommendations = {}
        for user_id in range(1, user_num + 1):
            if recommender_class_name == 'MostPopularRecommender':
                candidates = recommender.predict([user_id], 100)['predictions'][0]
            elif recommender_class_name == 'UserBasedRecommender':
                candidates = recommender.predict(user_id)[user_id]
            elif recommender_class_name == 'ItemBasedRecommender':
                candidates = recommender.predict(user_id)
            else:
                candidates = []

            # random.sample needs a sequence and raises ValueError when asked
            # for more items than are available, so materialise and cap.
            candidates = list(candidates)
            if candidates:
                sample_size = min(predictions_num, len(candidates))
                recommendations[user_id] = random.sample(candidates, sample_size)
            else:
                recommendations[user_id] = []

        # For now the predictions are only saved; later they should be passed
        # to a function that adds everything to the main tables.
        out_name = '{}predBy{}{}.pickle'.format(predictions_num, recommender_class_name, i)
        with open(out_name, 'wb') as out:
            pickle.dump(recommendations, out)

In [15]:
# Run each stand-alone recommender through one round of the pipeline:
# 100 users, 7 sampled predictions per user.
recommender1 = MostPopularRecommender(train_df, validation_df, test_df, orders, order_products)
pipeline(recommender1, 1, 100, 7)

# pipeline() assigns merged train/validation/test frames to this recommender's
# data attributes before calling fit().
recommender2 = ItemBasedRecommender(train_df, validation_df, test_df)
pipeline(recommender2, 1, 100, 7)

# Point the user-based recommender at the file pipeline() actually writes
# ('train_valid_order_products.csv'); no cell creates 'train_valid.csv'.
recommender3 = UserBasedRecommender('orders.csv', 'train_valid_order_products.csv', 10)
pipeline(recommender3, 1, 100, 7)

In [6]:
def pipeline_for_ensemble(loop_num, predictions_num):
    """Fit the three base recommenders plus the ensemble and pickle predictions.

    User ids are loaded once from ``users_subsample.pickle``. Each of the
    ``loop_num`` rounds then:

    1. re-reads ``orders.csv`` / ``order_products.csv``, calls ``split`` and
       writes the train+validation order/product rows to
       ``train_valid_order_products.csv``;
    2. builds ``actual`` -- the products of each user's test orders;
    3. fits MostPopular, ItemBased and UserBased recommenders, collects their
       predictions for every user id and pickles each dict to
       ``'predBy<ClassName><round>.pickle'``;
    4. fits ``Ensemble`` on the three prediction dicts and ``actual``, takes
       its recommendations, randomly samples up to ``predictions_num``
       products per user and pickles the result to
       ``'<predictions_num>predByEnsemble<round>.pickle'``.

    Parameters
    ----------
    loop_num : int
        Number of split/fit/predict rounds.
    predictions_num : int
        Maximum number of products kept per user. Users with fewer
        recommended products keep all of them instead of triggering
        ``ValueError`` in ``random.sample``.

    Notes
    -----
    The user-based predictions file now carries the round index like the
    other two base-model files, so rounds no longer overwrite each other.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only use a
    # users_subsample.pickle that comes from a trusted source.
    with open('users_subsample.pickle', 'rb') as inp:
        user_ids = pickle.load(inp)

    for i in range(loop_num):
        orders = pd.read_csv('orders.csv')
        order_products = pd.read_csv('order_products.csv')
        train_indices, validation_indices, test_indices = split(orders)

        orders = orders[['user_id', 'order_id']]
        # The training frame intentionally covers train + validation orders.
        train_df = orders.loc[np.concatenate([train_indices, validation_indices])]
        train_valid_order_products = order_products[order_products['order_id'].isin(train_df['order_id'])]
        train_valid_order_products.to_csv('train_valid_order_products.csv', index=False)
        validation_df = orders.loc[validation_indices]
        test_df = orders.loc[test_indices]

        # Ground truth: products bought in each user's test orders.
        test_merged = pd.merge(test_df, order_products, on='order_id')
        actual = dict(test_merged.groupby('user_id')['product_id'].apply(list))

        mpr = MostPopularRecommender(train_df, validation_df, test_df, orders, order_products)
        mpr.fit()
        mpr_pred = {user: mpr.predict([user], 100)['predictions'][0] for user in user_ids}
        with open('predByMostPopularRecommender' + str(i) + '.pickle', 'wb') as out:
            pickle.dump(mpr_pred, out)

        # `actual` is already materialised, so the merged test frame can be
        # handed to the recommender instead of merging a second time.
        ibr = ItemBasedRecommender(pd.merge(train_df, order_products, on='order_id'),
                                   pd.merge(validation_df, order_products, on='order_id'),
                                   test_merged)
        ibr.fit()
        ibr_pred = {user: ibr.predict(user) for user in user_ids}
        with open('predByItemBasedRecommender' + str(i) + '.pickle', 'wb') as out:
            pickle.dump(ibr_pred, out)

        # Receives the path of the file written above for this round.
        ubr = UserBasedRecommender('orders.csv', 'train_valid_order_products.csv', 10)
        ubr.fit()
        ubr_pred = {user: ubr.predict(user_id=user)[user] for user in user_ids}
        # Include the round index, consistent with the two files above.
        with open('predByUserBasedRecommender' + str(i) + '.pickle', 'wb') as out:
            pickle.dump(ubr_pred, out)

        ensemble = Ensemble(mpr_pred, ibr_pred, ubr_pred, actual)
        ensemble.fit(user_ids)
        recommendations, models = ensemble.predict()

        # Cap the sample size: random.sample raises ValueError when asked for
        # more items than the population holds.
        sampled = {}
        for user, products in recommendations.items():
            pool = list(products)
            sampled[user] = random.sample(pool, min(predictions_num, len(pool))) if pool else []

        # For now the predictions are only saved; later they should be passed
        # to a function that adds everything to the main tables.
        with open(str(predictions_num) + 'predByEnsemble' + str(i) + '.pickle', 'wb') as out:
            pickle.dump(sampled, out)