In [1]:
# import packages
import time, math, os
from tqdm import tqdm
import gc
import collections
import pickle
import random
from datetime import datetime
from operator import itemgetter
import numpy as np
import pandas as pd
import warnings
from collections import defaultdict
warnings.filterwarnings('ignore')

In [2]:
data_path = './data_raw/'
save_path = './tmp_results/'

In [4]:
# function to reduce memory
def reduce_mem(df):
    starttime = time.time()
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if pd.isnull(c_min) or pd.isnull(c_max):
                continue
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    print('-- Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction),time spend:{:2.2f} min'.format(end_mem,
                                                                                                           100*(start_mem-end_mem)/start_mem,
                                                                                                           (time.time()-starttime)/60))
    return df



In [5]:
# debug mode
def get_all_click_sample(data_path, sample_nums=10000):
    """
        sampling a portion from trainng test to debug
        sample_nums: number of samples
    """
    all_click = pd.read_csv(data_path + 'train_click_log.csv')
    all_user_ids = all_click.user_id.unique()

    sample_user_ids = np.random.choice(all_user_ids, size=sample_nums, replace=False) 
    all_click = all_click[all_click['user_id'].isin(sample_user_ids)]
    
    all_click = all_click.drop_duplicates((['user_id', 'click_article_id', 'click_timestamp']))
    return all_click

In [10]:
# read click data
def get_all_click_df(data_path='./data_raw/', offline=True):
    if offline:
        all_click = pd.read_csv(data_path + 'train_click_log.csv')
    else:
        trn_click = pd.read_csv(data_path + 'train_click_log.csv')
        tst_click = pd.read_csv(data_path + 'testA_click_log.csv')

        all_click = pd.concat([trn_click, tst_click], ignore_index=True)

    all_click = all_click.drop_duplicates((['user_id', 'click_article_id', 'click_timestamp']))
    return all_click

In [11]:
# load the whole dataset
all_click_df = get_all_click_df(offline=False)

In [19]:
all_click_df.head


<bound method NDFrame.head of          user_id  click_article_id  click_timestamp  click_environment  \
0         199999            160417    1507029570190                  4   
1         199999              5408    1507029571478                  4   
2         199999             50823    1507029601478                  4   
3         199998            157770    1507029532200                  4   
4         199998             96613    1507029671831                  4   
...          ...               ...              ...                ...   
1630628   221924             70758    1508211323220                  4   
1630629   207823            331116    1508211542618                  4   
1630630   207823            234481    1508211850103                  4   
1630631   207823            211442    1508212189949                  4   
1630632   207823            211401    1508212315718                  4   

         click_deviceGroup  click_os  click_country  click_region  \
0           

# Getting user-item-click_time dictionary

In [20]:
#    {user1: {item1: time1, item2: time2..}...}
def get_user_item_time(click_df):
    # Sort the DataFrame by timestamp to maintain chronological order
    click_df = click_df.sort_values('click_timestamp')
    
    # Group by 'user_id' and aggregate the columns into a list of tuples
    user_item_time_df = (
        click_df.groupby('user_id')[['click_article_id', 'click_timestamp']]
        .apply(lambda x: list(zip(x['click_article_id'], x['click_timestamp'])))
        .reset_index(name='item_time_list')
    )
    
    # Create a dictionary with 'user_id' as keys and 'item_time_list' as values
    user_item_time_dict = dict(zip(user_item_time_df['user_id'], user_item_time_df['item_time_list']))
    
    return user_item_time_dict

# Getting top k articles with largest nunmber of clicks

In [14]:
def get_item_topk_click(click_df, k):
    topk_click = click_df['click_article_id'].value_counts().index[:k]
    return topk_click

In [21]:
def itemcf_sim(df):
    """
        compute item to item similarity matrix
        :param df: df
        :item_created_time_dict:  dict contains article creat time
        return : similarity matrix
    """
    
    user_item_time_dict = get_user_item_time(df)
    
    i2i_sim = {}
    item_cnt = defaultdict(int)
    for user, item_time_list in tqdm(user_item_time_dict.items()):
        # temporal considerations
        for i, i_click_time in item_time_list:
            item_cnt[i] += 1
            i2i_sim.setdefault(i, {})
            for j, j_click_time in item_time_list:
                if(i == j):
                    continue
                i2i_sim[i].setdefault(j, 0)
                
                i2i_sim[i][j] += 1 / math.log(len(item_time_list) + 1)
                
    i2i_sim_ = i2i_sim.copy()
    for i, related_items in i2i_sim.items():
        for j, wij in related_items.items():
            i2i_sim_[i][j] = wij / math.sqrt(item_cnt[i] * item_cnt[j])
    
    pickle.dump(i2i_sim_, open(save_path + 'itemcf_i2i_sim.pkl', 'wb'))
    
    return i2i_sim_

In [23]:
i2i_sim = itemcf_sim(all_click_df)

100%|████████████████████████████████| 250000/250000 [00:09<00:00, 25414.53it/s]


# Recall by item-based collaborative filtering

In [24]:
def item_based_recommend(user_id, user_item_time_dict, i2i_sim, sim_item_topk, recall_item_num, item_topk_click):

    """
        :param user_id: user id
        :param user_item_time_dict:dict,   {user1: {item1: time1, item2: time2..}...}
        :param i2i_sim: dict,，item similarity matrix
        :param sim_item_topk: int，top k articles closed to the current one
        :param recall_item_num: int， number of recalled articles
        :param item_topk_click: list，articules with top k numbe of clicks       
        return: list {item1:score1, item2: score2...}
        
    """
    
    # list of items in user's browsing history
    user_hist_items = user_item_time_dict[user_id]
    
    item_rank = {}
    for loc, (i, click_time) in enumerate(user_hist_items):
        for j, wij in sorted(i2i_sim[i].items(), key=lambda x: x[1], reverse=True)[:sim_item_topk]:
            if j in user_hist_items:
                continue
                
            item_rank.setdefault(j, 0)
            item_rank[j] +=  wij
    
    # the gap is filled with popular items (item_topk_click) that are not already present in item_rank
    if len(item_rank) < recall_item_num:
        for i, item in enumerate(item_topk_click):
            if item in item_rank.items(): 
                continue
            item_rank[item] = - i - 100 # any negative number
            if len(item_rank) == recall_item_num:
                break
    
    item_rank = sorted(item_rank.items(), key=lambda x: x[1], reverse=True)[:recall_item_num]
        
    return item_rank

In [26]:
# 

In [27]:
user_recall_items_dict = collections.defaultdict(dict)

# load dict
user_item_time_dict = get_user_item_time(all_click_df)

# load item to item similarity
i2i_sim = pickle.load(open(save_path + 'itemcf_i2i_sim.pkl', 'rb'))

# number of similar items
sim_item_topk = 10

# number of recalled items
recall_item_num = 10

# filling with popular items
item_topk_click = get_item_topk_click(all_click_df, k=50)

for user in tqdm(all_click_df['user_id'].unique()):
    user_recall_items_dict[user] = item_based_recommend(user, user_item_time_dict, i2i_sim, 
                                                        sim_item_topk, recall_item_num, item_topk_click)

100%|██████████████████████████████████| 250000/250000 [19:10<00:00, 217.36it/s]


In [28]:
# convert dict to df
user_item_score_list = []

for user, items in tqdm(user_recall_items_dict.items()):
    for item, score in items:
        user_item_score_list.append([user, item, score])

recall_df = pd.DataFrame(user_item_score_list, columns=['user_id', 'click_article_id', 'pred_score'])


100%|████████████████████████████████| 250000/250000 [00:03<00:00, 82882.87it/s]


In [29]:
# generate file for submission
def submit(recall_df, topk=5, model_name=None):
    recall_df = recall_df.sort_values(by=['user_id', 'pred_score'])
    recall_df['rank'] = recall_df.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')
    
    tmp = recall_df.groupby('user_id').apply(lambda x: x['rank'].max())
    assert tmp.min() >= topk
    
    del recall_df['pred_score']
    submit = recall_df[recall_df['rank'] <= topk].set_index(['user_id', 'rank']).unstack(-1).reset_index()
    
    submit.columns = [int(col) if isinstance(col, int) else col for col in submit.columns.droplevel(0)]

    submit = submit.rename(columns={'': 'user_id', 1: 'article_1', 2: 'article_2', 
                                                  3: 'article_3', 4: 'article_4', 5: 'article_5'})
    
    save_name = save_path + model_name + '_' + datetime.today().strftime('%m-%d') + '.csv'
    submit.to_csv(save_name, index=False, header=True)

In [30]:
# read test data
tst_click = pd.read_csv(data_path + 'testA_click_log.csv')
tst_users = tst_click['user_id'].unique()


tst_recall = recall_df[recall_df['user_id'].isin(tst_users)]

# generate file for submission
submit(tst_recall, topk=5, model_name='itemcf_baseline')