In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

In [2]:
import sys
# Put the parent directory on the import path; presumably that is where the
# project-local `myutils` package lives (verify against the repo layout).
sys.path.append('../')

# MAP@k metric, used to score the candidates in the CV section.
from myutils.metrics import mapk

In [3]:
# Prefix embedded in every pickle path this notebook writes.
OUTPUT_NAME = 'nmf'

In [4]:
# Session view logs (train / test), the train labels and the accommodation master table.
train_log_df = pd.read_csv('../datasets/atmaCup16_Dataset/train_log.csv')
test_log_df = pd.read_csv('../datasets/atmaCup16_Dataset/test_log.csv')
train_df = pd.read_csv('../datasets/atmaCup16_Dataset/train_label.csv')
yado_df = pd.read_csv('../datasets/atmaCup16_Dataset/yado.csv')

# One row per session: the frames the candidates are later joined onto.
test_session_only_df = pd.read_csv('../datasets/atmaCup16_Dataset/test_session.csv')
train_session_only_df = train_df.loc[:, ['session_id']]

# Train and test logs stacked into one frame with a fresh 0..N-1 index.
train_test_log_df = pd.concat([train_log_df, test_log_df], axis=0, ignore_index=True)

In [5]:
# One list of yad_no per session, rows kept in the order they appear in the combined log.
sentence_list = [yad_nos.tolist() for _, yad_nos in train_test_log_df.groupby('session_id')['yad_no']]

In [6]:
# Undirected co-visitation graph: every pair of consecutive views in a session
# contributes both directed edges. yad_no is shifted by one to get 0-based indices.
edge_set = set()
for items in sentence_list:
    # Sessions with a single view yield no pairs, so nothing is added for them.
    for src, dst in zip(items, items[1:]):
        edge_set.add((src - 1, dst - 1))
        edge_set.add((dst - 1, src - 1))
edges = list(edge_set)

In [7]:
# Every edge gets weight 1.0 (binary adjacency; the set above already removed duplicates).
sparse_data = np.ones(len(edges), dtype=np.float64)

In [8]:
# Import the submodule explicitly: a bare `import scipy` does not guarantee that
# `scipy.sparse` has been loaded, so `scipy.sparse.csr_matrix` could otherwise
# work only because some other library happened to import it first.
import scipy.sparse

# The matrix is square with one row/column per entry of yado.csv.
n = len(yado_df)

# Split the (row, col) edge tuples into the two coordinate lists of the COO-style constructor.
sparse_row = [edge[0] for edge in edges]
sparse_col = [edge[1] for edge in edges]
# NOTE: this variable shares its name with the scipy class; the name is kept
# because the NMF cell reads it.
csr_matrix = scipy.sparse.csr_matrix((sparse_data, (sparse_row, sparse_col)), shape=(n, n))

## NMF

In [9]:
%%time

from sklearn.decomposition import NMF

# Number of latent factors; also drives the factor column names built further down.
dimensions = 256

# random_state is fixed so repeated runs use the same seed.
nmf = NMF(n_components=dimensions, random_state=42, max_iter=1000)
# U has one row per row of the adjacency matrix; V holds the fitted components.
U = nmf.fit_transform(csr_matrix)
V = nmf.components_

CPU times: user 3min 3s, sys: 2min 30s, total: 5min 34s
Wall time: 2min 5s


In [10]:
# Sanity check: U should be (n_items, dimensions) and V (dimensions, n_items).
U.shape, V.shape

((13806, 256), (256, 13806))

In [11]:
# Use the row factors U as the item embeddings; row `yad_no - 1` belongs to `yad_no`
# because the adjacency matrix was indexed with that same shift.
item_vecs = U

In [12]:
# Embed every yad_no listed in yado.csv
# (alternative source: train_test_log_df['yad_no'].unique().tolist()).
item_ids = yado_df['yad_no'].to_list()
item_cols = [f"item_factor_{i}" for i in range(dimensions)]

# yad_no -> factor vector, taken from row `yad_no - 1` of the factor matrix.
item_factors = {yad_no: item_vecs[yad_no - 1, :] for yad_no in item_ids}

# One row per yad_no: the id first, then the `dimensions` factor columns.
item_factors_df = pd.DataFrame.from_dict(item_factors, orient='index').reset_index()
item_factors_df.columns = ["yad_no"] + item_cols

In [13]:
# session_id -> list of yad_no viewed in that session (rows in log order).
user_item_list_dict = {session_id: yad_nos.tolist() for session_id, yad_nos in train_test_log_df.groupby('session_id')['yad_no']}
user_cols = [f"user_factor_{i}" for i in range(dimensions)]

# Session embedding #1: mean of the factor vectors of every item viewed in the session.
user_factors = {user_id: np.mean([item_factors[item_id] for item_id in user_item_list], axis=0) for user_id, user_item_list in user_item_list_dict.items()}
# Stack the vectors row-wise instead of `pd.DataFrame(dict).T`, which first builds a
# frame with one column per session and then transposes it. The result is the same:
# a `session_id` column followed by the factor columns.
user_factors_df = pd.DataFrame(np.vstack(list(user_factors.values())), columns=user_cols)
user_factors_df.insert(0, "session_id", list(user_factors.keys()))

# Session embedding #2: factor vector of the last item viewed in the session.
user_factors_last_item = {user_id: item_factors[user_item_list[-1]] for user_id, user_item_list in user_item_list_dict.items()}
user_factors_last_item_df = pd.DataFrame(np.vstack(list(user_factors_last_item.values())), columns=user_cols)
user_factors_last_item_df.insert(0, "session_id", list(user_factors_last_item.keys()))

In [14]:
def cos_sim(v1, v2):
    """Return the cosine similarity between two 1-D vectors.

    Parameters
    ----------
    v1, v2 : array-like
        Vectors of equal length.

    Returns
    -------
    float
        ``dot(v1, v2) / (|v1| * |v2|)``. When either vector has zero norm the
        ratio is undefined (0/0); ``0.0`` is returned in that case instead of
        NaN with a RuntimeWarning.
    """
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denom == 0:
        # Guard against 0/0 for all-zero vectors.
        return 0.0
    return np.dot(v1, v2) / denom

In [15]:
from sklearn.neighbors import NearestNeighbors


# Query vectors: the mean-of-items session embeddings.
# (To query with the last-item embeddings instead, read from user_factors_last_item_df.)
user_vecs = user_factors_df.loc[:, [f'user_factor_{k}' for k in range(dimensions)]].to_numpy()
# Index vectors: one row per row of item_factors_df.
item_vecs = item_factors_df.loc[:, [f'item_factor_{k}' for k in range(dimensions)]].to_numpy()

# For each session vector, fetch the 20 items with the smallest cosine distance.
nn = NearestNeighbors(n_neighbors=20, metric='cosine').fit(item_vecs)
dists, indices = nn.kneighbors(user_vecs)

In [16]:
# session_id -> last yad_no of the session's view list (filtered out of the candidates below).
user_last_item_dict = {sid: viewed[-1] for sid, viewed in user_item_list_dict.items()}

In [17]:
# Row r of the matrix the neighbour model was fitted on belongs to
# item_factors_df['yad_no'].iloc[r]; translate through that column instead of
# assuming `row + 1`, which only holds when yado.csv is sorted 1..n.
row_to_yad_no = item_factors_df['yad_no'].to_numpy()

# Same session order that produced the rows of `user_vecs` (and therefore of `indices`).
session_ids = np.array(list(user_item_list_dict.keys()), dtype=object)
last_seen = np.array([user_last_item_dict[session_id] for session_id in session_ids])

# (n_sessions, n_neighbors) candidate ids; the width comes from the query result
# rather than a hard-coded 20.
candidate_yad_nos = row_to_yad_no[indices]
n_neighbors = indices.shape[1]

# Drop candidates equal to the session's last-viewed item.
keep = candidate_yad_nos != last_seen[:, None]

# Boolean-mask indexing walks the arrays row by row, so rows come out session by
# session in neighbour order (nearest first). The similarity column keeps the name
# `word2vec_sim` because make_candidate_word2vec reads it.
word2vec_sim_df = pd.DataFrame({
    'session_id': np.repeat(session_ids, n_neighbors)[keep.ravel()],
    'yad_no': candidate_yad_nos[keep],
    'word2vec_sim': (1 - dists)[keep],
})

In [18]:
# Preview the candidate table (session_id, yad_no, similarity).
word2vec_sim_df

Unnamed: 0,session_id,yad_no,word2vec_sim
0,000007603d533d30453cc45d0f3d119f,8927,0.999902
1,000007603d533d30453cc45d0f3d119f,13544,0.999647
2,000007603d533d30453cc45d0f3d119f,10740,0.999647
3,000007603d533d30453cc45d0f3d119f,5821,0.999647
4,000007603d533d30453cc45d0f3d119f,13697,0.999647
...,...,...,...
8850452,fffffa7baf370083ebcdd98f26a7e31a,8609,0.992726
8850453,fffffa7baf370083ebcdd98f26a7e31a,8462,0.992726
8850454,fffffa7baf370083ebcdd98f26a7e31a,6247,0.992726
8850455,fffffa7baf370083ebcdd98f26a7e31a,6524,0.992116


In [19]:
def make_candidate_word2vec(session_only_df: pd.DataFrame, word2vec_sim_df: pd.DataFrame):
    """Attach similarity candidates to a set of sessions.

    Parameters
    ----------
    session_only_df : pd.DataFrame
        Frame with a ``session_id`` column; its row order defines the row
        order of the returned ``pred_df``.
    word2vec_sim_df : pd.DataFrame
        Candidate table with ``session_id``, ``yad_no`` and ``word2vec_sim``
        columns.

    Returns
    -------
    session_df : pd.DataFrame
        Long-format candidates for the requested sessions (sessions without
        candidates are dropped) with an added ``rank`` column: 1 = most
        similar within the session, ties share the lowest rank.
    pred_df : pd.DataFrame
        One row per row of ``session_only_df``; ``yad_no`` holds the list of
        candidates ordered from most to least similar, or ``[]`` when the
        session has no candidates.
    """
    session_df = session_only_df.merge(word2vec_sim_df, on=['session_id'], how='left')

    # The left merge leaves NaN for sessions without candidates; drop those rows
    # and restore the integer dtype the NaNs forced to float.
    session_df = session_df[session_df['yad_no'].notnull()].reset_index(drop=True)
    session_df['yad_no'] = session_df['yad_no'].astype(int)

    session_df['rank'] = session_df.groupby('session_id')['word2vec_sim'].rank(ascending=False, method='min')

    # Order by rank before collecting the lists so the prediction order no longer
    # depends on the incoming row order. The stable sort keeps tied rows in their
    # original relative order, so input that is already sorted gives the same lists.
    ranked_df = session_df.sort_values('rank', kind='stable')
    pred_df = ranked_df.groupby('session_id')['yad_no'].apply(lambda x: x.tolist()).reset_index()
    pred_df = pd.merge(session_only_df, pred_df, on='session_id', how='left')

    # Sessions without any candidate get an empty list instead of NaN.
    pred_df['yad_no'] = pred_df['yad_no'].apply(lambda d: d if isinstance(d, list) else [])

    return session_df, pred_df

In [20]:
# The candidate table covers train and test sessions together; split it back
# by joining onto each session list.
train_session_df, train_pred_df = make_candidate_word2vec(train_session_only_df, word2vec_sim_df)
test_session_df, test_pred_df = make_candidate_word2vec(test_session_only_df, word2vec_sim_df)

In [21]:
# Preview the train candidates with their per-session rank.
train_session_df

Unnamed: 0,session_id,yad_no,word2vec_sim,rank
0,000007603d533d30453cc45d0f3d119f,8927,0.999902,1.0
1,000007603d533d30453cc45d0f3d119f,13544,0.999647,2.0
2,000007603d533d30453cc45d0f3d119f,10740,0.999647,2.0
3,000007603d533d30453cc45d0f3d119f,5821,0.999647,2.0
4,000007603d533d30453cc45d0f3d119f,13697,0.999647,2.0
...,...,...,...,...
5513709,fffffa7baf370083ebcdd98f26a7e31a,8609,0.992726,14.0
5513710,fffffa7baf370083ebcdd98f26a7e31a,8462,0.992726,14.0
5513711,fffffa7baf370083ebcdd98f26a7e31a,6247,0.992726,17.0
5513712,fffffa7baf370083ebcdd98f26a7e31a,6524,0.992116,18.0


## CV: MAP@10 of the candidates on the train sessions

In [22]:
# MAP@10 of the candidate lists against the train labels. The two lists line up
# row by row because train_pred_df was left-merged onto train_df's session_id column.
mapk(train_df['yad_no'].to_list(), train_pred_df['yad_no'].to_list(), k=10)

0.15092698065216062

In [23]:
import os

# Create the output directory first so the writes do not fail on a fresh checkout.
os.makedirs('../datasets/candidate_df', exist_ok=True)

# Long-format candidates (session_id, yad_no, word2vec_sim, rank) for train and test.
train_session_df.to_pickle(f'../datasets/candidate_df/{OUTPUT_NAME}_train.pkl')
test_session_df.to_pickle(f'../datasets/candidate_df/{OUTPUT_NAME}_test.pkl')

In [24]:
import os
# Directory for the factor tables; created if it does not exist yet.
os.makedirs(f'../datasets/{OUTPUT_NAME}_pkl', exist_ok=True)

# Persist the session embeddings (mean-of-items and last-item variants) and the item embeddings.
user_factors_df.to_pickle(f'../datasets/{OUTPUT_NAME}_pkl/{OUTPUT_NAME}_user_factors_df.pkl')
user_factors_last_item_df.to_pickle(f'../datasets/{OUTPUT_NAME}_pkl/{OUTPUT_NAME}_user_factors_last_item_df.pkl')
item_factors_df.to_pickle(f'../datasets/{OUTPUT_NAME}_pkl/{OUTPUT_NAME}_item_factors_df.pkl')