In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
%load_ext autoreload
%autoreload 2
from collections import Counter
import random
from utils import load_model, save_checkpoint
from pyfm import pylibfm
from fastFM import als
from sklearn.feature_extraction import DictVectorizer
import time
from scipy.sparse import csr_matrix, coo_matrix

In [2]:
import implicit

## PreProcess

In [3]:
food_data = pd.read_csv("data/food.csv")
rating_data = pd.read_csv("data/rating_train.csv")
user_data = pd.read_csv("data/user.csv")
food_data.replace('-', np.nan, inplace=True)
rating_data[['year', 'month', 'day']] = rating_data['date'].str.split('-', expand=True)
rating_data['date'] = rating_data['date'].str.replace('-','')
user_ids = user_data['userid'].sort_values(axis=0).reset_index(drop=True).values.tolist()
user_data.fillna(0, inplace=True)
user_data['age'] = user_data['age'].apply(lambda x: x//5)
user_data['age'] = user_data['age'].apply(lambda x: x/7)
user_data['mapped_userid'] = user_data['userid'].apply(lambda x: user_ids.index(x))

food_data.drop(['annotated_food_name'], axis=1, inplace=True)
for h in list(food_data):
    if h.endswith('cat'):
        food_data[h][food_data[h].notnull()] = 1
food_data.fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [4]:
rating_data['mapped_userid'] = rating_data['userid'].apply(lambda x: user_ids.index(x))

In [9]:
#v = DictVectorizer()
user_features = user_data.drop(['userid', 'username', 'location', 'city', 'state', 'title', 'about_me', 'reasons', 'inspirations', 'friends_count'], axis=1).sort_values(by='mapped_userid').values
user_feature_list = ['age', 'gender', 'mapped_userid']
user_features[:,2] = user_features[:,2].astype(str)
user_features = [{k:v for k, v in zip(user_feature_list, u)} for u in user_features]
#print(v.fit_transform(user_features).shape)
#transformed_user_features = v.fit_transform(transformed_user_features).toarray()

#food_features = food_data.drop(list(food_data.filter(regex=".*_q")), axis=1)
#food_features = food_features.drop(list(food_data.filter(regex=".*cat")), axis = 1)
#food_features = food_features.drop(list(food_data.filter(regex=".*_unit")), axis=1)
#food_features = food_features.drop(['foodid'], axis=1)
food_features = food_data['foodid']
food_feature_list = list(food_features)
food_features = food_features.values.astype(str)
food_features = [{"foodid": f} for f in food_features]
#print(v.fit_transform(food_features).shape)
#food_features = [{k:v for k, v in zip(food_feature_list, f)} for f in food_features]
#transformed_food_features = v.fit_transform(transformed_food_features).toarray()

data_cnt = {}
for i in user_ids:
    data_cnt[i] = len(rating_data[rating_data['userid'] == i])

#plt.hist(list(data_cnt.values()), bins=20)
#list(data_cnt.values())

In [5]:
user_food_score = [{} for i in range(len(user_ids))]
for u, f in zip(rating_data['mapped_userid'], rating_data['foodid']):
    if user_food_score[u].get(f) != None:
        user_food_score[u][f] += 1
    else:
        user_food_score[u][f] = 1
    
for i in range(len(user_ids)):
    all_val = list(user_food_score[i].values())
    for k, v in user_food_score[i].items():
        user_food_score[i][k] = (v - min(all_val)) / (max(all_val) - min(all_val))

In [6]:
def create_dataset(rating_data, valid_p, use_features=False):
    user_id_train = []
    user_id_valid = []
    food_id_train = []
    food_id_valid = []
    food_feature_train = []
    food_feature_valid = []
    user_feature_train = []
    user_feature_valid = []
    Y_train = []
    Y_valid = []
    np.sort(user_ids)
    for u in range(len(user_ids)):
        current_user = rating_data[rating_data['mapped_userid'] == u]
        cnt = int(len(current_user) * (1 - valid_p))
        user_id_train.extend(current_user['mapped_userid'].iloc[:cnt])
        food_id_train.extend(current_user['foodid'].iloc[:cnt])
        user_id_valid.extend(current_user['mapped_userid'].iloc[cnt:])
        food_id_valid.extend(current_user['foodid'].iloc[cnt:])
        for i in range(len(current_user)):
            if i < cnt:
                Y_train.append(user_food_score[u][current_user['foodid'].iloc[i]])
            else:
                Y_valid.append(user_food_score[u][current_user['foodid'].iloc[i]])
  
    if use_features:
        for f in food_id_train:
            food_feature_train.append(food_features[f])
        for f in food_id_valid:
            food_feature_valid.append(food_features[f])
        for u in user_id_train:
            user_feature_train.append(user_features[u])
        for u in user_id_valid:
            user_feature_valid.append(user_features[u])
    
        return  np.array(user_id_train), np.array(user_id_valid), \
                np.array(food_id_train), np.array(food_id_valid), \
                np.array(food_feature_train), np.array(food_feature_valid), \
                np.array(user_feature_train), np.array(user_feature_valid), \
                np.array(Y_train), np.array(Y_valid)
    else:
        return  np.array(user_id_train), np.array(user_id_valid), \
                np.array(food_id_train), np.array(food_id_valid), \
                np.array(Y_train), np.array(Y_valid)

In [134]:
valid_percentage = 0.0
#user_id_train, user_id_valid, food_id_train, food_id_valid, food_feature_train, food_feature_valid, user_feature_train, user_feature_valid, Y_train, Y_valid = create_dataset(rating_data, valid_percentage, True)
user_id_train, user_id_valid, food_id_train, food_id_valid, Y_train, Y_valid = create_dataset(rating_data, valid_percentage, False)

In [136]:
user_id_train.shape

(2681494,)

In [149]:
212 in list(rating_data[(rating_data.mapped_userid == 0)].foodid)

False

In [137]:
train_data = np.zeros((rating_data.foodid.max()+1, rating_data.mapped_userid.max()+1))
for i, j in zip(food_id_train, user_id_train):
    train_data[i][j] = 1

In [138]:
train_sp = csr_matrix(train_data)

In [143]:
not_eaten = {}
eaten = {}
for u in range(len(user_ids)):
    t_start_idx = np.where(user_id_train == u)[0][0]
    t_end_idx = np.where(user_id_train == u)[0][-1]
    v_start_idx = np.where(user_id_valid == u)[0][0]
    v_end_idx = np.where(user_id_valid == u)[0][-1]
    not_eaten[u] = np.setdiff1d(food_id_valid[v_start_idx: v_end_idx+1], food_id_train[t_start_idx: t_end_idx+1])
    eaten[u] = set(food_id_train[t_start_idx: t_end_idx+1])

IndexError: index 0 is out of bounds for axis 0 with size 0

In [106]:
def AveragePrecision(user_not_eaten, prediction, cnt):
    if len(user_not_eaten) == 0: return 0
    score = 0
    hit = 0
    for i, p in enumerate(prediction):
        if p in user_not_eaten:
            hit += 1
            score += hit / (i+1)
    score /= len(user_not_eaten)
    return score

In [155]:
from sklearn.metrics import mean_squared_error, r2_score
ap_valid = []
train_sp_T = train_sp.T
for it in [20]:
    start_time = time.time()
    model = implicit.als.AlternatingLeastSquares(factors=20, iterations=it, calculate_training_loss=True)
    model.fit(train_sp)
    all_score = []
    all_ans = []
    for u in range(len(user_ids)):
        #print(u)
        pred = []
        rec = [i[0] for i in model.recommend(u, train_sp_T, 2000, filter_already_liked_items=True)]
        filtered_rec = [i for i in rec if train_data[i][u] == 0]
        #print(rec)
        #print(filtered_rec)
        #print(rec)
        #print(not_eaten[u])
        #print(eaten[u])
        #print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
        #all_score.append(AveragePrecision(not_eaten[u], rec, 40))
        all_ans.append(filtered_rec[:20])
    
    #ap_valid.append(np.array(all_score).mean())
    
    with open("pred.csv", 'w') as f:
        f.write("userid,foodid\n")
        for i in range(len(user_ids)):
            f.write("{},".format(user_ids[i]))
            for j in all_ans[i]:
                f.write("{} ".format(j))
            f.write('\n')
    
    #print("iter {0}; ap: {1:.5f}; time: {2:.2f}".format(it, ap_valid[-1], time.time()-start_time))

100%|██████████| 20.0/20 [00:00<00:00, 81.57it/s, loss=0.0253]


In [154]:
np.array(all_ans).shape

(2608, 20)

In [146]:
train_data[0][212]

1.0

In [156]:
import time
#!kaggle competitions submit -c ntucsie-sdml2018-2-1 -f pred.csv -m "implicit 20"
time.sleep(5)
!kaggle competitions submissions ntucsie-sdml2018-2-1 | more

100%|████████████████████████████████████████| 206k/206k [00:09<00:00, 22.7kB/s]
Successfully submitted to NTU CSIE SDML: HW 2 - Task 1fileName               date                 description                        s
tatus    publicScore  privateScore  
---------------------  -------------------  ---------------------------------  -
-------  -----------  ------------  
pred.csv               2018-11-08 07:07:34  implicit 20                        c
omplete  0.04279      0.04375       
pred.csv               2018-11-08 04:13:36  implicit 100                       c
omplete  0.00031      0.00036       
submit.csv             2018-11-07 17:47:56  Neumf bpr                          c
omplete  0.04165      0.03986       
submit.csv             2018-11-07 15:57:44  gmf bpr                            c
omplete  0.04128      0.03977       
submit.csv             2018-11-07 15:15:17  gmf bpr                            c
omplete  0.04038      0.03920       
mDAE1.csv              2018-11-07 14:23

In [92]:
recommendations = model.recommend(800, train_sp, 8000)
np.array([i[0] for i in recommendations]).max()

5531

In [82]:
len(eaten[0])

158

## Inference

In [None]:
all_ans = []
for i, u in enumerate((user_ids)):
    ans = []
    rec = [i[0] for i in model.recommend(i, train_sp_T, 20)]
    all_score.append(AveragePrecision(not_eaten[u], ans, 20))
    all_ans.append(ans) 

In [None]:
np.array(all_score).max()

## Generate Output

In [None]:
with open("pred.csv", 'w') as f:
    f.write("userid,foodid\n")
    for i in range(len(user_ids)):
        f.write("{},".format(user_ids[i]))
        for j in all_ans[i]:
            f.write("{} ".format(j))
        f.write('\n')