In [2]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

ml_100k_data = Dataset.load_builtin('ml-100k')

svd_algorithm = SVD()

cross_validate(svd_algorithm, ml_100k_data, measures=['RMSE'], cv=5, verbose=True)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9349  0.9352  0.9423  0.9330  0.9361  0.9363  0.0032  
Fit time          4.88    4.93    4.60    4.48    4.95    4.77    0.19    
Test time         0.18    0.19    0.14    0.18    0.18    0.17    0.02    


{'fit_time': (4.8756654262542725,
  4.930354595184326,
  4.600804805755615,
  4.479751348495483,
  4.951127529144287),
 'test_rmse': array([0.93486585, 0.93523759, 0.94228241, 0.93301285, 0.93613072]),
 'test_time': (0.1844778060913086,
  0.18764519691467285,
  0.13855600357055664,
  0.17955732345581055,
  0.182830810546875)}

In [1]:
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split

ml_100k_data = Dataset.load_builtin('ml-100k')

svd_algorithm = SVD()

train_set, test_set = train_test_split(ml_100k_data, test_size=.2)

svd_algorithm.fit(train_set)
predict = svd_algorithm.test(test_set)

accuracy.rmse(predict)

RMSE: 0.9418


0.941844220412939

In [24]:
import json
import os
import itertools

tracker_files_folder = 'tracker'
offers_file_names = ['observed-offers.json', 'reported-offers.json', 'viewed-offers.json']

all_offers_list = list(itertools.chain.from_iterable(
    [json.load(open(os.path.join(tracker_files_folder, file_name))) for file_name in offers_file_names]))
len(all_offers_list)

302120

In [25]:
unique_offers_set = {offer['what'] for offer in all_offers_list}
len(unique_offers_set)

90004

In [26]:
from bidict import bidict

unique_offers_bi_map = bidict({(index + 1): offer_name for index, offer_name in enumerate(unique_offers_set)})
len(unique_offers_bi_map)

90004

In [64]:
unique_offers_bi_map.inv['2016_1_17440_bzp']

22630

In [65]:
unique_offers_bi_map[22630]

'2016_1_17440_bzp'

In [7]:
import pandas as pd
# import dateutil.parser

offer_type_score_map = {
    'observed-offer': 5.0,
    'reported-offer': 2.0,
    'viewed-offer': 3.0
}

for offer in all_offers_list:
    offer['what'] = unique_offers_bi_map.inv[offer['what']]
    offer['type'] = offer_type_score_map[offer['type']]
    # offer['when'] = dateutil.parser.parse(offer['when']).timestamp()

complete_data_frame = pd.DataFrame(all_offers_list)
complete_data_frame

Unnamed: 0,score,type,what,when,who
0,,5.0,77787,2016-01-27T11:20:18.849+01:00,125
1,,5.0,85617,2016-02-08T10:14:55.426+01:00,122
2,,5.0,60482,2016-02-08T13:21:19.677+01:00,122
3,,5.0,24357,2016-09-02T09:56:19.358+02:00,248
4,,5.0,65912,2016-04-19T14:10:08.538+02:00,187
5,,5.0,6040,2016-02-09T15:30:15.257+01:00,143
6,,5.0,38629,2016-02-11T14:45:02.666+01:00,100
7,,5.0,41342,2016-02-12T12:56:07.175+01:00,149
8,,5.0,25680,2016-02-12T12:56:08.950+01:00,149
9,,5.0,40743,2015-11-04T09:19:44.950+01:00,9


In [75]:
from surprise import Dataset
from surprise import Reader

reader = Reader(rating_scale=(1, 5))

prepared_data = Dataset.load_from_df(complete_data_frame[['user_id', 'offer_id', 'score']], reader)
prepared_data

<surprise.dataset.DatasetAutoFolds at 0x230cd4c10f0>

In [76]:
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import train_test_split

train_set, test_set = train_test_split(prepared_data, test_size=.2)

svd_algorithm = SVD()

svd_algorithm.fit(train_set)
predict = svd_algorithm.test(test_set)

accuracy.rmse(predict)

RMSE: 0.1157


0.11566891819307137

In [77]:
svd_algorithm.predict(122, unique_offers_bi_map.inv['2016_1_12439_bzp'], verbose=True).est

user: 122        item: 85617      r_ui = None   est = 3.61   {'was_impossible': False}


In [81]:
user_id = 122
offers_for_user = []

for offer_id, offer_name in unique_offers_bi_map.items():
    estimation = svd_algorithm.predict(user_id, offer_id).est
    offers_for_user.append((offer_name, estimation))

offers_for_user.sort(key=lambda pair: pair[1], reverse=True)
offers_for_user[:10]

[('2016_S_036-057826_ted', 3.7368197780749157),
 ('2016_1_36066_bzp', 3.6933891786812634),
 ('2013_1_155005_bzp', 3.6793441603770787),
 ('2016_S_025-039860_ted', 3.6692002960856387),
 ('2016_1_17594_bzp', 3.639478975947134),
 ('2013_1_156567_bzp', 3.634500645817603),
 ('bzp-2017-n-619782', 3.6237535024642344),
 ('2015_1_286876_bzp', 3.6175929410492316),
 ('2016_1_12439_bzp', 3.6139082549001458),
 ('2015_1_18223_bzp', 3.592856808740139)]

In [23]:
l1 = json.load(open('tracker/observed-offers.json'))
l2 = json.load(open('tracker/viewed-offers.json'))
test_lists = [l1, l2]

for lst in test_lists:
    for item in lst:
        if item['who'] == 122 and item['what'] == '2016_1_12439_bzp':
            print(item)

# there may be a problem when interaction of the same user 
# with the same offer is in 2 different files

{'type': 'observed-offer', 'who': 122, 'what': '2016_1_12439_bzp', 'when': '2016-02-08T10:14:55.426+01:00'}
{'type': 'viewed-offer', 'who': 122, 'what': '2016_1_12439_bzp', 'when': '2016-02-29T08:24:14.296+01:00'}


In [40]:
test_map = {}
test_map[(122, 1)] = 3.0
test_map[(1, 5)] = 4.1

print(test_map)
print((123, 1) in test_map)

map_key = (125, 8)
for map_key in [(125, 8), (122, 1)]:
    if map_key not in test_map or test_map[map_key] < 4.0:
        test_map[map_key] = 4.0

for (x, y), z in test_map.items():
    print(x, " ", y, " ", z)

{(122, 1): 3.0, (1, 5): 4.1}
False
122   1   4.0
1   5   4.1
125   8   4.0


In [73]:
import pandas as pd

offer_type_score_map = {
    'observed-offer': 5.0,
    'reported-offer': 2.0,
    'viewed-offer': 3.0
}

unique_user_offer_map = {}

for offer in all_offers_list:
    user_id = offer['who']
    offer_id = unique_offers_bi_map.inv[offer['what']]
    score = offer_type_score_map[offer['type']]
    
    map_key = (user_id, offer_id)
    if map_key not in unique_user_offer_map or unique_user_offer_map[map_key] < score:
        unique_user_offer_map[map_key] = score
        
prepared_offers_list = [{'user_id': user_id, 'offer_id': offer_id, 'score': score} for (user_id, offer_id), score in unique_user_offer_map.items()]
complete_data_frame = pd.DataFrame(prepared_offers_list)
complete_data_frame

Unnamed: 0,offer_id,score,user_id
0,77787,5.0,125
1,85617,5.0,122
2,60482,5.0,122
3,24357,5.0,248
4,65912,5.0,187
5,6040,5.0,143
6,38629,5.0,100
7,41342,5.0,149
8,25680,5.0,149
9,40743,5.0,9
