In [1]:
# Connect to DB

In [2]:
import os
import sqlite3

import haversine as hs
import numpy as np
import pandas as pd
from surprise import accuracy
from surprise.model_selection.validation import cross_validate
from surprise.dataset import Dataset
from surprise.reader import Reader
from surprise import SVD
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise.model_selection import cross_validate
from surprise import NMF
from surprise.model_selection import train_test_split

In [3]:
# Open the SQLite database that holds the ratings, users and venues tables.
# The location can be overridden with the FSDATA_DB_PATH environment variable
# so the notebook is not tied to one machine; when the variable is unset the
# original path is used.
conn = sqlite3.connect(
    os.environ.get('FSDATA_DB_PATH', '/home/vikash/scb-test/fsdata.db/fsdata.db')
)

# For the top 10 users (those with the most check-ins) build:
## Problem 1: a basket of recommendations (venues/places)

In [4]:
#For this problem we apply collaborative filtering to learn the ratings for venues.
#The learned ratings are then used for recommendation.
#A matrix factorization approach is used to learn the ratings.

In [5]:
# Get all ratings of venues

In [6]:
# Load every (user_id, venue_id, rating) row of the ratings table into a DataFrame.
venue_ratings = pd.read_sql_query(
    sql="select user_id,venue_id,rating from ratings",
    con=conn,
)

In [7]:
# Preview the loaded ratings; the recorded output shows only the first and last rows.
venue_ratings

Unnamed: 0,user_id,venue_id,rating
0,1,1,5
1,1,51,4
2,1,51,2
3,1,51,5
4,1,52,5
...,...,...,...
2809575,2153498,91385,2
2809576,2153499,783,2
2809577,2153500,91385,2
2809578,2153501,68691,2


In [8]:
#min rating
# Series.min() is vectorised (the builtin min() walks every row in Python) and
# skips missing values instead of giving an order-dependent result.
min_rating = venue_ratings['rating'].min()

In [9]:
#max rating
# Series.max() is vectorised (the builtin max() walks every row in Python) and
# skips missing values instead of giving an order-dependent result.
max_rating = venue_ratings['rating'].max()

In [10]:
# Count rating rows (check-ins) per user and order users from most to least active.
checkins = (
    venue_ratings
    .groupby(['user_id'])
    .size()
    .reset_index(name='counts')
    .sort_values(by='counts', ascending=False)
)

In [11]:
# Ids of the ten users with the most check-ins (checkins is already sorted descending).
top_10_checkins = list(checkins.iloc[:10]['user_id'])

In [13]:
# Show the ten selected user ids, ordered by descending check-in count.
top_10_checkins

[30200, 54953, 103224, 281, 4442, 79082, 41460, 56474, 61219, 46040]

In [12]:
# Render the selected user ids as a comma-separated string (used in a SQL IN clause).
string_ints = list(map(str, top_10_checkins))
str_of_ints = ",".join(string_ints)

In [140]:
# Fetch the stored latitude/longitude of each selected user.
user_locations = pd.read_sql_query(
    sql="select id,latitude,longitude from users where id IN ({})".format(str_of_ints),
    con=conn,
)

In [171]:
# Index the user locations by user id so rows can be fetched with .loc[user_id].
# Guarded so that re-running this cell is a no-op instead of raising KeyError
# once 'id' has already been moved into the index.
if 'id' in user_locations.columns:
    user_locations = user_locations.set_index('id')

In [159]:
# Fetch the latitude/longitude of every venue.
venue_locations = pd.read_sql_query(
    sql="select id,latitude,longitude from venues",
    con=conn,
)

In [185]:
# Per-user geofence radius: the largest haversine distance between the user's
# stored location and any venue that user has rated.
# Distances are in whatever unit hs.haversine returns (kilometres by default
# according to the haversine package documentation).

# Look the tables up by id locally, so this cell works whether or not the
# set_index('id') cells have already been run.
_user_loc_by_id = (
    user_locations.set_index('id') if 'id' in user_locations.columns else user_locations
)
_venue_loc_by_id = (
    venue_locations.set_index('id') if 'id' in venue_locations.columns else venue_locations
)

distance_threshold = dict()
for user in top_10_checkins:
    user_row = _user_loc_by_id.loc[user]
    lat_u = user_row.latitude
    lng_u = user_row.longitude

    # Venues rated by *this* user (previously hard-coded to user 281, which gave
    # every user the same venue list). unique() avoids recomputing the distance
    # for venues the user rated more than once.
    venues = venue_ratings.loc[venue_ratings['user_id'] == user, 'venue_id'].unique()

    max_dist = 0
    for v in venues:
        venue_row = _venue_loc_by_id.loc[v]
        dist = hs.haversine((lat_u, lng_u), (venue_row.latitude, venue_row.longitude))
        if dist > max_dist:
            max_dist = dist

    # NOTE: a radius of 0 is interpreted by get_top_n_geofenced as "no geofencing".
    distance_threshold[user] = max_dist

In [177]:
# Display the ratings table again; the recorded output matches the earlier preview.
venue_ratings

Unnamed: 0,user_id,venue_id,rating
0,1,1,5
1,1,51,4
2,1,51,2
3,1,51,5
4,1,52,5
...,...,...,...
2809575,2153498,91385,2
2809576,2153499,783,2
2809577,2153500,91385,2
2809578,2153501,68691,2


In [172]:
# Index the venue locations by venue id so rows can be fetched with .loc[venue_id].
# Guarded so that re-running this cell is a no-op instead of raising KeyError
# once 'id' has already been moved into the index.
if 'id' in venue_locations.columns:
    venue_locations = venue_locations.set_index('id')

In [24]:
# Surprise reader configured with the rating range observed in the data.
reader = Reader(
    rating_scale=(min_rating, max_rating),
)

In [25]:
# Build the Surprise dataset from the user_id, venue_id and rating columns (in that order).
data = Dataset.load_from_df(
    venue_ratings.loc[:, ['user_id', 'venue_id', 'rating']],
    reader,
)

In [30]:
# Find the best model: cross-validate SVD and NMF on the same data and compare RMSE/MAE.
# NOTE(review): no random seed is set in this cell, so fold splits and scores
# may differ between runs - confirm whether reproducibility matters here.
# SVD, constructed with the library's default arguments
algo_svd = SVD()

# Run 5-fold cross-validation and print the per-fold results (verbose=True).
# The returned metrics dict is not stored.
cross_validate(algo_svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# NMF, constructed with the library's default arguments
algo_nmf = NMF()
# Same 5-fold protocol. As the last expression of the cell, this call's result
# dict is what the notebook displays below the printed tables.
cross_validate(algo_nmf, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.7160  0.7145  0.7149  0.7144  0.7148  0.7149  0.0006  
MAE (testset)     0.5463  0.5454  0.5455  0.5449  0.5453  0.5455  0.0005  
Fit time          97.78   99.42   99.52   100.34  99.46   99.30   0.83    
Test time         3.87    3.80    3.75    3.75    4.18    3.87    0.16    
Evaluating RMSE, MAE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0928  1.0936  1.0937  1.0938  1.0931  1.0934  0.0004  
MAE (testset)     0.8799  0.8804  0.8813  0.8807  0.8800  0.8805  0.0005  
Fit time          222.69  217.06  216.92  218.45  218.61  218.74  2.09    
Test time         7.13    3.70    7.18    3.54    6.94    5.70    1.70    


{'test_rmse': array([1.092752  , 1.0936046 , 1.09369929, 1.09375522, 1.09308604]),
 'test_mae': array([0.87988164, 0.88037258, 0.88132809, 0.88074617, 0.88001901]),
 'fit_time': (222.68828773498535,
  217.0562241077423,
  216.91613388061523,
  218.4487874507904,
  218.61452746391296),
 'test_time': (7.130717515945435,
  3.70239520072937,
  7.175108909606934,
  3.535029649734497,
  6.942584037780762)}

In [31]:
#SVD has lower RMSE so let's choose SVD

In [34]:
# Train the chosen SVD model on every available rating (no hold-out split).
model = algo_svd.fit(trainset=data.build_full_trainset())

In [208]:
from collections import defaultdict
#get top n predictions per user, optionally geo-fenced
def _top_n_per_user(candidates, n):
    """Keep, for every user, the ``n`` (item id, estimate) pairs with the highest estimate."""
    top_n = defaultdict(list)
    for uid, user_ratings in candidates.items():
        # sorted() is stable, so ties keep the order in which predictions arrived.
        top_n[uid] = sorted(user_ratings, key=lambda pair: pair[1], reverse=True)[:n]
    return top_n


def get_top_n_geofenced(predictions, n=10,d_threshold=0.0,venue_locations=None,user_locations=None):
    """Return the top-``n`` recommendations per user, optionally geo-fenced.

    Parameters
    ----------
    predictions : iterable of 5-tuples ``(uid, iid, true_r, est, details)``
        Only ``uid``, ``iid`` and ``est`` are used.
    n : int
        Maximum number of recommendations kept per user.
    d_threshold : float or None
        Maximum allowed distance between the user and a venue, in the unit
        returned by ``hs.haversine``. ``0`` / ``0.0`` / ``None`` disables
        geo-fencing.
    venue_locations, user_locations : DataFrame-like
        Looked up with ``.loc[id]``; each row must expose ``latitude`` and
        ``longitude``. Required only when geo-fencing is enabled.

    Returns
    -------
    defaultdict(list)
        Maps each user id to a list of ``(iid, est)`` pairs sorted by ``est``
        in descending order. Users with no surviving candidate are absent.

    Raises
    ------
    ValueError
        If geo-fencing is requested but a location table is missing.
    """
    candidates = defaultdict(list)

    if not d_threshold:
        # No geo-fencing required: every prediction is a candidate.
        for uid, iid, _true_r, est, _details in predictions:
            candidates[uid].append((iid, est))
        return _top_n_per_user(candidates, n)

    if venue_locations is None or user_locations is None:
        raise ValueError(
            "venue_locations and user_locations are required when d_threshold is non-zero"
        )

    # Threshold-based prediction: keep only venues within d_threshold of the user.
    user_coords = {}  # uid -> (lat, lng), looked up once per user instead of per prediction
    for uid, iid, _true_r, est, _details in predictions:
        if uid not in user_coords:
            user_row = user_locations.loc[uid]
            user_coords[uid] = (user_row.latitude, user_row.longitude)

        venue_row = venue_locations.loc[iid]
        dist = hs.haversine((venue_row.latitude, venue_row.longitude), user_coords[uid])

        if dist <= d_threshold:
            candidates[uid].append((iid, est))

    return _top_n_per_user(candidates, n)

In [201]:
# Candidate pool for recommendations: every distinct venue id that has at least one rating.
all_venues = pd.unique(venue_ratings['venue_id'])

In [212]:
# For each selected user, score every venue the user has not rated yet and
# keep the ten best estimates that fall inside that user's distance threshold.
predictions_list = []
for user_id in top_10_checkins:
    belongs_to_user = venue_ratings['user_id'] == user_id
    rated_venues = venue_ratings.loc[belongs_to_user, 'venue_id'].unique()
    unrated_venues = np.setdiff1d(all_venues, rated_venues)

    # The third field is a placeholder rating passed along with each pair.
    test_set = [[user_id, venue_id, 2.] for venue_id in unrated_venues]
    predictions = model.test(test_set)

    user_top_n = get_top_n_geofenced(
        predictions,
        10,
        distance_threshold[user_id],
        venue_locations,
        user_locations,
    )
    predictions_list.append(user_top_n)