In [45]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import numpy as np
import pandas as pd

# Read the three pipe-delimited source tables (ratings, users, venues).
ratingsdf = pd.read_csv("data/rating.csv", delimiter='|')
userdf = pd.read_csv("data/user.csv", delimiter='|')
venuedf = pd.read_csv("data/venue.csv", delimiter='|')

# Preview every rating row recorded for venue 5.
ratingsdf[ratingsdf['venueid'].eq(5)]

Unnamed: 0,venueid,userid,score,time,comment
2379,5,2161,4.0,0.2,น้ำตก น้ำใสไหลเย็น เหมาะกับการท่องเที่ยวเชิงธร...
2380,5,2162,4.0,0.8,จอดรถเดินไป 2 นาทีถึง แต่ชั้นบนไม่ได้เดินขึ้นไ...
2381,5,277,5.0,1.0,น้ำตกสวยงามมาก ถ้าจะมาชมความงามแต่ไท่เล่นน้ำ ม...
2382,5,329,3.0,2.0,26 สิงหาคม น้ำใหลแรงครับสวยมากๆแต่ถ้าช่วงไหนฝน...
2383,5,1,3.0,3.0,เป็นธารน้ำตกเล็กๆไหลลงมาจากภูเขา แนะนำให้มาช่ว...
...,...,...,...,...,...
2484,5,2009,4.0,4.0,
2485,5,2232,5.0,5.0,
2486,5,2233,4.0,2.0,
2487,5,2011,5.0,5.0,


In [39]:
# Headline counts for the ratings table.
n_ratings = len(ratingsdf)
n_venues = ratingsdf['venueid'].nunique()
n_users = ratingsdf['userid'].nunique()

print(f"Number of ratings: {n_ratings}")
print(f"Number of unique venue's: {n_venues} / {len(venuedf)}")
print(f"Number of unique users: {n_users} / {len(userdf)}")
print(f"Average ratings per user: {round(n_ratings/n_users, 2)}")
print(f"Average ratings per venue: {round(n_ratings/n_venues, 2)}")

# Mean score per venue, then the venue IDs at both extremes.
mean_rating = ratingsdf.groupby('venueid')[['score']].mean()
lowest_rated = mean_rating['score'].idxmin()
highest_rated = mean_rating['score'].idxmax()

# Jupyter only renders the LAST bare expression of a cell, so intermediate
# lookups must be passed to display() explicitly or they are silently
# discarded. `display` is provided as a builtin by the IPython kernel.
# Lowest rated venue
display(venuedf.loc[venuedf['venueid'] == lowest_rated])
# Highest rated venue
display(venuedf.loc[venuedf['venueid'] == highest_rated])
# Every rating row for the highest rated venue (its row count is the number of ratings)
display(ratingsdf[ratingsdf['venueid'] == highest_rated])
# Every rating row for the lowest rated venue
display(ratingsdf[ratingsdf['venueid'] == lowest_rated])

# Author's note: the venue above has very few ratings, so a Bayesian average
# will be used. Collect the per-venue rating count and mean score it needs;
# selecting the single 'score' column yields flat 'count' / 'mean' columns
# directly, with no MultiIndex level to drop.
venue_stats = ratingsdf.groupby('venueid')['score'].agg(['count', 'mean'])

Number of ratings: 9483
Number of unique venue's: 18 / 18
Number of unique users: 7313 / 7313
Average ratings per user: 1.3
Average ratings per venue: 526.83


In [19]:
# Number of neighbours to report for the target venue and the target user.
k = 10

# Sorted unique IDs, computed once; an ID's position in these arrays is its
# row/column index in the sparse matrices built below.
unique_user_ids = np.unique(ratingsdf["userid"])
unique_venue_ids = np.unique(ratingsdf["venueid"])
N = len(unique_user_ids)
M = len(unique_venue_ids)

# Map IDs to indices
user_mapper = {uid: idx for idx, uid in enumerate(unique_user_ids)}
venue_mapper = {vid: idx for idx, vid in enumerate(unique_venue_ids)}

# Map indices to IDs
user_inv_mapper = dict(enumerate(unique_user_ids))
venue_inv_mapper = dict(enumerate(unique_venue_ids))

# Matrix coordinates of every rating row.
user_index = [user_mapper[i] for i in ratingsdf['userid']]
venue_index = [venue_mapper[i] for i in ratingsdf['venueid']]

# Sparse score matrices in both orientations: venues x users and users x venues.
# NOTE(review): csr_matrix sums duplicate (row, col) entries, so a user who
# rated the same venue more than once contributes the SUM of those scores.
# Confirm whether ratingsdf can contain such duplicates.
venueMatrix = csr_matrix((ratingsdf["score"], (venue_index, user_index)), shape=(M, N))
userMatrix = csr_matrix((ratingsdf['score'], (user_index, venue_index)), shape=(N, M))

# target venue
venues_name = dict(zip(venuedf['venueid'], venuedf['name']))
venue_id = 3

# target user
users_name = dict(zip(userdf['userid'], userdf['name']))
user_id = 3

# Raises KeyError when the target has no rows in ratingsdf.
venue_ind = venue_mapper[venue_id]
user_ind = user_mapper[user_id]

# Target rows as (1, n_features) query vectors.
venue_vec = venueMatrix[venue_ind].reshape(1, -1)
user_vec = userMatrix[user_ind].reshape(1, -1)

# The target row is part of the fitted data, so ask for one extra neighbour to
# leave room for it. k is reset at the top of this cell, so re-running the
# cell does not keep growing it.
k += 1

# Brute-force cosine k-NN over each matrix. n_neighbors is clamped to the row
# count because asking for more neighbours than fitted samples is an error.
kNN_v = NearestNeighbors(n_neighbors=min(k, M), algorithm="brute", metric='cosine')
kNN_v.fit(venueMatrix)

kNN_u = NearestNeighbors(n_neighbors=min(k, N), algorithm="brute", metric='cosine')
kNN_u.fit(userMatrix)

neighbour_of_venue = kNN_v.kneighbors(venue_vec, return_distance=False)
neighbour_of_user = kNN_u.kneighbors(user_vec, return_distance=False)

# Drop the target by its own index instead of assuming it is the first hit:
# rows with the same direction as the target tie at cosine distance 0, so the
# target may appear at any position among them (or be pushed out entirely).
# The slice keeps at most the originally requested number of neighbours.
neighbour_venue_ids = [
    venue_inv_mapper[n]
    for n in neighbour_of_venue.ravel().tolist()
    if n != venue_ind
][:k - 1]

neighbour_user_ids = [
    user_inv_mapper[n]
    for n in neighbour_of_user.ravel().tolist()
    if n != user_ind
][:k - 1]

# print
separator = "------------------------------------------------------------------------"

similar_ids = neighbour_venue_ids
venue_name = venues_name[venue_id]
# Report how many neighbours were actually found (equals k-1 unless clamped).
print(f"{len(similar_ids)} nearlest of venue {venue_name}")
for i in similar_ids:
    print(str(i) + " " + venues_name[i])
print(separator)

similar_ids = neighbour_user_ids
user_name = users_name[user_id]
print(f"{len(similar_ids)} nearlest of user {user_name}")
for i in similar_ids:
    print(str(i) + " " + users_name[i])
print(separator)


10 nearlest of venue ถนนคนเดิน หนองคาย
14 วัดโพธิ์ชัย
15 ศาลาแก้วกู่@หนองคาย
12 วัดศรีชมภูองค์ตื้อ
11 วัดพระธาตุบังพวน
18 อนุสาวรีย์ปราบฮ่อ
16 สกายวอร์ค วัดผาตากเสื้อ
10 วัดถ้ำศรีมงคล
7 พิพิธภัณฑ์สัตว์น้ำจังหวัดหนองคาย
17 สะพานมิตรภาพไทย-ลาว แห่งที่ 1
2 ตลาดอินโดจีน
------------------------------------------------------------------------
10 nearlest of user Tuk Phuengthai
608 Pichit Raetai
613 Pat T
612 Pradit Mettaworakun
611 Rampai Ketjirachot
610 32 พรชรินทร์ ไพราม
615 รัตนา นุ่มคง
604 Prapas Sopa
606 JITTI SOMMANA
616 Somchok Plammanas
605 Pakinee Wisasawon
------------------------------------------------------------------------
