In [None]:
import pandas as pd
from surprise import Dataset, Reader

reader = Reader(rating_scale=(1, 5))
df = pd.read_json("/home/ventus/university/data-mining/labs/data/yelp_dataset/yelp_academic_dataset_review.json", lines=True, nrows=10000)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15


## a) Simple Collaborative Filtering Algorithms

In [33]:
from surprise import SVD
from surprise.model_selection import train_test_split
from surprise import accuracy


data1 = Dataset.load_from_df(df[['user_id', 'business_id', 'stars']], reader)
trainset, testset = train_test_split(data1, test_size=0.2)
model = SVD()
model.fit(trainset)
predictions = model.test(testset)
rmse = accuracy.rmse(predictions)
print(f"RMSE: {rmse}")


RMSE: 1.3156
RMSE: 1.3155557782977223


## b) Clustering

In [34]:
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np

# Load business dataset
df_business = pd.read_json("/home/ventus/university/data-mining/labs/data/yelp_dataset/yelp_academic_dataset_business.json", lines=True)

# Drop businesses without categories
df_business = df_business[df_business['categories'].notna()]

# Preprocess categories
df_business['categories'] = df_business['categories'].apply(lambda x: [i.strip() for i in x.split(',')])
category_features = MultiLabelBinarizer().fit_transform(df_business['categories'])

# Normalize numeric features
numeric_features = df_business[['stars', 'review_count']].fillna(0)
numeric_scaled = StandardScaler().fit_transform(numeric_features)

# Combine features
X = np.hstack((numeric_scaled, category_features))

n_clusters = 20
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
df_business['cluster'] = kmeans.fit_predict(X)
df_clustered = df.merge(df_business[['business_id', 'cluster']], on='business_id', how='left')
df_clustered

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,cluster
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11,15.0
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18,19.0
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30,11.0
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03,11.0
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15,1.0
...,...,...,...,...,...,...,...,...,...,...
9995,ZcBtCA9jGhLfakf1jJ2BAg,yab1cq5yzrTHzoyz8LYqYQ,1-z7wd860Rii4kbEMCT8DA,5,0,0,0,Excellent food and service. The place is funct...,2018-06-26 17:41:31,7.0
9996,UIkEO-10J6Y99IhRqUflvg,lYAmgL_l7A3MPFYe1DYKrw,EpREWeEpmR8f1qLHzzF0AA,5,0,1,0,Just about to get tucked into a meatloaf that ...,2018-01-09 20:26:13,4.0
9997,S-NQM3Axcg8JS3MXHUIvyw,rE2WwfgJbYfvDwBlgq__dQ,dvidzWEPgTQPeBc8CUV2OQ,5,0,0,0,Outstanding customer service! And my car is dr...,2015-04-01 21:50:28,14.0
9998,ME79YrEhm2xe4IQy_0zkGw,OnIklvzKDpk1BduC84TrTA,2XYPFRm7teCUr3eGsB2-qw,5,0,0,0,I and my husband went here for Dinner one day ...,2015-06-08 19:32:26,11.0


In [35]:
data2 = Dataset.load_from_df(df_clustered[['user_id', 'cluster', 'stars']], reader)
trainset, testset = train_test_split(data2, test_size=0.2)

model = SVD()
model.fit(trainset)
predictions = model.test(testset)
rmse = accuracy.rmse(predictions)
print(f"RMSE: {rmse}")


RMSE: 1.2756
RMSE: 1.2755596581937316


## c)

In [36]:
# df_with_clusters contains user_id, business_id, stars, cluster
user_cluster_means = df_clustered.groupby(['user_id', 'cluster'])['stars'].mean().reset_index()
user_cluster_means.rename(columns={'stars': 'mu_uc'}, inplace=True)


In [37]:
user_cluster_means

Unnamed: 0,user_id,cluster,mu_uc
0,--4AjktZiHowEIBCMd4CZA,5.0,4.0
1,--_r6E98SNIrGU7weyNxbw,17.0,5.0
2,--pvE2eu3WWwikKs1E2QDw,4.0,5.0
3,--vCeHrklS1DIep0QhorrA,4.0,4.0
4,-0KrCHEsOcjJ6N4k_k1A9A,1.0,4.0
...,...,...,...
9877,zzT0pSbiaNAPL171kwnvjA,0.0,1.0
9878,zzZPlDQdc1-NDP6tJxc25Q,10.0,1.0
9879,zzeRzizkihWHz9bVAvbcVw,4.0,4.0
9880,zzgMuJgxmToqcJ5iu1TngQ,1.0,4.0


In [45]:
df_normalized = df_clustered.merge(user_cluster_means, on=['user_id', 'cluster'], how='left')
df_normalized['normalized_rating'] = df_normalized['stars'] - df_normalized['mu_uc']
sum(df_normalized['normalized_rating'] != 0)

162

In [46]:
data3 = Dataset.load_from_df(df_normalized[['user_id', 'business_id', 'normalized_rating']], reader)
trainset, testset = train_test_split(data2, test_size=0.2)

model = SVD()
model.fit(trainset)
predictions = model.test(testset)
rmse = accuracy.rmse(predictions)
print(f"RMSE: {rmse}")

RMSE: 1.2746
RMSE: 1.2745713815946065
