In [5]:
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [6]:
df = pd.read_csv('data/tripadvisor_review.csv')
df.head()

Unnamed: 0,User ID,Category 1,Category 2,Category 3,Category 4,Category 5,Category 6,Category 7,Category 8,Category 9,Category 10
0,User 1,0.93,1.8,2.29,0.62,0.8,2.42,3.19,2.79,1.82,2.42
1,User 2,1.02,2.2,2.66,0.64,1.42,3.18,3.21,2.63,1.86,2.32
2,User 3,1.22,0.8,0.54,0.53,0.24,1.54,3.18,2.8,1.31,2.5
3,User 4,0.45,1.8,0.29,0.57,0.46,1.52,3.18,2.96,1.57,2.86
4,User 5,0.51,1.2,1.18,0.57,1.54,2.02,3.18,2.78,1.18,2.54


In [7]:
# rename columns as their short names
cols_dict = {
	"Category 1": "AG", # art galleries
	"Category 2": "DC", # dance clubs
	"Category 3": "JB", # juice bars
	"Category 4": "RS", # restaurants
	"Category 5": "MU", # museums
	"Category 6": "RE", # resorts
	"Category 7": "PP", # parks/picnic spots
	"Category 8": "BE", # beaches
	"Category 9": "TH", # theaters
	"Category 10": "RI", # religious institutions
}

df.rename(columns=cols_dict, inplace=True)
df.head()

Unnamed: 0,User ID,AG,DC,JB,RS,MU,RE,PP,BE,TH,RI
0,User 1,0.93,1.8,2.29,0.62,0.8,2.42,3.19,2.79,1.82,2.42
1,User 2,1.02,2.2,2.66,0.64,1.42,3.18,3.21,2.63,1.86,2.32
2,User 3,1.22,0.8,0.54,0.53,0.24,1.54,3.18,2.8,1.31,2.5
3,User 4,0.45,1.8,0.29,0.57,0.46,1.52,3.18,2.96,1.57,2.86
4,User 5,0.51,1.2,1.18,0.57,1.54,2.02,3.18,2.78,1.18,2.54


In [8]:
# drop the first column
df.drop(columns=['User ID'], inplace=True)
df.head()

Unnamed: 0,AG,DC,JB,RS,MU,RE,PP,BE,TH,RI
0,0.93,1.8,2.29,0.62,0.8,2.42,3.19,2.79,1.82,2.42
1,1.02,2.2,2.66,0.64,1.42,3.18,3.21,2.63,1.86,2.32
2,1.22,0.8,0.54,0.53,0.24,1.54,3.18,2.8,1.31,2.5
3,0.45,1.8,0.29,0.57,0.46,1.52,3.18,2.96,1.57,2.86
4,0.51,1.2,1.18,0.57,1.54,2.02,3.18,2.78,1.18,2.54


In [11]:
# apply k-means clustering
kmeans = KMeans(n_clusters=10, random_state=0)
kmeans_labels = kmeans.fit_predict(df)
kmeans_labels



array([5, 6, 7, 0, 1, 4, 4, 4, 6, 2, 1, 0, 5, 5, 5, 6, 8, 5, 1, 5, 1, 3,
       4, 4, 1, 6, 5, 8, 4, 5, 5, 6, 6, 3, 2, 8, 8, 4, 5, 7, 3, 4, 3, 0,
       0, 7, 2, 8, 0, 5, 3, 5, 3, 8, 4, 4, 5, 8, 5, 1, 5, 7, 4, 4, 7, 7,
       8, 4, 8, 4, 7, 5, 0, 5, 1, 6, 8, 5, 4, 3, 7, 8, 4, 7, 7, 0, 7, 4,
       9, 0, 7, 4, 6, 5, 6, 7, 5, 8, 5, 2, 3, 4, 6, 4, 7, 5, 1, 4, 7, 6,
       2, 8, 0, 8, 5, 3, 6, 3, 8, 8, 6, 3, 6, 7, 1, 4, 4, 8, 8, 3, 8, 1,
       6, 1, 2, 1, 6, 1, 3, 4, 3, 4, 0, 2, 2, 0, 3, 5, 1, 1, 3, 3, 4, 0,
       3, 8, 3, 7, 6, 8, 8, 4, 0, 8, 8, 4, 5, 2, 5, 8, 3, 7, 4, 4, 3, 0,
       3, 8, 3, 4, 4, 1, 1, 8, 5, 3, 3, 7, 1, 4, 5, 5, 7, 2, 3, 4, 8, 1,
       8, 9, 5, 3, 5, 4, 8, 4, 3, 4, 8, 8, 8, 5, 0, 4, 4, 1, 4, 4, 1, 5,
       3, 8, 6, 4, 1, 0, 9, 8, 0, 5, 2, 5, 4, 0, 3, 3, 1, 0, 1, 7, 5, 6,
       2, 2, 2, 8, 7, 9, 7, 3, 1, 4, 5, 5, 0, 2, 4, 9, 7, 1, 1, 1, 7, 6,
       4, 1, 5, 6, 2, 1, 3, 8, 5, 5, 9, 4, 8, 3, 4, 3, 8, 3, 5, 1, 3, 4,
       9, 2, 8, 3, 5, 4, 4, 7, 4, 8, 3, 8, 7, 0, 4,

In [12]:
# use silhouette score to evaluate the clustering
silhouette_score(df, kmeans_labels)

0.16722722270020568

In [13]:
# apply agglomerative clustering
agg = AgglomerativeClustering(n_clusters=10)
agg_labels = agg.fit_predict(df)
agg_labels

array([9, 9, 1, 7, 0, 1, 1, 1, 3, 8, 0, 2, 5, 9, 9, 3, 4, 3, 0, 5, 4, 2,
       1, 1, 0, 5, 9, 4, 7, 9, 9, 5, 3, 2, 1, 4, 4, 1, 5, 1, 2, 7, 4, 2,
       2, 1, 8, 4, 2, 5, 2, 5, 8, 0, 1, 7, 9, 1, 9, 0, 9, 1, 7, 7, 2, 1,
       4, 1, 4, 4, 1, 5, 2, 5, 0, 3, 1, 9, 4, 2, 1, 0, 1, 4, 0, 2, 1, 7,
       6, 4, 1, 7, 9, 9, 3, 1, 5, 0, 5, 8, 1, 7, 3, 7, 1, 5, 0, 1, 0, 3,
       8, 4, 2, 4, 5, 2, 3, 8, 4, 4, 3, 2, 3, 0, 4, 1, 7, 3, 2, 1, 4, 0,
       3, 0, 8, 0, 3, 0, 1, 1, 2, 1, 2, 1, 1, 2, 2, 5, 0, 0, 2, 2, 7, 2,
       2, 4, 2, 1, 3, 4, 4, 7, 2, 4, 4, 1, 5, 0, 5, 4, 8, 1, 7, 1, 1, 2,
       2, 4, 1, 1, 7, 0, 0, 4, 9, 2, 2, 1, 0, 1, 5, 5, 1, 2, 8, 1, 4, 9,
       4, 6, 9, 2, 5, 1, 4, 7, 2, 1, 4, 1, 4, 5, 2, 7, 7, 0, 7, 4, 0, 5,
       2, 4, 3, 1, 9, 2, 6, 4, 2, 5, 1, 0, 7, 7, 0, 8, 0, 2, 0, 1, 5, 3,
       8, 8, 7, 4, 1, 6, 1, 8, 0, 1, 5, 5, 7, 2, 1, 6, 1, 0, 0, 0, 1, 9,
       1, 0, 9, 3, 8, 0, 2, 4, 0, 9, 6, 1, 3, 8, 7, 2, 4, 4, 5, 0, 2, 7,
       6, 8, 4, 2, 9, 1, 1, 4, 7, 1, 1, 4, 0, 2, 7,

In [14]:
# use silhouette score to evaluate the clustering
silhouette_score(df, agg_labels)

0.10444811795627763

In [15]:
# apply DBSCAN clustering
dbscan = DBSCAN(eps=1, min_samples=5)
dbscan_labels = dbscan.fit_predict(df)
dbscan_labels

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0

In [16]:
# use silhouette score to evaluate the clustering
silhouette_score(df, dbscan_labels)

0.3387970749425855