In [None]:
# call the library

# to generate data processing and visualization tools
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer

# clustering and dimensionality reduction
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering

# clustering evaluation
from sklearn.metrics import silhouette_score

In [None]:
from ucimlrepo import fetch_ucirepo

# fetch the dataset registered under id 462 through ucimlrepo
# NOTE(review): presumably the Drugs.com drug review dataset, judging by the variable name — confirm
drug_reviews_drugs_com = fetch_ucirepo(id=462)

In [None]:
# Take an independent copy of the feature table so that the cleaning steps
# never modify the frame held by the fetched dataset object, and re-running
# this cell always restores the raw, uncleaned data.
df = drug_reviews_drugs_com.data.features.copy()

# preview the first rows of the dataset
df.head()

# **Data Preparation**

During data preparation, several steps are carried out to produce a cleaned dataset for further analysis. <br>
The steps are:
1. Identify Missing Value
2. Remove Duplicates
3. Convert Date Format
4. Lowercasing the Text

## **Handling Missing Value**

In [None]:
# count the missing entries in every column
df.isna().sum()

In [None]:
# number of distinct values in `condition` (a missing value counts as one distinct value)
df['condition'].nunique(dropna=False)

In [None]:
# the most frequently occurring condition in the dataset
df['condition'].mode()

In [None]:
# replace missing conditions with "Not Specified"
# reason: keeps a clear flag for analysis; imputing the most frequent condition
# would wrongly attribute the drug's performance to that condition
# Assign the result back to the column instead of calling fillna(inplace=True)
# on `df.condition`: that chained in-place form warns in recent pandas and has
# no effect on `df` when Copy-on-Write is enabled.
df['condition'] = df['condition'].fillna('Not Specified')

## **Handling Irrelevant Data**

In [None]:
# Per the EDA, some `condition` entries contain irrelevant text of the form
# "<number></span> users found this comment helpful."
# Match that text with a regex and substitute 'Not Specified' for it.
helpful_comment_pattern = r'\d+</span> users found this comment helpful\.'
df['condition'] = df['condition'].replace(
    to_replace=helpful_comment_pattern,
    value='Not Specified',
    regex=True,
)

In [None]:
# inspect a few rows whose condition is now flagged as "Not Specified"
df.loc[df['condition'].eq('Not Specified')].head()

In [None]:
# confirm whether any missing values remain, column by column
df.isna().sum()

## **Handling Duplicates**

In [None]:
# count rows that exactly repeat an earlier row
df.duplicated(keep='first').sum()

In [None]:
# boolean mask marking every row that repeats an earlier row, then show those rows
duplicates = df.duplicated(keep='first')
df.loc[duplicates]

In [None]:
# remove repeated rows, keeping the first occurrence of each, then preview
df = df.drop_duplicates(keep='first')
df.head()

In [None]:
# verify how many duplicated rows remain after dropping
df.duplicated(keep='first').sum()

In [None]:
# report the dataset dimensions (rows, columns)
print(df.shape)

# summarise the number of reviews and the available feature names
print(f"The dataset consists of {df.shape[0]} drug reviews from patient.")
print(f"The dataset consists of the features {', '.join(df.columns)}")

## **Lowercase Text Columns**

In [None]:
# lowercase the values of the text columns
# Build a new frame with `assign` rather than assigning columns in place:
# `df` is the filtered result of drop_duplicates(), and in-place column
# assignment on such a frame can emit SettingWithCopyWarning.
text_columns = ['drugName', 'condition', 'review']
df = df.assign(**{column: df[column].str.lower() for column in text_columns})

In [None]:
# show the shape of the cleaned dataset followed by a preview of its first rows
print(f'The dimension of cleaned dataset: {df.shape}')
df.head()

In [None]:
# write the cleaned dataset to a CSV file in the working directory (row index omitted)
df.to_csv(path_or_buf='cleaned dataset.csv', index=False)

# **Best Clustering Approach Used**

Three of the most popular clustering approaches are KMeans, Hierarchical Clustering and DBSCAN.
To determine which clustering approach is best for this experiment, the following steps were taken. <br>

**Two steps to determine are:**
1. Outlier Detection
2. Compare Performance

## **Outlier Detection**

The PCA diagram shows that the **majority of data points are packed together** in the central region, while **some data points are spread away** from it. The data points that lie far from the central region can be considered outliers.

In [None]:
# reload the cleaned dataset from disk and preview its first three rows
outlier_detection_df = pd.read_csv(filepath_or_buffer='cleaned dataset.csv')
outlier_detection_df.head(n=3)

In [None]:
# dimensions (rows, columns) of the reloaded dataset
outlier_detection_df.shape

In [None]:
# turn the review text into TF-IDF vectors, with the vocabulary capped at 5000 terms
review_texts = outlier_detection_df['review']
tfidf = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf.fit_transform(review_texts)

In [None]:
# shape of the TF-IDF matrix; the vectorizer was created with max_features=5000
tfidf_matrix.shape

In [None]:
# reduce the TF-IDF vectors to two principal components for plotting
# random_state is fixed so the projection (and therefore the plot) is
# reproducible if PCA selects a randomized SVD solver for data of this size
# NOTE(review): .toarray() builds a dense copy of the whole matrix, which may
# need several GB of memory on the full dataset — confirm it fits
pca = PCA(n_components=2, random_state=42)
reduced_data = pca.fit_transform(tfidf_matrix.toarray())

# visualize results for outlier determination
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(reduced_data[:, 0], reduced_data[:, 1])
ax.set_title('PCA with 2 Components')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
plt.show()

## **Clustering Techniques Performance**

DBSCAN outperformed k-means and agglomerative hierarchical clustering. These initial findings serve as a preliminary study for choosing the clustering approach that will be applied after data derivation.

In [None]:
# work on a separate copy so the clustering experiments leave the
# outlier-detection frame untouched
cluster_df = outlier_detection_df.copy()

# vectorize reviews using TF-IDF
# The text column saved to 'cleaned dataset.csv' by this notebook is 'review';
# no 'cleaned review' column is created anywhere in it, so indexing by that
# name raises KeyError.
tfidf = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf.fit_transform(cluster_df['review'])

In [None]:
# random sampling to get a subset of the data (10% of the rows)

# number of rows in the TF-IDF matrix
n_rows = tfidf_matrix.shape[0]

# 10% of the data
sample_size = int(0.1 * n_rows)

# randomly select row indices without replacement
# a seeded generator makes the subset, and every score computed on it,
# reproducible across kernel restarts
rng = np.random.default_rng(42)
random_indices = rng.choice(n_rows, size=sample_size, replace=False)

# subset of data
random_data = tfidf_matrix[random_indices]
random_data.shape

In [None]:
# find the optimal number of clusters for k-means using the silhouette score

# candidate cluster counts, shared by the loop, the plot and the final lookup
k_values = range(2, 11)

# silhouette score obtained for each candidate k, in the same order as k_values
silhouette_scores = []

# apply kmeans
for k in k_values:
    # fixed random_state so the centroid initialisation, and therefore the
    # selected k, is the same on every run
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(random_data)
    score = silhouette_score(random_data, kmeans.labels_)
    silhouette_scores.append(score)

# plot
plt.plot(k_values, silhouette_scores, marker='o')
plt.title('Silhouette Score')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Score')
plt.show()

# the k with the highest score (the first one when several tie)
optimal_k = k_values[silhouette_scores.index(max(silhouette_scores))]
print(f"Optimal number of clusters: {optimal_k}")

In [None]:
# kmeans clustering
kmeans = KMeans(n_clusters=9)
kmeans_labels = kmeans.fit_predict(random_data)

# evaluate clustering
sil_score_kmeans = silhouette_score(random_data, kmeans_labels)
print(f"Silhouette Score for K-Means: {sil_score_kmeans}")

In [None]:
# agglomerative (hierarchical) clustering with 9 clusters
# the sampled matrix is converted to a dense array before fitting
dense_sample = random_data.toarray()
agglo = AgglomerativeClustering(n_clusters=9)
agglo_labels = agglo.fit_predict(dense_sample)

# evaluate the clustering with the silhouette score
sil_score_agglo = silhouette_score(random_data, agglo_labels)
print(f"Silhouette Score for Agglomerative Clustering: {sil_score_agglo}")

In [None]:
# dbscan clustering: grid-search eps / min_samples and keep the combination
# with the highest silhouette score

# store the best parameters
best_eps = None
best_min_samples = None
best_score = -1

# range to iterate
eps_values = [0.1, 0.5, 1.0, 1.3, 1.5]
min_samples_values = [10, 50, 100, 500, 1000]


for eps in eps_values:
    for min_samples in min_samples_values:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        dbscan_labels = dbscan.fit_predict(random_data)

        # DBSCAN marks noise points with the label -1. Noise is not a cluster,
        # so it is excluded from both the cluster count and the score;
        # otherwise "one cluster plus noise" would be scored as two clusters.
        clustered = dbscan_labels != -1
        num_clusters = len(np.unique(dbscan_labels[clustered]))

        # the silhouette score needs at least 2 clusters and fewer clusters
        # than scored samples
        if 2 <= num_clusters < clustered.sum():
            # calculate silhouette score on the clustered (non-noise) points
            score = silhouette_score(random_data[clustered], dbscan_labels[clustered])

            if score > best_score:
                best_score = score
                best_eps = eps
                best_min_samples = min_samples

# show results
if best_eps is None:
    print("No eps/min_samples combination produced at least two clusters.")
else:
    print(f"Best Silhouette Score: {best_score}")
    print(f"Best eps: {best_eps}")
    print(f"Best min_samples: {best_min_samples}")