In [None]:
# call the library

# to generate data processing and visualization tools
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

# parsing, regular expressions and timing utilities
import ast
import re
import time

# openAI API integration
import openai
import tiktoken
from openai import OpenAI, RateLimitError

# tokenization, POS tagging and lemmatization
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# test processing and normalization
from sklearn.preprocessing import Normalizer
from sklearn.feature_extraction.text import TfidfVectorizer

# clustering and dimensionality reduction
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.cluster import DBSCAN

# monitoring
from tqdm.notebook import tqdm
tqdm.pandas()

In [None]:
# retrieve the cleaned dataset
# the path is relative, so the CSV must sit in the notebook's working directory
drug_reviews = pd.read_csv('cleaned dataset.csv')
# display (rows, columns) of the loaded frame
drug_reviews.shape

In [None]:
# check for null and duplication
# isnull().sum().sum() totals missing cells over every column (cells, not rows)
print("The number of missing rows are: ", drug_reviews.isnull().sum().sum())
# duplicated().sum() counts rows that repeat an earlier row exactly
print("The number of duplicated rows are: ", drug_reviews.duplicated().sum())

In [None]:
# fill the missing columns with blank ""
# inplace=True mutates drug_reviews itself; every column with gaps receives "",
# not only the text columns
drug_reviews.fillna("", inplace=True)

# check the missing values (expected to be 0 after the fill)
print("The number of missing rows are: ", drug_reviews.isnull().sum().sum())

In [None]:
# preview the first five rows after the missing values were filled
drug_reviews.head()

# **Sample Filtering**

This section demonstrates how the **dataset is filtered** based on multiple criteria to prepare subsets for clustering analysis. The filtering process applies **thresholds on review usefulness, ratings, and minimum frequency of drugs and conditions**. This focuses the analysis on the most relevant and representative reviews, reducing noise and improving clustering quality.

After applying these filters, the dataset is further divided into **positive** and **negative** sentiment groups based on the review polarity scores. This division allows us to analyze patterns specific to sentiment categories.

Due to the size and complexity of the filtered datasets, applying the DBSCAN clustering algorithm on both subsets is computationally intensive and requires approximately 24 hours to complete on available hardware.


In [None]:
def filter_data(df,
                min_useful_count=None,
                min_ratings=None,
                min_drug_frequency=None,
                min_condition_frequency=None):

    """
    Filters a drug reviews DataFrame, applying each criterion that is not None
    in the order listed below. Later filters see only the rows kept by earlier ones,
    so drug/condition frequencies are counted on the already-filtered rows.

    - usefulCount must be >= min_useful_count
    - rating must be strictly greater than min_ratings
    - a drug must appear strictly more than min_drug_frequency times
    - a condition must appear strictly more than min_condition_frequency times

    Parameters:
        df (pd.DataFrame): Input DataFrame with columns 'usefulCount', 'rating',
            'drugName' and 'condition' (only those needed by the active filters are read).
        min_useful_count (int, optional): Inclusive lower bound on usefulCount.
        min_ratings (float or int, optional): Exclusive lower bound on rating.
        min_drug_frequency (int, optional): Exclusive lower bound on a drug's row count.
        min_condition_frequency (int, optional): Exclusive lower bound on a condition's row count.

    Returns:
        pd.DataFrame: Filtered copy of `df`; the input frame is not modified.
        The original row index is preserved (no reset).
    """

    # Start from the frame that was passed in, not from a notebook-level global,
    # so the function works on any DataFrame and on a fresh kernel.
    filtered_drug_reviews = df.copy()

    if min_useful_count is not None:
        filtered_drug_reviews = filtered_drug_reviews[filtered_drug_reviews['usefulCount'] >= min_useful_count]
        print(f"Filtered reviews with usefulCount >= {min_useful_count}: {filtered_drug_reviews.shape[0]} reviews")

    if min_ratings is not None:
        # strict comparison: a review rated exactly min_ratings is dropped
        filtered_drug_reviews = filtered_drug_reviews[filtered_drug_reviews['rating'] > min_ratings]
        print(f"Filtered reviews with valid ratings {min_ratings}: {filtered_drug_reviews.shape[0]} reviews")

    if min_drug_frequency is not None:
        drug_counts = filtered_drug_reviews['drugName'].value_counts()
        frequent_drugs = drug_counts[drug_counts > min_drug_frequency].index
        filtered_drug_reviews = filtered_drug_reviews[filtered_drug_reviews['drugName'].isin(frequent_drugs)]
        print(f"Filtered reviews with drugs appearing more than {min_drug_frequency} times: {filtered_drug_reviews.shape[0]} reviews")

    if min_condition_frequency is not None:
        cond_counts = filtered_drug_reviews['condition'].value_counts()
        frequent_conds = cond_counts[cond_counts > min_condition_frequency].index
        filtered_drug_reviews = filtered_drug_reviews[filtered_drug_reviews['condition'].isin(frequent_conds)]
        print(f"Filtered reviews with conditions appearing more than {min_condition_frequency} times: {filtered_drug_reviews.shape[0]} reviews")

    return filtered_drug_reviews

In [None]:
# identify the parameters
# summary statistics of the two numeric columns used as filter thresholds below
print("usefulCount statistics:")
print(drug_reviews['usefulCount'].describe())
print()
print("Rating statistics:")
print(drug_reviews['rating'].describe())

In [None]:
# keep reviews with usefulCount >= 16 and rating > 5, then drop drugs and conditions
# that occur only once (filter_data compares rating and both frequencies with a strict ">")
filtered_drug_reviews = filter_data(drug_reviews, min_useful_count=16, min_ratings=5,
                          min_drug_frequency=1, min_condition_frequency=1)

In [None]:
# size and first rows of the filtered dataset
print("The dimension of filtered dataset:", filtered_drug_reviews.shape)
filtered_drug_reviews.head()

In [None]:
# number of filtered reviews per 'Sentiment Category' value
filtered_drug_reviews['Sentiment Category'].value_counts()

In [None]:
# further divide the dataset into 2 categories - positive sentiment and negative sentiment
# category 2 is treated as positive and 0 as negative; rows with any other value are left out
# NOTE(review): presumably 1 encodes neutral — confirm against the cleaning notebook
positive_reviews = filtered_drug_reviews[filtered_drug_reviews['Sentiment Category'] == 2]
negative_reviews = filtered_drug_reviews[filtered_drug_reviews['Sentiment Category'] == 0]

print("The dimension of positive reviews:", positive_reviews.shape)
print("The dimension of negative reviews:", negative_reviews.shape)

In [None]:
# keep only the listed columns; anything else (e.g. 'Sentiment Category') is dropped
positive_reviews = positive_reviews[['drugName', 'condition', 'review', 'rating', 'usefulCount', 'side effects', 'effectiveness']]
# renumber the rows 0..n-1 after filtering
positive_reviews = positive_reviews.reset_index(drop=True)
print("The dimension of positive reviews:", positive_reviews.shape)
positive_reviews.head()

In [None]:
# keep only the listed columns; anything else (e.g. 'Sentiment Category') is dropped
negative_reviews = negative_reviews[['drugName', 'condition', 'review', 'rating', 'usefulCount', 'side effects', 'effectiveness']]
# renumber the rows 0..n-1 after filtering
negative_reviews = negative_reviews.reset_index(drop=True)
print("The dimension of negative reviews:", negative_reviews.shape)
negative_reviews.head()

In [None]:
# download positive reviews and negative reviews
# both files are written to the working directory without the index column;
# the embedding section at the end of the notebook reads them back
positive_reviews.to_csv('positive reviews.csv', index=False)
negative_reviews.to_csv('negative reviews.csv', index=False)

# **Using TF-IDF to Vectorize**

## **Tokenization**

In [None]:
# fetch the NLTK resources for the tokenizer, POS tagger and WordNet lemmatizer used below
# NOTE(review): 'stopwords' is downloaded, but no NLTK stop-word list is used in the visible
# code (stop words are handled by TfidfVectorizer(stop_words='english')) — confirm it is needed
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

In [None]:
# to map POS tags from nltk.pos_tag to WordNet POS tags
def get_wordnet_pos(tag):

    """
    Translate a tag produced by nltk.pos_tag into the WordNet POS constant
    expected by WordNetLemmatizer.

    Only the first letter of the tag matters: J -> adjective, V -> verb,
    N -> noun, R -> adverb. Every other tag falls back to noun.

    Args:
        tag (str): POS tag from nltk.pos_tag

    Returns:
        wordnet constant: corresponding WordNet POS tag
    """

    # (tag prefix, WordNet constant) pairs, tried in order
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )

    for prefix, wordnet_pos in prefix_table:
        if tag.startswith(prefix):
            return wordnet_pos

    # unknown tags are lemmatized as nouns
    return wordnet.NOUN

In [None]:
# to normalize the text
# lowercase, keep letters and whitespace only, tokenize, lemmatize by POS tag

def normalize_text(text):
    """
    Turn raw text into a space-separated string of lemmas.

    Steps, in order:
    - lowercase the text
    - delete every character that is not a-z or whitespace (punctuation, digits, ...)
    - split into word tokens with nltk.word_tokenize
    - POS-tag the tokens and lemmatize each one with its mapped WordNet POS

    Stop words are not removed here.

    Args:
        text (str): Raw input text to be normalized

    Returns:
        str: Normalized and lemmatized text as a single string
    """

    # lowercase first so the character filter only has to allow a-z
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())

    # tokens paired with their POS tags
    tagged_words = nltk.pos_tag(nltk.word_tokenize(letters_only))

    # the POS tag tells the lemmatizer which word form to reduce to
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word, tag in tagged_words:
        lemmas.append(lemmatizer.lemmatize(word, get_wordnet_pos(tag)))

    return ' '.join(lemmas)

In [None]:
# accept either a string or a list of strings and hand it to normalize_text

def process_text(text):
    """
    Normalize a cell value that may be a string or a list of strings.

    A list is joined with single spaces before normalization. Any other type
    (None, numbers, ...) yields an empty string.

    Args:
        text (str or list): Text input to be normalized

    Returns:
        str: Normalized text string
    """

    # list of strings: merge into one sentence first
    if isinstance(text, list):
        return normalize_text(' '.join(text))

    # plain string: normalize directly
    if isinstance(text, str):
        return normalize_text(text)

    # unsupported type
    return ''

In [None]:
def check_data(df):
    """
    Print a short data-quality report for a DataFrame.

    The report contains, in this order:
    - the total number of missing cells over all columns
    - the number of duplicated rows
    - the shape (rows, columns)
    followed by one blank line.

    Args:
        df (pandas.DataFrame): Input dataset to check

    Prints:
        Number of missing values, duplicated rows, and dataset dimensions.
    """

    # each entry: (label, function computing the value); evaluated one by one
    # so the lines appear in the same order as they are computed
    report = (
        ("The number of missing rows are: ", lambda: df.isnull().sum().sum()),
        ("The number of duplicated rows are: ", lambda: df.duplicated().sum()),
        ("The dimension of the dataset is: ", lambda: df.shape),
    )

    for label, compute in report:
        print(label, compute())

    print()

In [None]:
# normalize the 'side effects' column of both subsets into a new text column
# (process_text returns '' for values that are neither str nor list)
positive_reviews['normalized_side_effects'] = positive_reviews['side effects'].apply(process_text)
negative_reviews['normalized_side_effects'] = negative_reviews['side effects'].apply(process_text)

In [None]:
# spot-check the normalized side-effect text of the positive subset
positive_reviews['normalized_side_effects'].head(5)

In [None]:
# spot-check the normalized side-effect text of the negative subset
negative_reviews['normalized_side_effects'].head(5)

In [None]:
# normalize the 'effectiveness' column of both subsets into a new text column
positive_reviews['normalized_effectiveness'] = positive_reviews['effectiveness'].apply(process_text)
negative_reviews['normalized_effectiveness'] = negative_reviews['effectiveness'].apply(process_text)

In [None]:
# spot-check the normalized effectiveness text of the positive subset
positive_reviews['normalized_effectiveness'].head()

In [None]:
# spot-check the normalized effectiveness text of the negative subset
negative_reviews['normalized_effectiveness'].head()

In [None]:
# data-quality report for both subsets after the normalized columns were added
check_data(positive_reviews)
check_data(negative_reviews)

## **Unique Side Effects and Effectiveness Observed**

In [None]:
# tokenize the text to visualize the frequency of words for side effects and effectiveness

def tokenize(X):
    """
    Split stringified lists of phrases into one flat list of cleaned phrases.

    For every entry of the Series:
    - convert to lowercase
    - remove the characters [ ] ( ) and '
    - split on commas
    - drop pieces with more than 5 words (likely sentences rather than terms)
    - strip surrounding whitespace and drop empty pieces

    Entries are visited in Series order by position, so the Series may carry any
    index (e.g. the non-contiguous index left after filtering). Missing values
    (NaN/None) are skipped.

    Args:
        X (pd.Series): Series containing text data to tokenize

    Returns:
        list: List of cleaned tokens/phrases
    """

    # lowercase, then remove brackets, parentheses and single quotes in one pass
    cleaned = X.str.lower().str.replace(r"[\[\]()']", "", regex=True)

    tokenize_word = []
    # iterate over the values instead of looking rows up by label 0..n-1
    for entry in cleaned.dropna():
        for element in entry.split(','):
            # the word limit is checked before stripping, as whitespace does not add words
            if len(element.split()) > 5:
                continue
            phrase = element.strip()
            if phrase != "":
                # append in place; repeated list concatenation would be quadratic
                tokenize_word.append(phrase)

    return tokenize_word

In [None]:
def get_word_count(df):
    """
    Report and plot how often each side-effect and effectiveness phrase occurs.

    Args:
        df (pd.DataFrame): DataFrame with 'side effects' and 'effectiveness' columns

    Displays:
        - Total and distinct phrase counts for both columns
        - One figure with two horizontal bar charts: the 10 most frequent
          side effects (top) and the 10 most frequent effectiveness phrases (bottom)
    """

    # flat phrase lists for both columns
    side_effect_tokens = tokenize(df['side effects'])
    effectiveness_tokens = tokenize(df['effectiveness'])

    print("The number of side effects: ", len(side_effect_tokens))
    print("The number of effectiveness: ", len(effectiveness_tokens))

    # occurrences per distinct phrase
    side_effect_freq = pd.DataFrame(side_effect_tokens, columns=['side_effects'])['side_effects'].value_counts()
    effectiveness_freq = pd.DataFrame(effectiveness_tokens, columns=['effectiveness'])['effectiveness'].value_counts()
    print("The number of unique side effects: ", len(side_effect_freq))
    print("The number of unique effectiveness: ", len(effectiveness_freq))

    # two-column tables (phrase, count), most frequent first
    side_effect_table = side_effect_freq.reset_index()
    side_effect_table.columns = ['Side Effect', 'Count']
    side_effect_table = side_effect_table.sort_values(by='Count', ascending=False)

    effectiveness_table = effectiveness_freq.reset_index()
    effectiveness_table.columns = ['Effectiveness', 'Count']
    effectiveness_table = effectiveness_table.sort_values(by='Count', ascending=False)

    # one panel per column: (axis, table, phrase column, palette, title)
    fig, axes = plt.subplots(2, 1, figsize=(12, 12))
    panels = (
        (axes[0], side_effect_table, 'Side Effect', 'viridis', 'Top 10 Most Common Side Effects'),
        (axes[1], effectiveness_table, 'Effectiveness', 'coolwarm', 'Top 10 Most Common Effectiveness'),
    )

    for ax, table, phrase_column, palette, title in panels:
        sns.barplot(x='Count', y=phrase_column, data=table.head(10), palette=palette, ax=ax)
        ax.set_title(title)
        ax.set_xlabel('Count')
        ax.set_ylabel(phrase_column)

    plt.tight_layout()
    plt.show()

    print()

In [None]:
# phrase counts and top-10 bar charts for the positive subset
get_word_count(positive_reviews)

In [None]:
# phrase counts and top-10 bar charts for the negative subset
get_word_count(negative_reviews)

## **Vectorization by TF-IDF**

In [None]:
def get_min_df(df, column_name, min_df_values=(1, 5, 10, 15, 20), max_df=0.95):
    """
    Prints the vocabulary size obtained with several `min_df` values, to help
    choose the `min_df` used for the final TF-IDF vectorization.

    Parameters:
    - df (pd.DataFrame): DataFrame containing the text data.
    - column_name (str): Name of the column containing text.
    - min_df_values (iterable of int or float, optional): Candidate `min_df` values,
      tried in the given order. Defaults to (1, 5, 10, 15, 20).
    - max_df (float or int, optional): `max_df` passed to every vectorizer.
      Defaults to 0.95.

    Returns:
    - None
    """

    for min_df in min_df_values:
        # same settings as vectorize_text apart from the swept min_df
        tfidf = TfidfVectorizer(min_df=min_df, stop_words='english', max_df=max_df)
        # only the fitted vocabulary is inspected; the matrix itself is discarded
        tfidf.fit_transform(df[column_name])

        print(f"min_df={min_df}: n_features={len(tfidf.vocabulary_)}")

In [None]:
def vectorize_text(df, column_name, min_df):
    """
    Build the TF-IDF matrix for one text column.

    English stop words are removed and terms occurring in more than 95% of the
    documents are ignored (max_df = 0.95).

    Parameters:
    - df (pd.DataFrame): The input DataFrame.
    - column_name (str): The name of the column containing text data.
    - min_df (int): The minimum number of documents a term must appear in to be included.

    Returns:
    - tfidf_matrix (sparse matrix): The TF-IDF representation of the text data.
    """

    vectorizer = TfidfVectorizer(min_df=min_df, stop_words='english', max_df = 0.95)
    tfidf_matrix = vectorizer.fit_transform(df[column_name])

    # report documents x vocabulary size
    n_samples, n_features = tfidf_matrix.shape
    print(f"n_samples: {n_samples}, n_features: {n_features}", '\n')

    return tfidf_matrix

In [None]:
def check_tf_idf(df, side_effect_tfidf, effectiveness_tfidf):
    """
    Print a short integrity report for a DataFrame and its two TF-IDF matrices.

    Parameters:
    - df (pd.DataFrame): The original DataFrame.
    - side_effect_tfidf (sparse matrix): TF-IDF matrix for side effects.
    - effectiveness_tfidf (sparse matrix): TF-IDF matrix for effectiveness.

    Prints:
    - Number of missing values and duplicated rows in the DataFrame.
    - Dimensions of the side effects and effectiveness TF-IDF matrices.
    - A trailing blank line.
    """

    # DataFrame checks
    missing_total = df.isnull().sum().sum()
    print("The number of missing rows are: ", missing_total)

    duplicate_total = df.duplicated().sum()
    print("The number of duplicated rows are: ", duplicate_total)

    # matrix dimensions, side effects first
    for label, matrix in (
        ("The dimension of side effects tf_idf is: ", side_effect_tfidf),
        ("The dimension of effectiveness tf_idf is: ", effectiveness_tfidf),
    ):
        print(label, matrix.shape)

    print()

In [None]:
# vocabulary size per candidate min_df for the positive subset (used to pick min_df below)
print("For side effects in positive reviews:")
get_min_df(positive_reviews, 'normalized_side_effects')
print()
print("For effectiveness in positive reviews:")
get_min_df(positive_reviews, 'normalized_effectiveness')

In [None]:
# vocabulary size per candidate min_df for the negative subset (used to pick min_df below)
print("For side effects in negative reviews:")
get_min_df(negative_reviews, 'normalized_side_effects')
print()
print("For effectiveness in negative reviews:")
get_min_df(negative_reviews, 'normalized_effectiveness')

In [None]:
# TF-IDF matrices for the positive subset, keeping terms found in at least 10 documents
print("For side effects in positive review:")
side_effects_positive_tf_idf = vectorize_text(positive_reviews, 'normalized_side_effects', 10)

print()

print("For effectiveness in positive review:")
effectiveness_positive_tf_idf = vectorize_text(positive_reviews, 'normalized_effectiveness', 10)

In [None]:
# TF-IDF matrices for the negative subset, keeping terms found in at least 10 documents
print("For side effects in negative review:")
side_effects_negative_tf_idf = vectorize_text(negative_reviews, 'normalized_side_effects', 10)

print()

print("For effectiveness in negative review:")
effectiveness_negative_tf_idf = vectorize_text(negative_reviews, 'normalized_effectiveness', 10)

In [None]:
# integrity report: each matrix should have as many rows as its source frame
print("For positive reviews")
check_tf_idf(positive_reviews, side_effects_positive_tf_idf, effectiveness_positive_tf_idf)

print()

print("For negative reviews")
check_tf_idf(negative_reviews, side_effects_negative_tf_idf, effectiveness_negative_tf_idf)

## **Normalization/Scaling**

In [None]:
def normalization(tfidf_matrix):
    """
    Rescale every row (document vector) of the TF-IDF matrix to unit length
    using scikit-learn's Normalizer with its default settings (L2 norm), so that
    cosine-based comparisons between documents behave consistently.

    Parameters:
    - tfidf_matrix (sparse matrix): The TF-IDF matrix to normalize.

    Returns:
    - normalized_data (sparse matrix): L2-normalized TF-IDF matrix.
    """

    # Normalizer is stateless, so fitting and transforming happen in one step
    return Normalizer().fit_transform(tfidf_matrix)

In [None]:
# row-normalize both positive TF-IDF matrices; shapes are unchanged by normalization
positive_side_effects_scaled = normalization(side_effects_positive_tf_idf)
positive_effectiveness_scaled = normalization(effectiveness_positive_tf_idf)

print("Shape of positive side effects scaled data: ", positive_side_effects_scaled.shape)
print("Shape of positive effectiveness scaled data: ", positive_effectiveness_scaled.shape)

In [None]:
# row-normalize both negative TF-IDF matrices; shapes are unchanged by normalization
negative_side_effects_scaled = normalization(side_effects_negative_tf_idf)
negative_effectiveness_scaled = normalization(effectiveness_negative_tf_idf)

print("Shape of negative side effects scaled data: ", negative_side_effects_scaled.shape)
print("Shape of negative effectiveness scaled data: ", negative_effectiveness_scaled.shape)

## **Dimensionality Reduction**

In [None]:
def plot_explained_variance(data_scaled, name, step, max_components, random_state=42):
    """
    Plots the cumulative explained variance as a function of the number of components
    for Truncated SVD (suitable for sparse data like TF-IDF matrices).

    The component counts evaluated are range(1, max_components, step), i.e.
    1, 1 + step, 1 + 2 * step, ... strictly below max_components.

    A single SVD is fitted with the largest evaluated component count; the value for
    every smaller count is the running sum of the per-component explained variance
    ratios. This avoids refitting the decomposition from scratch for each count.

    Parameters:
    - data_scaled (sparse matrix or ndarray): The normalized or scaled feature matrix.
    - name (str): Label for the dataset or variable (used in print and plot titles).
    - step (int): Step size to increment the number of components.
    - max_components (int): Exclusive upper bound on the number of components.
    - random_state (int or None, optional): Seed for the randomized SVD solver so the
      curve is reproducible between runs. Defaults to 42.

    Returns:
    - None: Displays a line plot showing cumulative explained variance.
    """

    explained_variance_ratios = []

    n_components_range = list(range(1, max_components, step))

    # nothing to fit when the range is empty (max_components <= 1); an empty plot is shown
    if n_components_range:
        svd = TruncatedSVD(n_components=n_components_range[-1], random_state=random_state)
        svd.fit(data_scaled)

        # cumulative[n - 1] is the variance share captured by the first n components
        cumulative = np.cumsum(svd.explained_variance_ratio_)

        for n in n_components_range:
            total_variance = cumulative[n - 1]
            explained_variance_ratios.append(total_variance)
            print(f"[{name}] Components: {n}, Cumulative Explained Variance: {total_variance:.4f}")

    plt.figure(figsize=(12, 6))
    plt.plot(n_components_range, explained_variance_ratios, marker='o')
    plt.title(f"Cumulative Explained Variance vs Number of Components for {name}")
    plt.xlabel("Number of Components")
    plt.ylabel("Cumulative Explained Variance Ratio")
    plt.ylim(0, 1)
    plt.grid(True)
    plt.show()

In [None]:
def dimensionality_reduction(data_scaled, n_components, random_state=42):
    """
    Performs dimensionality reduction using Truncated SVD (suitable for sparse matrices).

    Parameters:
    - data_scaled (sparse matrix or ndarray): Normalized or scaled input data (e.g., TF-IDF).
    - n_components (int): Number of dimensions to reduce the data to.
    - random_state (int or None, optional): Seed for the randomized SVD solver. A fixed
      seed makes the reduced features, and therefore the clustering built on them,
      reproducible across kernel restarts. Defaults to 42; pass None for an unseeded run.

    Returns:
    - reduced_data (ndarray): Transformed dataset with reduced dimensions.
    """

    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    reduced_data = svd.fit_transform(data_scaled)

    return reduced_data

In [None]:
# explained-variance curves for the positive subset
# component counts come from range(1, max_components, step): 1, 501, 1001, ... below the maximum
print("For Positive Reviews")
plot_explained_variance(positive_side_effects_scaled, "Positive Side Effects", 500, 2000)
print()
plot_explained_variance(positive_effectiveness_scaled, "Positive Effectiveness", 500, 3000)

In [None]:
# explained-variance curves for the negative subset
# component counts come from range(1, max_components, step): 1, 301, 601, 901 and 1, 401, 801, 1201
print("For Negative Reviews")
plot_explained_variance(negative_side_effects_scaled, "Negative Side Effects", 300, 1200)
print()
plot_explained_variance(negative_effectiveness_scaled, "Negative Effectiveness", 400, 1600)

In [None]:
# reduce the positive matrices to 1000 (side effects) and 1500 (effectiveness) components
positive_svd_side_effects = dimensionality_reduction(positive_side_effects_scaled, 1000)
positive_svd_effectiveness = dimensionality_reduction(positive_effectiveness_scaled, 1500)
print("Shape of positive side effects SVD data: ", positive_svd_side_effects.shape)
print("Shape of positive effectiveness SVD data: ", positive_svd_effectiveness.shape)

In [None]:
# reduce the negative matrices to 800 (side effects) and 1000 (effectiveness) components
negative_svd_side_effects = dimensionality_reduction(negative_side_effects_scaled, 800)
negative_svd_effectiveness = dimensionality_reduction(negative_effectiveness_scaled, 1000)
print("Shape of negative side effects SVD data: ", negative_svd_side_effects.shape)
print("Shape of negative effectiveness SVD data: ", negative_svd_effectiveness.shape)

## **DBSCAN Implementation**

In [None]:
# get the epsilon value
# cosine distance is the default because Euclidean distance is less informative
# for high-dimensional text features
# the neighbor search uses algorithm='brute' instead of the default 'auto',
# since the tree-based searches do not support cosine distance

def plot_k_distance(data, k, metric='cosine'):
    """
    Plots one k-distance graph per requested k to help determine the epsilon for DBSCAN.

    For each k, every point's distance to its k-th nearest neighbor is computed,
    the distances are sorted in ascending order and plotted; the "elbow" of the curve
    is a candidate eps. The neighbors are queried on the same data the model was
    fitted on.

    Parameters:
    - data: ndarray or sparse matrix of feature vectors
    - k: iterable of integers, the neighbor counts to plot (one figure each)
    - metric: distance metric for the neighbor search (default is 'cosine')
    """

    for k_value in k:
        print("k-distance for k = ", k_value)
        neigh = NearestNeighbors(n_neighbors=k_value, metric=metric, algorithm='brute')
        neigh.fit(data)
        distances, _ = neigh.kneighbors(data)

        # last column holds the distance to the farthest of the k neighbors
        sorted_distances = np.sort(distances[:, -1])

        plt.figure(figsize=(12, 6))
        plt.plot(sorted_distances)
        plt.xlabel("Points sorted by distance")
        plt.ylabel(f"{k_value}-th Nearest Neighbor Distance")
        plt.title("k-Distance Graph to Find Optimal Eps")
        plt.grid()
        plt.show()

In [None]:
def tune_dbscan(X, eps_values, min_samples_values, metric='cosine'):
    """
    Runs DBSCAN for every (min_samples, eps) combination and returns the
    configuration with the lowest Davies-Bouldin index (DBI).

    The distance metric is passed to DBSCAN and to the silhouette score, so the eps
    values are interpreted in the same metric as the k-distance plots (cosine by
    default). The Davies-Bouldin index has no metric option and is always computed
    on Euclidean distances. Noise points (label -1) are excluded from both scores.
    Combinations that yield fewer than two clusters are reported and skipped.

    Parameters:
    - X: ndarray of feature vectors (rows are indexed with a boolean mask)
    - eps_values: list of epsilon values
    - min_samples_values: list of min_samples values
    - metric: distance metric for DBSCAN and the silhouette score (default is 'cosine')

    Returns:
    - best_eps: best epsilon value found (None if no combination gave >= 2 clusters)
    - best_score: lowest DBI score (inf if no combination gave >= 2 clusters)
    - best_min_samples: corresponding min_samples value (None in the same case)
    """

    best_eps = None
    best_score = float('inf')
    best_min_samples = None

    for min_samples in min_samples_values:
        for eps in eps_values:
            print(f"Testing min_samples = {min_samples}, eps = {eps}")
            dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric=metric, n_jobs=-1)
            labels = dbscan.fit_predict(X)

            print("Done Clustering")

            # boolean mask of points assigned to a cluster (DBSCAN marks noise as -1)
            clustered = labels != -1
            n_clusters = len(set(labels[clustered]))

            if n_clusters >= 2:

                # exclude noise points
                sil_score = silhouette_score(X[clustered], labels[clustered], metric=metric)
                print(f"Silhouette Coefficients: {sil_score:.4f}")
                db_index = davies_bouldin_score(X[clustered], labels[clustered])
                print(f"DBI: {db_index:.4f}")

                # lower DBI means more compact, better separated clusters
                if db_index < best_score:
                    best_score = db_index
                    best_eps = eps
                    best_min_samples = min_samples

                # how many clusters formed
                print("Clustered labels: ", n_clusters)
                # how many number of noise points
                print("Noise points: ", int((~clustered).sum()))

            else:
                print("No clusters formed or all points are noise.")

        print()

    return best_eps, best_score, best_min_samples

In [None]:
def cluster_and_insert_labels(df, features, eps, min_samples, label_column_name, metric='cosine'):
    """
    Applies DBSCAN clustering and inserts the cluster labels into the DataFrame.

    The labels are written into `df` itself (the passed frame is modified) by position,
    so `features` must have one row per row of `df`, in the same order. Noise points
    receive the label -1. When at least two clusters are found, the silhouette
    coefficient (in `metric`) and the Davies-Bouldin index (always Euclidean) are
    printed for the non-noise points.

    Parameters:
    - df: original pandas DataFrame
    - features: ndarray of features to cluster on (e.g., reduced TF-IDF)
    - eps: epsilon value for DBSCAN
    - min_samples: minimum samples for DBSCAN
    - label_column_name: name of the column to store cluster labels
    - metric: distance metric for DBSCAN and the silhouette score (default is 'cosine')

    Returns:
    - df: the same DataFrame object, now containing the cluster label column
    """

    dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric=metric, n_jobs=-1)
    labels = dbscan.fit_predict(features)

    # insert labels into the original DataFrame
    df[label_column_name] = labels

    # boolean mask of points assigned to a cluster (DBSCAN marks noise as -1)
    clustered = labels != -1
    n_clusters = len(set(labels[clustered]))

    if n_clusters >= 2:

        # exclude noise points
        sil_score = silhouette_score(features[clustered], labels[clustered], metric=metric)
        print(f"Silhouette Coefficients: {sil_score:.4f}")
        db_index = davies_bouldin_score(features[clustered], labels[clustered])
        print(f"DBI: {db_index:.4f}")

        # how many clusters formed (noise is not a cluster, and may be absent entirely)
        print("Clustered labels: ", n_clusters)
        # how many number of noise points
        print("Noise points: ", int((~clustered).sum()))

    else:
        print(f"[{label_column_name}] Too few clusters to compute DBI")

    return df

### **Positive Side Effects**

In [None]:
# k-distance curves for k = 10, 100 and 300 to read off candidate eps values
plot_k_distance(positive_svd_side_effects, k = [10, 100, 300])

In [None]:
# coarse grid: 6 eps values x 3 min_samples values (18 DBSCAN runs)
tune_dbscan(positive_svd_side_effects, eps_values = [0.1, 0.2, 0.3, 0.6, 0.75, 0.85], min_samples_values = [10, 100, 300])

In [None]:
# follow-up run for the eps values 0.4 and 0.5, which the coarse grid skipped, at min_samples = 100
tune_dbscan(positive_svd_side_effects, eps_values = [0.4, 0.5], min_samples_values = [100])

In [None]:
# select eps with 0.1 and min_samples with 100
# compact and well-separated clusters with relatively low dbi
# the labels are stored in the 'side_effects_labels' column (-1 marks noise)

positive_reviews = cluster_and_insert_labels(
    df=positive_reviews,
    features=positive_svd_side_effects,
    eps=0.1,
    min_samples=100,
    label_column_name='side_effects_labels'
)

### **Positive Effectiveness**

In [None]:
# k-distance curves for k = 10, 100 and 300 to read off candidate eps values
plot_k_distance(positive_svd_effectiveness, k = [10, 100, 300])

In [None]:
# coarse grid: 6 eps values x 3 min_samples values (18 DBSCAN runs)
tune_dbscan(positive_svd_effectiveness, eps_values = [0.2, 0.4, 0.5, 0.7, 0.8, 0.85], min_samples_values = [10, 100, 300])

In [None]:
# finer grid around eps = 0.3 with smaller min_samples values (9 DBSCAN runs)
tune_dbscan(positive_svd_effectiveness, eps_values = [0.25, 0.3, 0.35], min_samples_values = [20, 30, 50])

In [None]:
# final clustering with eps = 0.3 and min_samples = 20;
# the labels are stored in the 'effectiveness_labels' column (-1 marks noise)
positive_reviews = cluster_and_insert_labels(
    df=positive_reviews,
    features=positive_svd_effectiveness,
    eps=0.3,
    min_samples=20,
    label_column_name='effectiveness_labels'
)

# optional export of the labeled positive subset (disabled)
# positive_reviews.to_csv('positive reviews with labels.csv', index=False)

### **Negative Side Effects**

In [None]:
# k-distance curves for k = 10, 100 and 200 to read off candidate eps values
plot_k_distance(negative_svd_side_effects, k = [10, 100, 200])

In [None]:
# coarse grid: 5 eps values x 3 min_samples values (15 DBSCAN runs)
tune_dbscan(negative_svd_side_effects, eps_values = [0.15, 0.4, 0.5, 0.7, 0.85], min_samples_values = [10, 100, 200])

In [None]:
# finer grid: eps 0.2-0.5 with min_samples 50 and 70 (8 DBSCAN runs)
tune_dbscan(negative_svd_side_effects, eps_values = [0.2, 0.3, 0.4, 0.5], min_samples_values = [50, 70])

In [None]:
# final clustering runs with eps = 0.5 and min_samples = 50
# NOTE(review): this comment previously said "eps = 0.22 and min_samples = 30", which
# does not match the arguments below — confirm which configuration is intended
negative_reviews = cluster_and_insert_labels(
    df=negative_reviews,
    features=negative_svd_side_effects,
    eps=0.5,
    min_samples=50,
    label_column_name='side_effects_labels'
)

### **Negative Effectiveness**

In [None]:
# k-distance curves for k = 10, 100 and 200 to read off candidate eps values
plot_k_distance(negative_svd_effectiveness, k = [10, 100, 200])

In [None]:
# coarse grid: 5 eps values x 3 min_samples values (15 DBSCAN runs)
tune_dbscan(negative_svd_effectiveness, eps_values = [0.2, 0.5, 0.6, 0.7, 0.85], min_samples_values = [10, 100, 200])

In [None]:
# finer grid: eps 0.5-0.65 with small min_samples values (16 DBSCAN runs)
tune_dbscan(negative_svd_effectiveness, eps_values = [0.5, 0.55, 0.6, 0.65], min_samples_values = [5, 10, 15, 20])

In [None]:
# final clustering with eps = 0.5 and min_samples = 10;
# the labels are stored in the 'effectiveness_labels' column (-1 marks noise)
negative_reviews = cluster_and_insert_labels(
    df=negative_reviews,
    features=negative_svd_effectiveness,
    eps=0.5,
    min_samples=10,
    label_column_name='effectiveness_labels'
)

# optional export of the labeled negative subset (disabled)
# negative_reviews.to_csv('negative reviews with labels.csv', index=False)

## **Join the Two Datasets Together**

In [None]:
# to ensure the text is in list format
def safe_parse_list(x):
    """
    Safely parses a string representation of a list into a Python list.

    The string is evaluated with ast.literal_eval, which only accepts Python
    literals and never executes code. The result is always a list, so callers can
    iterate over it without checking the type.

    Parameters:
    - x: Input value that may be a string representation of a list.

    Returns:
    - The parsed items as a list if the string is a list, tuple or set literal.
    - A list containing the original string if parsing fails or if the string is a
      different kind of literal (e.g. "5", "None", "'text'").
    - An empty list if the input is not a string.
    """

    if not isinstance(x, str):
        return []

    try:
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # not a Python literal at all (free text, empty string, ...)
        return [x]

    if isinstance(parsed, (list, tuple, set)):
        return list(parsed)

    # a scalar literal such as a number is kept as the original text
    return [x]

In [None]:
# rearrange the columns
# each label column is placed next to its text column; columns not listed
# (the normalized_* helper columns) are dropped
positive_reviews = positive_reviews[['drugName', 'condition', 'review', 'rating', 'usefulCount', 'side effects', 'side_effects_labels', 'effectiveness', 'effectiveness_labels']]
negative_reviews = negative_reviews[['drugName', 'condition', 'review', 'rating', 'usefulCount', 'side effects', 'side_effects_labels', 'effectiveness', 'effectiveness_labels']]

In [None]:
# add the sentiment column
# a constant label per subset so the origin of each row survives the concatenation
positive_reviews['sentiment'] = "Positive"
negative_reviews['sentiment'] = "Negative"

In [None]:
# join the positive and negative reviews
# NOTE: this rebinds drug_reviews, which until here held the full cleaned dataset
# ignore_index=True already produces a fresh 0..n-1 index, so the reset below changes nothing
drug_reviews = pd.concat([positive_reviews, negative_reviews], ignore_index=True)
drug_reviews = drug_reviews.reset_index(drop=True)

# drop "review" column
drug_reviews = drug_reviews.drop(columns=['review'])
print("The dimension of drug reviews:", drug_reviews.shape)
drug_reviews.head()

In [None]:
# turn each stringified list into one space-separated string
drug_reviews['side effects'] = drug_reviews['side effects'].apply(
    lambda x: ' '.join(str(item) for item in safe_parse_list(x))
)

drug_reviews['effectiveness'] = drug_reviews['effectiveness'].apply(
    lambda x: ' '.join(str(item) for item in safe_parse_list(x))
)

# cells that ended up empty get an explicit placeholder text
drug_reviews['side effects'] = drug_reviews['side effects'].replace('', 'no side effects reported')
drug_reviews['effectiveness'] = drug_reviews['effectiveness'].replace('', 'no effectiveness information')

In [None]:
# missing values per column of the combined dataset
drug_reviews.isnull().sum()

In [None]:
# download the dataset
# written to the working directory without the index column
drug_reviews.to_csv('drug reviews with labels.csv', index=False)

# **Text Embedding with OpenAI ada-002**

In [None]:
# to ensure the text is in list format
def safe_parse_list(x):
    """
    Safely parses a string representation of a list into a Python list.

    The string is evaluated with ast.literal_eval, which only accepts Python
    literals and never executes code. The result is always a list, so callers can
    iterate over it without checking the type.

    Parameters:
    - x: Input value that may be a string representation of a list.

    Returns:
    - The parsed items as a list if the string is a list, tuple or set literal.
    - A list containing the original string if parsing fails or if the string is a
      different kind of literal (e.g. "5", "None", "'text'").
    - An empty list if the input is not a string.
    """

    if not isinstance(x, str):
        return []

    try:
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # not a Python literal at all (free text, empty string, ...)
        return [x]

    if isinstance(parsed, (list, tuple, set)):
        return list(parsed)

    # a scalar literal such as a number is kept as the original text
    return [x]

In [None]:
# load the positive reviews from CSV, report the frame's shape and preview the first rows
positive_reviews = pd.read_csv("positive reviews.csv")
print("The dimension of positive reviews:", positive_reviews.shape)
positive_reviews.head()

In [None]:
# flatten the list-like text columns into space-joined strings and
# substitute a placeholder wherever the result is an empty string
text_columns = {
    'side effects': 'no side effects reported',
    'effectiveness': 'no effectiveness information',
}

for column, placeholder in text_columns.items():
    flattened = positive_reviews[column].apply(
        lambda raw: ' '.join(map(str, safe_parse_list(raw)))
    )
    positive_reviews[column] = flattened.replace('', placeholder)

In [None]:
# preview the first rows after flattening the text columns
positive_reviews.head()

In [None]:
# load the negative reviews from CSV, report the frame's shape and preview the first rows
negative_reviews = pd.read_csv("negative reviews.csv")
print("The dimension of negative reviews:", negative_reviews.shape)
negative_reviews.head()

In [None]:
# flatten the list-like text columns into space-joined strings and
# substitute a placeholder wherever the result is an empty string
text_columns = {
    'side effects': 'no side effects reported',
    'effectiveness': 'no effectiveness information',
}

for column, placeholder in text_columns.items():
    flattened = negative_reviews[column].apply(
        lambda raw: ' '.join(map(str, safe_parse_list(raw)))
    )
    negative_reviews[column] = flattened.replace('', placeholder)

In [None]:
# preview the first rows after flattening the text columns
negative_reviews.head()

In [None]:
# report the shape and the total number of missing cells of both frames
report = (
    ("The dimension of positive reviews:", positive_reviews.shape),
    ("The missing values in positive reviews:", positive_reviews.isnull().sum().sum()),
    ("The dimension of negative reviews:", negative_reviews.shape),
    ("The missing values in negative reviews:", negative_reviews.isnull().sum().sum()),
)

for caption, value in report:
    print(caption, value)

## **Call API**

In [None]:
! pip install --upgrade openai
! pip install tiktoken

In [None]:
# create the API client; with no api_key argument the OpenAI client reads the key
# from the OPENAI_API_KEY environment variable, so no credential is stored in the notebook
client = OpenAI()

## **Tiktoken**

In [None]:
# Load the tokenizer for the model
tokenizer = tiktoken.encoding_for_model("text-embedding-ada-002")

# Function to check a text against the model's token limit
def count_tokens(text, max_tokens=8191):
    """
    Check whether a text fits within the embedding model's token limit.

    Despite its name, this function returns a boolean rather than a count,
    which is what lets it be used directly as a row filter with Series.apply.

    Parameters:
    - text (str): The input text to tokenize.
    - max_tokens (int): Largest number of tokens accepted. Defaults to 8191,
      the limit this notebook uses for the ada-002 embedding model.

    Returns:
    - bool: True if the text encodes to at most max_tokens tokens, else False.
    """

    # Encode the text to get the token IDs
    token_ids = tokenizer.encode(text)

    # Compare the token count with the limit
    return len(token_ids) <= max_tokens

In [None]:
# keep only the rows whose two text columns both pass the token-limit check
side_effects_ok = positive_reviews['side effects'].apply(count_tokens)
effectiveness_ok = positive_reviews['effectiveness'].apply(count_tokens)
positive_reviews = positive_reviews[side_effects_ok & effectiveness_ok].reset_index(drop=True)

In [None]:
# report the shape after the token-limit filter and preview the first rows
print("The dimension of positive reviews:", positive_reviews.shape)
positive_reviews.head()

In [None]:
# keep only the rows whose two text columns both pass the token-limit check
side_effects_ok = negative_reviews['side effects'].apply(count_tokens)
effectiveness_ok = negative_reviews['effectiveness'].apply(count_tokens)
negative_reviews = negative_reviews[side_effects_ok & effectiveness_ok].reset_index(drop=True)

In [None]:
# report the shape after the token-limit filter and preview the first rows
print("The dimension of negative reviews:", negative_reviews.shape)
negative_reviews.head()

## **Text Embedding**

In [None]:
! pip install tqdm

In [None]:
# text embedding
def get_embedding(text, max_retries=None, retry_delay=10):
    """
    Generate an embedding vector for the given text using OpenAI's embedding API.

    A rate-limit error is retried after a pause. The retry is a loop rather
    than a recursive call, so a long throttling period cannot exhaust the
    recursion limit. Any other error is printed and reported as None.

    Parameters:
    - text (str): Input text to embed.
    - max_retries (int or None): Maximum number of retries after a rate-limit
      error. None (default) retries until the request succeeds.
    - retry_delay (int or float): Seconds to sleep before each retry. Defaults to 10.

    Returns:
    - list[float] or None: Embedding vector as a list of floats, or None if an
      error occurs or the retries are used up.
    """

    retries = 0

    while True:
        try:
            response = client.embeddings.create(
                input=text,
                model="text-embedding-ada-002"
            )

            # one input text was sent, so the first item holds its embedding
            return response.data[0].embedding

        except RateLimitError:
            if max_retries is not None and retries >= max_retries:
                print(f"Embedding error: rate limit still hit after {retries} retries")
                return None

            retries += 1
            print(f"Rate limit hit. Sleeping for {retry_delay} seconds...")
            time.sleep(retry_delay)

        except Exception as e:
            print(f"Embedding error: {e}")
            return None

In [None]:
# get embeddings for side effects: one get_embedding call per row, with a progress bar
positive_reviews['side effects embedding'] = positive_reviews['side effects'].progress_apply(get_embedding)

In [None]:
# get embeddings for effectiveness: one get_embedding call per row, with a progress bar
positive_reviews['effectiveness embedding'] = positive_reviews['effectiveness'].progress_apply(get_embedding)

In [None]:
# preview the first rows, now including the two embedding columns
positive_reviews.head()

In [None]:
# get embeddings for side effects: one get_embedding call per row, with a progress bar
negative_reviews['side effects embedding'] = negative_reviews['side effects'].progress_apply(get_embedding)

In [None]:
# get embeddings for effectiveness: one get_embedding call per row, with a progress bar
negative_reviews['effectiveness embedding'] = negative_reviews['effectiveness'].progress_apply(get_embedding)

In [None]:
# preview the first rows, now including the two embedding columns
negative_reviews.head()

## **Process Text Embedding for Future Use**
The text embeddings are returned as the string data type, which is not suitable for model development. <br>
Therefore, the strings are converted into lists and then transformed into arrays.

In [None]:
# convert the string representation of lists back to actual lists;
# values that are not strings (e.g. lists returned directly by get_embedding)
# are kept unchanged, because ast.literal_eval rejects non-string values
positive_reviews['side effects embedding'] = positive_reviews['side effects embedding'].progress_apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [None]:
# parse string-encoded embeddings back into lists; values that are not strings
# are kept unchanged, because ast.literal_eval rejects non-string values
positive_reviews['effectiveness embedding'] = positive_reviews['effectiveness embedding'].progress_apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [None]:
# parse string-encoded embeddings back into lists; values that are not strings
# are kept unchanged, because ast.literal_eval rejects non-string values
negative_reviews['side effects embedding'] = negative_reviews['side effects embedding'].progress_apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [None]:
# parse string-encoded embeddings back into lists; values that are not strings
# are kept unchanged, because ast.literal_eval rejects non-string values
negative_reviews['effectiveness embedding'] = negative_reviews['effectiveness embedding'].progress_apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [None]:
# stack the per-row embedding lists of every column into one array per column
positive_side_effects = np.vstack(positive_reviews['side effects embedding'].to_numpy())
positive_effectiveness = np.vstack(positive_reviews['effectiveness embedding'].to_numpy())

negative_side_effects = np.vstack(negative_reviews['side effects embedding'].to_numpy())
negative_effectiveness = np.vstack(negative_reviews['effectiveness embedding'].to_numpy())

In [None]:
# print the shape and contents of the four stacked embedding matrices,
# positive reviews first, then a separator line, then negative reviews
print("For Positive Reviews: ")
print(f"For side effects: {positive_side_effects.shape}")
print(positive_side_effects)
print()
print(f"For effectiveness: {positive_effectiveness.shape}")
print(positive_effectiveness)

print()
print("--" * 20)
print()

print("For Negative Reviews: ")
print(f"For side effects: {negative_side_effects.shape}")
print(negative_side_effects)
print()
print(f"For effectiveness: {negative_effectiveness.shape}")
print(negative_effectiveness)

## **DBSCAN Implementation**

In [None]:
# reduced dimensionality
def plot_explained_variance(data_scaled, name, step, max_components, random_state=None):
    """
    Plot cumulative explained variance ratio vs. number of components using TruncatedSVD.

    A separate TruncatedSVD is fitted for every tested number of components,
    and its total explained variance ratio is printed and plotted.

    Parameters:
    - data_scaled: array-like, shape (n_samples, n_features)
        The normalized data matrix.
    - name: str
        Name identifier for the dataset (used in plot titles and print statements).
    - step: int
        Step size for the number of components to test.
    - max_components: int
        Exclusive upper bound for the number of components to test: the tested
        values are 1, 1 + step, 1 + 2 * step, ... below max_components.
    - random_state: int, RandomState instance or None
        Seed passed to TruncatedSVD so the curve can be reproduced.
        None (default) leaves the run unseeded.
    """

    n_components_range = list(range(1, max_components, step))
    explained_variance_ratios = []

    for n in n_components_range:
        svd = TruncatedSVD(n_components=n, random_state=random_state)

        # only the fitted variance ratios are needed, not the transformed data
        svd.fit(data_scaled)

        total_variance = np.sum(svd.explained_variance_ratio_)
        explained_variance_ratios.append(total_variance)
        print(f"[{name}] Components: {n}, Cumulative Explained Variance: {total_variance:.4f}")

    plt.figure(figsize=(12, 6))
    plt.plot(n_components_range, explained_variance_ratios, marker='o')
    plt.title(f"Cumulative Explained Variance vs Number of Components for {name}")
    plt.xlabel("Number of Components")
    plt.ylabel("Cumulative Explained Variance Ratio")
    plt.ylim(0, 1)
    plt.grid(True)
    plt.show()

In [None]:
def dimensionality_reduction(data_scaled, n_components, random_state=None):
    """
    Perform dimensionality reduction using TruncatedSVD.

    Parameters:
    - data_scaled: array-like, shape (n_samples, n_features)
        The normalized data matrix.
    - n_components: int
        Number of components to keep.
    - random_state: int, RandomState instance or None
        Seed passed to TruncatedSVD so the reduction can be reproduced.
        None (default) leaves the run unseeded.

    Returns:
    - reduced_data: array-like, shape (n_samples, n_components)
        Dimensionally reduced data.
    """

    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    reduced_data = svd.fit_transform(data_scaled)
    print(f"Reduced data shape: {reduced_data.shape}")

    return reduced_data

In [None]:
def plot_k_distance(data, k):
    """
    Draw one sorted k-distance curve per requested neighbour count, as a
    visual aid for choosing DBSCAN's epsilon.

    Parameters:
    - data: array-like, shape (n_samples, n_features)
        Points whose neighbour distances are measured (cosine metric,
        brute-force search).
    - k: list of int
        Neighbour counts to plot; each entry produces its own figure.
    """

    for n_neighbors in k:
        print("k-distance for k = ", n_neighbors)

        # distance from every point to the last of its n_neighbors neighbours
        model = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine', algorithm='brute').fit(data)
        neighbor_distances = model.kneighbors(data)[0]
        curve = np.sort(neighbor_distances[:, -1])

        _, ax = plt.subplots(figsize=(12, 6))
        ax.plot(curve)
        ax.set_xlabel("Points sorted by distance")
        ax.set_ylabel(f"{n_neighbors}-th Nearest Neighbor Distance")
        ax.set_title("k-Distance Graph to Find Optimal Eps")
        ax.grid()
        plt.show()

In [None]:
# note: a DBI between 1 and 2 is considered acceptable here,
# because text data is normally high-dimensional;
# the silhouette score is less preferable for text data

def tune_dbscan(X, eps_values, min_samples_values):
    """
    Tune DBSCAN parameters eps and min_samples by evaluating clustering quality.

    Every (min_samples, eps) pair is clustered with cosine-distance DBSCAN.
    When at least two clusters are found, the silhouette coefficient (cosine
    metric, matching the clustering) and the Davies-Bouldin Index are computed
    on the non-noise points and printed together with the number of clusters
    and noise points. Only the Davies-Bouldin Index drives the selection.

    Parameters:
    - X: numpy.ndarray, shape (n_samples, n_features)
        Feature matrix for clustering. It is indexed with a boolean mask, so
        it must support NumPy-style boolean indexing.
    - eps_values: iterable of float
        List or range of epsilon values to try.
    - min_samples_values: iterable of int
        List or range of min_samples values to try.

    Returns:
    - best_eps: float or None
        The epsilon value that yields the best (lowest) Davies-Bouldin Index,
        or None if no setting produced at least two clusters.
    - best_score: float
        The best Davies-Bouldin Index found (inf if none was computed).
    - best_min_samples: int or None
        The min_samples value that yields the best Davies-Bouldin Index,
        or None if no setting produced at least two clusters.
    """

    # materialize once so a one-shot iterator is reused for every min_samples
    eps_grid = list(eps_values)

    best_eps = None
    best_score = float('inf')
    best_min_samples = None

    for min_samples in min_samples_values:
        for eps in eps_grid:
            print(f"Testing min_samples = {min_samples}, eps = {eps}")
            dbscan = DBSCAN(eps=eps, min_samples=min_samples, n_jobs=-1, metric="cosine")
            labels = dbscan.fit_predict(X)

            print("Done Clustering")

            # DBSCAN marks noise points with the label -1
            clustered = labels != -1
            n_clusters = len(set(labels[clustered]))
            noise_points = int((~clustered).sum())

            if n_clusters >= 2:

                # exclude noise points
                X_clustered = X[clustered]
                labels_clustered = labels[clustered]

                sil_score = silhouette_score(X_clustered, labels_clustered, metric="cosine")
                print(f"Silhouette Coefficients: {sil_score:.4f}")
                db_index = davies_bouldin_score(X_clustered, labels_clustered)
                print(f"DBI: {db_index:.4f}")

                # lower DBI is better
                if db_index < best_score:
                    best_score = db_index
                    best_eps = eps
                    best_min_samples = min_samples

                # how many clusters formed
                print("Clustered labels: ", n_clusters)
                # how many number of noise points
                print("Noise points: ", noise_points)

            else:
                print("No clusters formed or all points are noise.")

            print()

    return best_eps, best_score, best_min_samples

In [None]:
def cluster_and_insert_labels(df, features, eps, min_samples, label_column_name):
    """
    Apply DBSCAN clustering and insert cluster labels into the dataframe.

    The labels are written into df in place (noise points get the label -1).
    When at least two clusters are found, the silhouette coefficient (cosine
    metric, matching the clustering) and the Davies-Bouldin Index of the
    non-noise points are printed, followed by the cluster and noise counts.

    Parameters:
    - df: pandas.DataFrame
        Original dataframe to insert cluster labels. It is modified in place.
    - features: numpy.ndarray, shape (n_samples, n_features)
        Feature matrix used for clustering. It is indexed with a boolean mask,
        so it must support NumPy-style boolean indexing.
    - eps: float
        Epsilon parameter for DBSCAN.
    - min_samples: int
        Minimum samples parameter for DBSCAN.
    - label_column_name: str
        Name of the column to store cluster labels in the dataframe.

    Returns:
    - df: pandas.DataFrame
        The same DataFrame object, with the cluster label column added.
    """

    dbscan = DBSCAN(eps=eps, min_samples=min_samples, n_jobs=-1, metric='cosine')
    labels = dbscan.fit_predict(features)

    # insert labels into the original DataFrame
    df[label_column_name] = labels

    # DBSCAN marks noise points with the label -1
    clustered = labels != -1
    n_clusters = len(set(labels[clustered]))

    if n_clusters >= 2:

        # exclude noise points
        features_clustered = features[clustered]
        labels_clustered = labels[clustered]

        sil_score = silhouette_score(features_clustered, labels_clustered, metric='cosine')
        print(f"Silhouette Coefficients: {sil_score:.4f}")
        db_index = davies_bouldin_score(features_clustered, labels_clustered)
        print(f"DBI: {db_index:.4f}")

        # how many clusters formed (noise is not counted as a cluster)
        print("Clustered labels: ", n_clusters)
        # how many number of noise points
        print("Noise points: ", int((~clustered).sum()))

    else:
        print(f"[{label_column_name}] Too few clusters to compute DBI")

    return df

### **Positive Side Effects**

In [None]:
# explained-variance curve for 1, 201, ..., 1401 SVD components
plot_explained_variance(positive_side_effects, "Positive Side Effects", 200, 1600)

In [None]:
# keep 600 SVD components; this overwrites the original matrix, so re-running
# the cell would reduce the already-reduced data again
positive_side_effects = dimensionality_reduction(positive_side_effects, 600)

In [None]:
# k-distance plots for k = 10, 50 and 100, used to choose candidate eps values
plot_k_distance(positive_side_effects, k = [10, 50, 100])

In [None]:
# first grid search; the cell output is the returned (best_eps, best_score, best_min_samples)
tune_dbscan(positive_side_effects, eps_values = [0.025, 0.05, 0.075, 0.1, 0.15, 0.175, 0.2], min_samples_values = [10, 50, 100])

In [None]:
# second grid search over a narrower eps range (0.025-0.04) with min_samples 80-100
tune_dbscan(positive_side_effects, eps_values = [0.025, 0.03, 0.035, 0.04], min_samples_values = [80, 90, 100])

In [None]:
# final clustering with min_samples=100 and eps=0.025, the setting judged to work better;
# the labels are stored in the 'side effects labels' column
positive_reviews = cluster_and_insert_labels(
    df=positive_reviews,
    features=positive_side_effects,
    eps=0.025,
    min_samples=100,
    label_column_name='side effects labels'
)

print("The dimension of positive reviews:", positive_reviews.shape)
positive_reviews.head()

### **Positive Effectiveness**

In [None]:
# explained-variance curve for the positive effectiveness embeddings
# (this section analyses positive_effectiveness, not the side-effects matrix)
plot_explained_variance(positive_effectiveness, "Positive Effectiveness", 200, 1600)

In [None]:
# keep 600 SVD components; this overwrites the original matrix, so re-running
# the cell would reduce the already-reduced data again
positive_effectiveness = dimensionality_reduction(positive_effectiveness, 600)

In [None]:
# k-distance plots for k = 10, 50 and 100, used to choose candidate eps values
plot_k_distance(positive_effectiveness, k = [10, 50, 100])

In [None]:
# first grid search; the cell output is the returned (best_eps, best_score, best_min_samples)
tune_dbscan(positive_effectiveness, eps_values = [0.05, 0.075, 0.095, 0.15, 0.175, 0.2], min_samples_values = [10, 50, 100])

In [None]:
# second grid search over a narrower eps range (0.04-0.06) with min_samples 20-40
tune_dbscan(positive_effectiveness, eps_values = [0.04, 0.045, 0.05, 0.055, 0.06], min_samples_values = [20, 30, 40])

In [None]:
# final clustering with min_samples=40 and eps=0.04, the setting judged to work better;
# the labels are stored in the 'effectiveness labels' column
positive_reviews = cluster_and_insert_labels(
    df=positive_reviews,
    features=positive_effectiveness,
    eps=0.04,
    min_samples=40,
    label_column_name='effectiveness labels'
)

print("The dimension of positive reviews:", positive_reviews.shape)
positive_reviews.head()

### **Negative Side Effects**

In [None]:
# explained-variance curve for 1, 201, ..., 1401 SVD components
plot_explained_variance(negative_side_effects, "Negative Side Effects", 200, 1600)

In [None]:
# keep 600 SVD components; this overwrites the original matrix, so re-running
# the cell would reduce the already-reduced data again
negative_side_effects = dimensionality_reduction(negative_side_effects, 600)

In [None]:
# k-distance plots for k = 10, 50 and 100, used to choose candidate eps values
plot_k_distance(negative_side_effects, k = [10, 50, 100])

In [None]:
# first grid search; the cell output is the returned (best_eps, best_score, best_min_samples)
tune_dbscan(negative_side_effects, eps_values = [0.025, 0.05, 0.075, 0.1, 0.15, 0.175, 0.2], min_samples_values = [10, 50, 100])

In [None]:
# second grid search over a narrower eps range (0.025-0.045) with min_samples 30 and 70
tune_dbscan(negative_side_effects, eps_values = [0.025, 0.03, 0.035, 0.04, 0.045], min_samples_values = [30, 70])

In [None]:
# final clustering with min_samples=30 and eps=0.025;
# the labels are stored in the 'side effects labels' column
negative_reviews = cluster_and_insert_labels(
    df=negative_reviews,
    features=negative_side_effects,
    eps=0.025,
    min_samples=30,
    label_column_name='side effects labels'
)

print("The dimension of negative reviews:", negative_reviews.shape)
negative_reviews.head()

### **Negative Effectiveness**

In [None]:
# explained-variance curve for 1, 201, ..., 1401 SVD components
plot_explained_variance(negative_effectiveness, "Negative Effectiveness", 200, 1600)

In [None]:
# keep 650 SVD components (the other three matrices keep 600); this overwrites the
# original matrix, so re-running the cell would reduce the already-reduced data again
negative_effectiveness = dimensionality_reduction(negative_effectiveness, 650)

In [None]:
# k-distance plots for k = 10, 50 and 100, used to choose candidate eps values
plot_k_distance(negative_effectiveness, k = [10, 50, 100])

In [None]:
# first grid search; the cell output is the returned (best_eps, best_score, best_min_samples)
tune_dbscan(negative_effectiveness, eps_values = [0.05, 0.075, 0.1, 0.1125, 0.15, 0.175, 0.2, 0.225], min_samples_values = [10, 50, 100])

In [None]:
# second grid search over a narrower eps range (0.04-0.065) with min_samples 20-50
tune_dbscan(negative_effectiveness, eps_values = [0.04, 0.045, 0.05, 0.055, 0.06, 0.065], min_samples_values = [20, 30, 40, 50])

In [None]:
# final clustering with min_samples=50 and eps=0.055;
# the labels are stored in the 'effectiveness labels' column
negative_reviews = cluster_and_insert_labels(
    df=negative_reviews,
    features=negative_effectiveness,
    eps=0.055,
    min_samples=50,
    label_column_name='effectiveness labels'
)

print("The dimension of negative reviews:", negative_reviews.shape)
negative_reviews.head()

## **Join the Two Datasets Together**


In [None]:
# tag every row with the subset it belongs to, then report both shapes
positive_reviews['sentiment'] = "Positive"
negative_reviews['sentiment'] = "Negative"
print("The dimension of positive reviews:", positive_reviews.shape)
print("The dimension of negative reviews:", negative_reviews.shape)

In [None]:
# total number of missing cells in each frame (positive first, then negative)
print(positive_reviews.isnull().sum().sum())
print(negative_reviews.isnull().sum().sum())

In [None]:
# preview the positive reviews before joining
positive_reviews.head()

In [None]:
# preview the negative reviews before joining
negative_reviews.head()

In [None]:
# join the positive and negative reviews
# stack the positive and negative reviews, renumber the rows
# and drop the raw "review" text column
drug_reviews = (
    pd.concat([positive_reviews, negative_reviews], ignore_index=True)
    .reset_index(drop=True)
    .drop(columns=['review'])
)

print("The dimension of drug reviews:", drug_reviews.shape)
drug_reviews.head()

In [None]:
# total number of missing cells in the joined frame
drug_reviews.isnull().sum().sum()

In [None]:
# save the joined dataset to a CSV file in the working directory, without the index column
drug_reviews.to_csv('drug reviews embedding with labels.csv', index=False)