In [None]:
# call the library

# to generate data processing and visualization tools
import matplotlib.pyplot as plt
import seaborn as sns

# text processing and visualization
import nltk
import string
from wordcloud import WordCloud
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# **Fetch Dataset From UCI Machine Learning Repository**

The code for fetching the dataset was adapted from: <br>
Kallumadi, S. & Gräßer, F. (2018). Drug Reviews (Drugs.com) [Dataset]. UCI Machine Learning Repository. https://doi.org/10.24432/C5SK5S.

<br>

The dataset consists of **215063** drug reviews from patients. <br>
The dataset consists of the features **drugName, condition, review, rating, date, usefulCount**.

In [None]:
# install the ucimlrepo package to access UCI Machine Learning Repository datasets
! pip install ucimlrepo

In [None]:
# Data source:
#   Kallumadi, S. & Gräßer, F. (2018). Drug Reviews (Drugs.com) [Dataset].
#   UCI Machine Learning Repository. https://doi.org/10.24432/C5SK5S.

from ucimlrepo import fetch_ucirepo

# repository id passed to fetch_ucirepo for the dataset cited above
UCI_DATASET_ID = 462

# download the dataset object (data and accompanying information) from the repository
drug_reviews_drugs_com = fetch_ucirepo(id=UCI_DATASET_ID)

In [None]:
# get data
# Take an independent copy: later cells add new columns to `df`, and writing
# into a frame that is still tied to the fetched dataset object can raise
# pandas' SettingWithCopyWarning and leak changes into that object
# (presumably `features` is a column subset of the full table — TODO confirm).
df = drug_reviews_drugs_com.data.features.copy()

# view dataset
df.head()

# **Exploratory Data Analysis**

In this section, we **explore the distribution and characteristics** of the dataset, including patient ratings, usefulness of reviews, health conditions, and drug names. This helps in **identifying patterns, outliers, and overall trends** before proceeding to deeper analysis.

<br>

Additionally, we **perform text processing and normalization** on the review texts to prepare the data for sentiment analysis. This involves cleaning the text, removing noise, and extracting sentiment polarity and categories. The processed sentiments are then used to **generate insightful visualizations** such as word clouds, highlighting key themes in positive, neutral, and negative reviews.



In [None]:
# get dimension of dataset as (rows, columns)
print(df.shape)

# get the number of reviews and features
# (the values are interpolated into the f-strings; the printed text is the
# same as passing them to print() as separate arguments)
n_reviews = df.shape[0]
print(f"The dataset consists of {n_reviews} drug reviews from patient.")
print(f"The dataset consists of the features {', '.join(df.columns)}")

## **Numerical Data Type Exploration**

**1. Rating Distribution** <br>
The **rating values range from 1 to 10**, where 1 means the patient was dissatisfied with the drug and 10 means the patient was satisfied with it. In the figure below, **rating 10 has the highest frequency**, indicating that the majority of patients had a good experience with their drug. <br>

**2. Useful Count Distribution** <br>
The usefulCount column represents **the number of people who found a review helpful**. Although the boxplot shows many values as outliers due to their high counts, these **outliers are real and valid data points** rather than errors. They indicate particularly influential or high-quality reviews that many users found useful. Therefore, these outliers reflect the true distribution of user engagement and should be interpreted as valuable insights rather than anomalies.

In [None]:
# tally how many reviews were given each rating value
counts = df['rating'].value_counts()

# bar chart of the rating frequencies, one bar and one tick per rating value
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(counts.index, counts.values, color='blue', edgecolor='black')
ax.set_xlabel('Rating')
ax.set_ylabel('Frequency')
ax.set_title('Histogram of Rating')
ax.set_xticks(counts.index)
plt.show()

In [None]:
# boxplot of usefulCount to expose its extreme values
ax = sns.boxplot(y=df['usefulCount'])
ax.set_title('Boxplot of usefulCount')
ax.set_ylabel('usefulCount')
plt.show()

# note: replacing the outliers with the mean or the median was tried,
# but the result no longer matches the original distribution (it has 2 peaks)

In [None]:
# distinct rating values in ascending order, used as the x-axis ticks
rating_count = sorted(df['rating'].unique())

# scatter plot of 'usefulCount' against 'rating'
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df['rating'], df['usefulCount'], alpha=0.5)
ax.set_title('Scatter Plot of Rating vs Useful Count')
ax.set_xlabel('Rating')
ax.set_ylabel('Useful Count')
ax.set_xticks(rating_count)

# render the figure
plt.show()

## **Object Data Type Exploration**

**1. Condition Distribution** <br>
A **total of 916 conditions** appear in the dataset. A condition is the **specific health issue** for which the patient used the drug. Among the 916 conditions, birth control has the highest frequency at 38436. <br>

**2. Drug Name Distribution** <br>
A **total of 3671 drugs** appear in the dataset. The drug name is the **drug used** by the patient. Among the 3671 drugs, levonorgestrel has the highest frequency at 4930.

In [None]:
# number of reviews per condition
counts = df['condition'].value_counts()

# first 10 entries of the frequency table, i.e. the most frequent conditions
top_10_conditions = counts.head(10)

# bar chart of those 10 conditions
ax = top_10_conditions.plot.bar(figsize=(12, 6), color='blue', edgecolor='black')
ax.set_title('Frequency of Conditions')
ax.set_xlabel('Condition')
ax.set_ylabel('Count')
plt.show()

In [None]:
# number of reviews per condition
counts = df['condition'].value_counts()

# last 10 entries of the frequency table, i.e. the least frequent conditions
least_10_conditions = counts.tail(10)

# bar chart of the 10 least frequent conditions
ax = least_10_conditions.plot.bar(figsize=(12, 6), color='blue', edgecolor='black')
ax.set_title('Frequency of Conditions')
ax.set_xlabel('Least 10 Condition')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Recompute the condition frequencies inside this cell instead of reading the
# shared name `counts`: other cells rebind `counts` to rating and drug
# frequencies, so this cell would silently report the wrong column when the
# notebook is run out of order.
condition_counts = df['condition'].value_counts()

# keep only the names of the conditions that occur exactly once
one_frequency_conditions = condition_counts[condition_counts == 1].index.tolist()
print("The number of conditions with only one count: ", len(one_frequency_conditions))
print(one_frequency_conditions)

In [None]:
# number of reviews per drug name
counts = df['drugName'].value_counts()

# first 10 entries of the frequency table, i.e. the most frequent drugs
top_10_drugs = counts.head(10)

# bar chart of those 10 drugs
ax = top_10_drugs.plot.bar(figsize=(12, 6), color='blue', edgecolor='black')
ax.set_title('Frequency of Drug')
ax.set_xlabel('Drug')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Recompute the drug frequencies inside this cell instead of reading the
# shared name `counts`, which other cells rebind to rating and condition
# frequencies (the plot would then show the wrong column on an out-of-order run).
drug_counts = df['drugName'].value_counts()

# last 10 entries of the frequency table, i.e. the least frequent drugs
least_10_drugs = drug_counts.tail(10)

# plot the bar chart for the 10 least frequent drugs
least_10_drugs.plot(kind='bar', figsize=(12, 6), color='blue', edgecolor='black')
plt.title('Frequency of Drug')
plt.xlabel('Least 10 Drug')
plt.ylabel('Count')
plt.show()

In [None]:
# Recompute the drug frequencies inside this cell so the result does not
# depend on which cell last assigned the shared name `counts`.
drug_counts = df['drugName'].value_counts()

# keep only the names of the drugs that occur exactly once
one_frequency_drugName = drug_counts[drug_counts == 1].index.tolist()

# the message names drugs (it previously said "conditions", copied from the condition cell)
print("The number of drugs with only one count: ", len(one_frequency_drugName))
print(one_frequency_drugName)

## **Textual Data Exploration**

In [None]:
# download the NLTK resources requested by this notebook
for resource_name in ('punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource_name)

In [None]:
# shared NLP helpers used by the text-processing function
stemmer = PorterStemmer()                      # Porter stemmer instance
stop_words = set(stopwords.words('english'))   # NLTK English stopword list, as a set

### **Text Processing and Normalization**

In [None]:
# text processing function to clean and normalize review text

# individual punctuation characters; a token is dropped when it is made up
# entirely of these characters
_PUNCT_CHARS = frozenset(string.punctuation)


def text_processing(text):
    """
    Clean and normalize one raw review.

    Steps, in order:
    - lowercasing
    - tokenization with ``word_tokenize``
    - removal of tokens that consist only of punctuation characters
    - removal of tokens found in ``stop_words``
    - stemming with ``stemmer``

    Args:
        text (str): Raw text input. Any non-string value (for example the
            None/NaN that stands for a missing review) is treated as empty.

    Returns:
        str: The remaining stemmed tokens joined by single spaces; an empty
        string when nothing remains or when the input is not a string.
    """

    # missing values are not strings and have no .lower(); return empty text
    if not isinstance(text, str):
        return ""

    # lowercase, then tokenize
    tokens = word_tokenize(text.lower())

    # Punctuation and stopword removal in one pass.
    # The punctuation check is per character: testing the whole token with
    # `token in string.punctuation` is a substring test, which lets
    # multi-character punctuation tokens such as "..." pass through.
    kept = [
        token for token in tokens
        if token not in stop_words
        and not all(char in _PUNCT_CHARS for char in token)
    ]

    # stemming
    return " ".join(stemmer.stem(token) for token in kept)

In [None]:
# sentiment polarity of a piece of text, computed with TextBlob
def getPolarity(review_text):
    """
    Return the TextBlob sentiment polarity of ``review_text``.

    The score is a float in the range [-1.0, 1.0]: negative values indicate
    negative sentiment, 0.0 is neutral, positive values indicate positive
    sentiment.

    Args:
        review_text (str): The review text

    Returns:
        float: Sentiment polarity score
    """
    # the sentiment result unpacks into (polarity, subjectivity);
    # only the polarity is returned
    polarity, _subjectivity = TextBlob(review_text).sentiment
    return polarity

In [None]:
# map a polarity score onto one of three sentiment labels
def getAnalysis(score):
    """
    Turn a sentiment polarity score into a descriptive label.

    Args:
        score (float): Sentiment polarity score from TextBlob

    Returns:
        str: "Negative" when the score is below zero, "Neutral" when it is
        exactly zero, "Positive" otherwise
    """
    if score < 0:
        return "Negative"
    return "Neutral" if score == 0 else "Positive"

In [None]:
# run every raw review through text_processing and store the result as a new column
cleaned_reviews = df['review'].apply(text_processing)
df['cleaned review'] = cleaned_reviews
df.head()

In [None]:
# Score the ORIGINAL review text rather than the 'cleaned review' column.
# TextBlob's default analyzer looks words up in a sentiment lexicon and takes
# negations and intensifiers into account; the cleaned text has been stemmed
# (e.g. "terrible" -> "terribl") and stripped of stopwords such as "not" and
# "very", so most sentiment-bearing words are no longer recognised and
# negated statements change sign.
# fillna/astype keep getPolarity from receiving a non-string for a missing review.
raw_reviews = df['review'].fillna('').astype(str)
df['TextBlob_Polarity'] = raw_reviews.apply(getPolarity)
df.head()

In [None]:
# label every review as Negative / Neutral / Positive from its polarity score
sentiment_labels = df['TextBlob_Polarity'].apply(getAnalysis)
df["Sentiment Category"] = sentiment_labels
df.head()

### **Generate Word Cloud**

In [None]:
# the two columns needed to group the cleaned reviews by sentiment
sentiment = df['Sentiment Category']
cleaned = df['cleaned review']

# one long space-separated string per sentiment category
positive_reviews = ' '.join(cleaned[sentiment == 'Positive'])
negative_reviews = ' '.join(cleaned[sentiment == 'Negative'])
neutral_reviews = ' '.join(cleaned[sentiment == 'Neutral'])

# whitespace-split each string into its list of words
positive_words = positive_reviews.split()
negative_words = negative_reviews.split()
neutral_words = neutral_reviews.split()

In [None]:
# vocabulary (distinct words) of each sentiment category
positive_word_set = set(positive_words)
negative_word_set = set(negative_words)
neutral_word_set = set(neutral_words)

# words that occur in all three categories
common_words = positive_word_set.intersection(negative_word_set, neutral_word_set)


def _drop_common(words):
    """Return `words` without the entries found in `common_words`, order kept."""
    return [word for word in words if word not in common_words]


# keep only the words that are not shared by all three categories
positive_words = _drop_common(positive_words)
negative_words = _drop_common(negative_words)
neutral_words = _drop_common(neutral_words)

In [None]:
# Join the words back into one text per sentiment.
# The texts get their own names: rebinding positive_words / negative_words /
# neutral_words from a list to a string would make a re-run of this cell join
# the single characters of that string instead of the words.
positive_text = ' '.join(positive_words)
negative_text = ' '.join(negative_words)
neutral_text = ' '.join(neutral_words)

# seed for the word-cloud layout so the figures are the same on every run
WORDCLOUD_SEED = 42

# generate word clouds
wordcloud_positive = WordCloud(width=800, height=400, background_color='white',
                               random_state=WORDCLOUD_SEED).generate(positive_text)
wordcloud_negative = WordCloud(width=800, height=400, background_color='white',
                               random_state=WORDCLOUD_SEED).generate(negative_text)
wordcloud_neutral = WordCloud(width=800, height=400, background_color='white',
                              random_state=WORDCLOUD_SEED).generate(neutral_text)

# one figure per sentiment category, drawn in the same way
clouds_by_title = (
    ('Positive Reviews', wordcloud_positive),
    ('Negative Reviews', wordcloud_negative),
    ('Neutral Reviews', wordcloud_neutral),
)
for title, cloud in clouds_by_title:
    plt.figure(figsize=(10, 6))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title, fontsize=16)

plt.show()