# 04 - Generate Streamlit Options

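This notebook loads the drug review data from the SQLite database, summarizes the numerical feature (`rating`) and the categorical features (drug name, condition, side effects, effectiveness), and saves the result as a JSON schema file (`data/data_schema.json`) for the Streamlit app.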

In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so that files
# stored on Drive can be accessed through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Project root on the mounted Google Drive; the paths below are built from it.
base_folder = "/content/drive/MyDrive/Colab Notebooks/drug_review_classification"
# Location of the SQLite database file inside the project's data folder.
db_path = f"{base_folder}/data/drug_reviews.db"

In [3]:
import pandas as pd
import sqlite3
import json

# Load data from database
#
# Each review is joined to its lookup tables so the foreign keys (drug,
# condition, side effect, effectiveness) come back as readable names. The
# columns are aliased to the feature names used when building the schema.
# These are inner joins, so a review whose key has no match in a lookup table
# is not returned.
reviews_query = """
    SELECT
        d.drug_name as urlDrugName,
        c.condition_name as condition,
        r.rating,
        s.side_effect_name as sideEffects,
        e.effectiveness_name as effectiveness
    FROM reviews r
    JOIN drugs d ON r.drug_id = d.drug_id
    JOIN conditions c ON r.condition_id = c.condition_id
    JOIN side_effects s ON r.side_effect_id = s.side_effect_id
    JOIN effectiveness_levels e ON r.effectiveness_id = e.effectiveness_id
"""

conn = sqlite3.connect(db_path)
try:
    df = pd.read_sql(reviews_query, conn)
finally:
    # Release the database handle even when the query fails. Using the
    # connection as a `with` block would not do this: sqlite3's context
    # manager only commits or rolls back, it does not close.
    conn.close()

print(f"Loaded {len(df)} reviews")
df.head()

Loaded 3000 reviews


Unnamed: 0,urlDrugName,condition,rating,sideEffects,effectiveness
0,Zoloft,Depression,8.0,Severe Side Effects,Moderately Effective
1,Lisinopril,Pain,9.0,Moderate Side Effects,Moderately Effective
2,Synthroid,Type 2 Diabetes,10.0,Moderate Side Effects,Moderately Effective
3,Ambien,Insomnia,9.0,Severe Side Effects,Considerably Effective
4,Zoloft,Pain,10.0,Mild Side Effects,Considerably Effective


In [4]:
# Analyze data and create schema
#
# data_schema has two sections:
#   "numerical":   feature -> {"min", "max", "mean", "median"}
#   "categorical": feature -> {"unique_values": [...], "value_counts": {...}}
print("="*80)
print("ANALYZING DRUG REVIEW DATA FOR STREAMLIT APP")
print("="*80)

data_schema = {"numerical": {}, "categorical": {}}

# Denominator for the percentage shown next to each category count.
total_reviews = len(df)

# Numerical features
print("\n" + "-"*80)
print("NUMERICAL FEATURES")
print("-"*80)

for feature in ['rating']:
    column = df[feature]
    # float() turns the pandas/NumPy results into plain Python floats before
    # they are stored in the schema.
    stats = {
        "min": float(column.min()),
        "max": float(column.max()),
        "mean": float(column.mean()),
        "median": float(column.median()),
    }
    data_schema["numerical"][feature] = stats
    print(f"{feature}: min={stats['min']}, max={stats['max']}, mean={stats['mean']:.2f}, median={stats['median']}")

# Categorical features
print("\n" + "-"*80)
print("CATEGORICAL FEATURES")
print("-"*80)

for feature in ['urlDrugName', 'condition', 'sideEffects', 'effectiveness']:
    column = df[feature]
    # value_counts() skips missing values by default, whereas unique() keeps
    # them. Drop them here as well so both entries describe the same set of
    # categories and no null option ends up in the schema.
    unique_values = column.dropna().unique().tolist()
    value_counts = column.value_counts().to_dict()

    data_schema["categorical"][feature] = {
        "unique_values": unique_values,
        "value_counts": value_counts
    }

    print(f"\n{feature}:")
    print(f"  Unique values: {len(unique_values)}")
    # value_counts is ordered most frequent first, so this prints the top five.
    for v, c in list(value_counts.items())[:5]:
        print(f"    {v}: {c} ({c/total_reviews*100:.1f}%)")
    if len(value_counts) > 5:
        print(f"    ...and {len(value_counts)-5} more")

ANALYZING DRUG REVIEW DATA FOR STREAMLIT APP

--------------------------------------------------------------------------------
NUMERICAL FEATURES
--------------------------------------------------------------------------------
rating: min=1.0, max=10.0, mean=5.55, median=6.0

--------------------------------------------------------------------------------
CATEGORICAL FEATURES
--------------------------------------------------------------------------------

urlDrugName:
  Unique values: 10
    Lipitor: 319 (10.6%)
    Ambien: 315 (10.5%)
    Metformin: 311 (10.4%)
    Synthroid: 304 (10.1%)
    Cymbalta: 298 (9.9%)
    ...and 5 more

condition:
  Unique values: 7
    High Blood Pressure: 443 (14.8%)
    Insomnia: 439 (14.6%)
    High Cholesterol: 435 (14.5%)
    Type 2 Diabetes: 424 (14.1%)
    Pain: 423 (14.1%)
    ...and 2 more

sideEffects:
  Unique values: 5
    Moderate Side Effects: 1072 (35.7%)
    Mild Side Effects: 867 (28.9%)
    Severe Side Effects: 621 (20.7%)
    No Side Ef

In [5]:
# Save schema to JSON file
output_file = f"{base_folder}/data/data_schema.json"

# Serialize before opening the file: opening in 'w' mode truncates it, so if
# the schema turned out not to be serializable an existing data_schema.json
# would otherwise be left empty or half-written. The same text is reused for
# the preview below instead of serializing a second time.
schema_json = json.dumps(data_schema, indent=2)
with open(output_file, 'w', encoding='utf-8') as f:
    f.write(schema_json)

print("\n" + "="*80)
print(f"Data schema saved to: {output_file}")
print("="*80)

# Display the schema
# Only the first preview_chars characters are shown to keep the cell output
# short; the ellipsis is added only when something was actually cut off.
preview_chars = 2000
schema_preview = schema_json[:preview_chars]
if len(schema_json) > preview_chars:
    schema_preview += "..."

print("\nGenerated schema:")
print(schema_preview)


Data schema saved to: /content/drive/MyDrive/Colab Notebooks/drug_review_classification/data/data_schema.json

Generated schema:
{
  "numerical": {
    "rating": {
      "min": 1.0,
      "max": 10.0,
      "mean": 5.552,
      "median": 6.0
    }
  },
  "categorical": {
    "urlDrugName": {
      "unique_values": [
        "Zoloft",
        "Lisinopril",
        "Synthroid",
        "Ambien",
        "Lyrica",
        "Metformin",
        "Lexapro",
        "Prozac",
        "Lipitor",
        "Cymbalta"
      ],
      "value_counts": {
        "Lipitor": 319,
        "Ambien": 315,
        "Metformin": 311,
        "Synthroid": 304,
        "Cymbalta": 298,
        "Lyrica": 293,
        "Lexapro": 291,
        "Lisinopril": 291,
        "Prozac": 290,
        "Zoloft": 288
      }
    },
    "condition": {
      "unique_values": [
        "Depression",
        "Pain",
        "Type 2 Diabetes",
        "Insomnia",
        "High Blood Pressure",
        "Anxiety",
        "High Chol

In [6]:
# Closing banner: a blank line, then the message framed by two 80-character rules.
rule = "="*80
print("\n" + rule, "DONE! Use data_schema.json in your Streamlit app", rule, sep="\n")


DONE! Use data_schema.json in your Streamlit app
