# 04 - Generate Streamlit Options

This notebook analyzes the drug review data and generates a JSON schema file for the Streamlit app.

In [None]:
# Mount Google Drive at /content/drive so the project files stored there can be
# reached through ordinary file paths.
# NOTE(review): the google.colab package is presumably only importable inside a
# Google Colab runtime; running this cell elsewhere is expected to fail on the import.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Project root on the mounted Google Drive; the other paths used in this
# notebook are built from it.
base_folder = "/content/drive/MyDrive/Colab Notebooks/drug_review_classification"

# Location of the SQLite database that holds the drug review tables.
db_path = base_folder + "/data/drug_reviews.db"

In [None]:
import pandas as pd
import sqlite3
import json

# Load data from database
#
# Each review row is joined to its drug, condition, side-effect and
# effectiveness lookup tables, and the name columns are aliased to
# urlDrugName / condition / sideEffects / effectiveness.
reviews_query = """
    SELECT
        d.drug_name as urlDrugName,
        c.condition_name as condition,
        r.rating,
        s.side_effect_name as sideEffects,
        e.effectiveness_name as effectiveness
    FROM reviews r
    JOIN drugs d ON r.drug_id = d.drug_id
    JOIN conditions c ON r.condition_id = c.condition_id
    JOIN side_effects s ON r.side_effect_id = s.side_effect_id
    JOIN effectiveness_levels e ON r.effectiveness_id = e.effectiveness_id
"""

conn = sqlite3.connect(db_path)
try:
    df = pd.read_sql(reviews_query, conn)
finally:
    # Close the connection even when the query raises (e.g. a missing table);
    # otherwise the handle stays open for the lifetime of the kernel.
    # try/finally is used because sqlite3's `with conn:` only manages the
    # transaction and does not close the connection.
    conn.close()

print(f"Loaded {len(df)} reviews")
df.head()

In [None]:
# Print the run banner: a rule, the title, and a closing rule, one per line.
print(
    "=" * 80,
    "ANALYZING DRUG REVIEW DATA FOR STREAMLIT APP",
    "=" * 80,
    sep="\n",
)

# Schema skeleton: one section per feature kind, filled in by the code that follows.
data_schema = dict(numerical={}, categorical={})

# Numerical features: store min / max / mean / median for each numeric column
# as plain Python floats under data_schema["numerical"].
print("\n" + "-" * 80, "NUMERICAL FEATURES", "-" * 80, sep="\n")

for column in ('rating',):
    series = df[column]
    stats = {
        "min": float(series.min()),
        "max": float(series.max()),
        "mean": float(series.mean()),
        "median": float(series.median()),
    }
    data_schema["numerical"][column] = stats

    # One summary line per column; only the mean is rounded for display.
    print(
        f"{column}: min={stats['min']}, max={stats['max']}, "
        f"mean={stats['mean']:.2f}, median={stats['median']}"
    )

# Categorical features: for each column store the distinct values and how often
# each one occurs under data_schema["categorical"].
print("\n" + "-"*80)
print("CATEGORICAL FEATURES")
print("-"*80)

for feature in ['urlDrugName', 'condition', 'sideEffects', 'effectiveness']:
    # Drop missing values before computing either entry. unique() keeps
    # None/NaN while value_counts() excludes them by default, so without this
    # the two entries could disagree, and a missing value would be written to
    # the JSON file as null / the non-standard NaN token and appear as an option.
    observed = df[feature].dropna()
    unique_values = observed.unique().tolist()
    # Cast counts to built-in int so json can always encode them.
    value_counts = {value: int(count) for value, count in observed.value_counts().items()}

    data_schema["categorical"][feature] = {
        "unique_values": unique_values,
        "value_counts": value_counts
    }

    # Console summary: the five most frequent values (value_counts is ordered
    # by descending count) with their share of all loaded reviews.
    print(f"\n{feature}:")
    print(f"  Unique values: {len(unique_values)}")
    for v, c in list(value_counts.items())[:5]:
        print(f"    {v}: {c} ({c/len(df)*100:.1f}%)")
    if len(value_counts) > 5:
        print(f"    ...and {len(value_counts)-5} more")

In [None]:
# Save schema to JSON file
output_file = f"{base_folder}/data/data_schema.json"

# Serialize before opening the file: if data_schema contains a value json
# cannot encode, the error is raised here and an existing data_schema.json is
# left untouched instead of being truncated to a partial document.
schema_json = json.dumps(data_schema, indent=2)
with open(output_file, 'w', encoding='utf-8') as f:
    f.write(schema_json)

print("\n" + "="*80)
print(f"Data schema saved to: {output_file}")
print("="*80)

# Display a preview of the schema; the ellipsis is appended only when the
# text was actually cut off.
preview_limit = 2000
print("\nGenerated schema:")
print(schema_json[:preview_limit] + ("..." if len(schema_json) > preview_limit else ""))

In [None]:
# Closing banner: blank line, rule, completion message, rule.
print(
    "\n" + "=" * 80,
    "DONE! Use data_schema.json in your Streamlit app",
    "=" * 80,
    sep="\n",
)