# 01 - Create Drug Review Database

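This notebook builds a **SQLite database normalized to third normal form (3NF)** from the Drug Reviews dataset. If the raw TSV files are not found in the `datasets` folder, it falls back to randomly generated sample reviews so the rest of the notebook can still run.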

In [1]:
# Mount Google Drive at /content/drive.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime — confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Project root on the mounted Drive
base_folder = "/content/drive/MyDrive/Colab Notebooks/drug_review_classification"

import os

# Create the root first, then each working sub-folder beneath it.
# exist_ok=True makes re-running this cell harmless.
for subfolder in ("", "/data", "/models", "/datasets"):
    os.makedirs(f"{base_folder}{subfolder}", exist_ok=True)
print(f"Base folder: {base_folder}")

Base folder: /content/drive/MyDrive/Colab Notebooks/drug_review_classification


In [3]:
import pandas as pd
import numpy as np
import sqlite3
from pathlib import Path

# Tell the reader where to get the raw files and where to put them
print(
    "Download the Drug Reviews dataset from:",
    "https://archive.ics.uci.edu/dataset/461/drug+review+dataset+druglib+com",
    f"\nPlace drugLibTrain_raw.tsv and drugLibTest_raw.tsv in: {base_folder}/datasets",
    sep="\n",
)

Download the Drug Reviews dataset from:
https://archive.ics.uci.edu/dataset/461/drug+review+dataset+druglib+com

Place drugLibTrain_raw.tsv and drugLibTest_raw.tsv in: /content/drive/MyDrive/Colab Notebooks/drug_review_classification/datasets


In [4]:
# Use the raw TSV files when both are present; otherwise synthesize a sample
datasets_path = f"{base_folder}/datasets"
train_path = f"{datasets_path}/drugLibTrain_raw.tsv"
test_path = f"{datasets_path}/drugLibTest_raw.tsv"

if not (os.path.exists(train_path) and os.path.exists(test_path)):
    print("Creating sample data...")
    np.random.seed(42)
    n = 3000
    drugs = ['Lipitor', 'Prozac', 'Metformin', 'Lisinopril', 'Ambien', 'Lexapro', 'Zoloft', 'Synthroid', 'Cymbalta', 'Lyrica']
    conditions = ['Depression', 'Anxiety', 'High Blood Pressure', 'Type 2 Diabetes', 'Insomnia', 'Pain', 'High Cholesterol']
    side_effects = ['No Side Effects', 'Mild Side Effects', 'Moderate Side Effects', 'Severe Side Effects', 'Extremely Severe Side Effects']
    effectiveness = ['Ineffective', 'Marginally Effective', 'Moderately Effective', 'Considerably Effective', 'Highly Effective']

    # The random draws below run top to bottom; each one advances the seeded
    # global RNG, so their order determines the generated values.
    sample_columns = {
        'urlDrugName': np.random.choice(drugs, n),
        'condition': np.random.choice(conditions, n),
        'benefitsReview': ['This medication helped me significantly.'] * n,
        'sideEffectsReview': ['Some minor side effects initially.'] * n,
        'commentsReview': ['Overall satisfied with this treatment.'] * n,
        'rating': np.random.randint(1, 11, n).astype(float),
        'sideEffects': np.random.choice(side_effects, n, p=[0.1, 0.3, 0.35, 0.2, 0.05]),
        'effectiveness': np.random.choice(effectiveness, n, p=[0.1, 0.15, 0.25, 0.3, 0.2]),
        'split': np.random.choice(['train', 'test'], n, p=[0.8, 0.2]),
    }
    df = pd.DataFrame(sample_columns)
    print(f"Created {len(df)} sample reviews")
else:
    df_train = pd.read_csv(train_path, sep='\t')
    df_test = pd.read_csv(test_path, sep='\t')
    # Tag each frame with its origin before stacking them into one table
    df_train['split'], df_test['split'] = 'train', 'test'
    df = pd.concat([df_train, df_test], ignore_index=True)
    print(f"Loaded {len(df)} reviews")

df.head()

Creating sample data...
Created 3000 sample reviews


Unnamed: 0,urlDrugName,condition,benefitsReview,sideEffectsReview,commentsReview,rating,sideEffects,effectiveness,split
0,Zoloft,Depression,This medication helped me significantly.,Some minor side effects initially.,Overall satisfied with this treatment.,8.0,Severe Side Effects,Moderately Effective,train
1,Lisinopril,Pain,This medication helped me significantly.,Some minor side effects initially.,Overall satisfied with this treatment.,9.0,Moderate Side Effects,Moderately Effective,train
2,Synthroid,Type 2 Diabetes,This medication helped me significantly.,Some minor side effects initially.,Overall satisfied with this treatment.,10.0,Moderate Side Effects,Moderately Effective,train
3,Ambien,Insomnia,This medication helped me significantly.,Some minor side effects initially.,Overall satisfied with this treatment.,9.0,Severe Side Effects,Considerably Effective,train
4,Zoloft,Pain,This medication helped me significantly.,Some minor side effects initially.,Overall satisfied with this treatment.,10.0,Mild Side Effects,Considerably Effective,train


In [5]:
# Create 3NF SQLite Database
def build_3nf_sqlite(df, db_path):
    """Build a normalized SQLite database from a flat drug-review DataFrame.

    Any existing file at ``db_path`` is deleted first. Four lookup tables
    (``drugs``, ``conditions``, ``side_effects``, ``effectiveness_levels``) and
    one ``reviews`` table that references them are created and populated.

    Args:
        df: DataFrame with the columns ``urlDrugName``, ``condition``,
            ``rating``, ``sideEffects``, ``effectiveness`` and ``split``.
            ``benefitsReview``, ``sideEffectsReview`` and ``commentsReview``
            are optional; an absent column is stored as empty strings.
        db_path: Filesystem path of the SQLite file to create.

    Returns:
        None. Progress is reported with ``print``.

    Raises:
        sqlite3.IntegrityError: If a row violates a schema constraint, for
            example a missing ``rating`` or ``split``.

    Notes:
        * Drug and condition IDs are assigned in order of first appearance.
        * A missing drug or condition name is stored as ``'Unknown'`` rather
          than failing the ``NOT NULL`` constraint on the lookup table.
        * A side-effect or effectiveness label outside the fixed five-step
          scales is stored as level 3 (the middle of the scale); the number of
          affected rows is printed so the substitution is not silent.
    """
    print("=" * 60)
    print("BUILDING 3NF SQLITE DATABASE")
    print("=" * 60)

    side_effects_order = ['No Side Effects', 'Mild Side Effects', 'Moderate Side Effects', 'Severe Side Effects', 'Extremely Severe Side Effects']
    effectiveness_order = ['Ineffective', 'Marginally Effective', 'Moderately Effective', 'Considerably Effective', 'Highly Effective']
    missing_name = 'Unknown'
    fallback_level_id = 3  # middle entry of both five-step scales

    def _names(column):
        # Lookup-table names with missing values replaced by a placeholder.
        values = df[column].astype(object)
        return values.where(values.notna(), missing_name)

    def _level_ids(column, level_map):
        # Translate ordinal labels to IDs, reporting any that need the fallback.
        mapped = df[column].map(level_map)
        unmapped = int(mapped.isna().sum())
        if unmapped:
            print(f"Warning: {unmapped} '{column}' value(s) not recognised; "
                  f"stored as level {fallback_level_id}.")
        return mapped.fillna(fallback_level_id).astype(int).tolist()

    def _review_text(column):
        # Free-text column as a list; missing cells become NULL.
        if column not in df.columns:
            return [''] * len(df)
        return [None if pd.isna(text) else text for text in df[column]]

    if os.path.exists(db_path):
        os.remove(db_path)

    conn = sqlite3.connect(db_path)
    try:
        # SQLite only enforces FOREIGN KEY clauses when this is switched on,
        # and the setting applies to this connection only.
        conn.execute("PRAGMA foreign_keys = ON")
        cur = conn.cursor()

        # Create dimension tables and the reviews table
        cur.executescript("""
            DROP TABLE IF EXISTS reviews;
            DROP TABLE IF EXISTS drugs;
            DROP TABLE IF EXISTS conditions;
            DROP TABLE IF EXISTS side_effects;
            DROP TABLE IF EXISTS effectiveness_levels;

            CREATE TABLE drugs (
                drug_id INTEGER PRIMARY KEY,
                drug_name TEXT NOT NULL UNIQUE
            );

            CREATE TABLE conditions (
                condition_id INTEGER PRIMARY KEY,
                condition_name TEXT NOT NULL UNIQUE
            );

            CREATE TABLE side_effects (
                side_effect_id INTEGER PRIMARY KEY,
                side_effect_name TEXT NOT NULL UNIQUE,
                severity_order INTEGER NOT NULL
            );

            CREATE TABLE effectiveness_levels (
                effectiveness_id INTEGER PRIMARY KEY,
                effectiveness_name TEXT NOT NULL UNIQUE,
                effectiveness_order INTEGER NOT NULL
            );

            CREATE TABLE reviews (
                review_id INTEGER PRIMARY KEY,
                drug_id INTEGER NOT NULL,
                condition_id INTEGER NOT NULL,
                benefits_review TEXT,
                side_effects_review TEXT,
                comments_review TEXT,
                rating REAL NOT NULL,
                side_effect_id INTEGER NOT NULL,
                effectiveness_id INTEGER NOT NULL,
                split TEXT NOT NULL,
                FOREIGN KEY (drug_id) REFERENCES drugs(drug_id),
                FOREIGN KEY (condition_id) REFERENCES conditions(condition_id),
                FOREIGN KEY (side_effect_id) REFERENCES side_effects(side_effect_id),
                FOREIGN KEY (effectiveness_id) REFERENCES effectiveness_levels(effectiveness_id)
            );
        """)
        print("Tables created.")

        # Name -> ID lookups; enumerate from 1 so IDs match the table keys
        drug_names = _names('urlDrugName')
        condition_names = _names('condition')
        drug_map = {name: i for i, name in enumerate(drug_names.unique(), 1)}
        cond_map = {name: i for i, name in enumerate(condition_names.unique(), 1)}
        se_map = {name: i for i, name in enumerate(side_effects_order, 1)}
        eff_map = {name: i for i, name in enumerate(effectiveness_order, 1)}

        # Insert dimension data
        cur.executemany(
            "INSERT INTO drugs (drug_id, drug_name) VALUES (?, ?)",
            [(i, name) for name, i in drug_map.items()],
        )
        cur.executemany(
            "INSERT INTO conditions (condition_id, condition_name) VALUES (?, ?)",
            [(i, name) for name, i in cond_map.items()],
        )
        # For the ordinal scales the ID doubles as the sort order
        cur.executemany(
            "INSERT INTO side_effects (side_effect_id, side_effect_name, severity_order) VALUES (?, ?, ?)",
            [(i, name, i) for name, i in se_map.items()],
        )
        cur.executemany(
            "INSERT INTO effectiveness_levels (effectiveness_id, effectiveness_name, effectiveness_order) VALUES (?, ?, ?)",
            [(i, name, i) for name, i in eff_map.items()],
        )

        # Insert reviews in one batch; .tolist() yields plain Python values,
        # which is what the sqlite3 driver can bind.
        review_rows = list(zip(
            drug_names.map(drug_map).astype(int).tolist(),
            condition_names.map(cond_map).astype(int).tolist(),
            _review_text('benefitsReview'),
            _review_text('sideEffectsReview'),
            _review_text('commentsReview'),
            df['rating'].tolist(),
            _level_ids('sideEffects', se_map),
            _level_ids('effectiveness', eff_map),
            df['split'].tolist(),
        ))
        cur.executemany("""
            INSERT INTO reviews (drug_id, condition_id, benefits_review, side_effects_review,
                                 comments_review, rating, side_effect_id, effectiveness_id, split)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
        """, review_rows)

        conn.commit()
    finally:
        # Release the file handle even when an insert fails
        conn.close()

    print(f"\nDatabase created at: {db_path}")
    print(f"Total reviews: {len(df)}")

# The database file lives in the data/ sub-folder of the project root
db_path = base_folder + "/data/drug_reviews.db"
build_3nf_sqlite(df=df, db_path=db_path)

BUILDING 3NF SQLITE DATABASE
Tables created.

Database created at: /content/drive/MyDrive/Colab Notebooks/drug_review_classification/data/drug_reviews.db
Total reviews: 3000


In [6]:
# Verify database: preview the lookup tables, then count reviews per effectiveness level
effectiveness_counts_sql = """
    SELECT e.effectiveness_name, COUNT(*) as count
    FROM reviews r
    JOIN effectiveness_levels e ON r.effectiveness_id = e.effectiveness_id
    GROUP BY e.effectiveness_name
    ORDER BY e.effectiveness_order
"""

# (heading to print, query to run) pairs, shown in this order
verification_queries = [
    ("Drugs table:", "SELECT * FROM drugs LIMIT 5"),
    ("\nConditions table:", "SELECT * FROM conditions LIMIT 5"),
    ("\nEffectiveness levels:", "SELECT * FROM effectiveness_levels"),
    ("\nReviews count by effectiveness:", effectiveness_counts_sql),
]

conn = sqlite3.connect(db_path)

for heading, query in verification_queries:
    print(heading)
    print(pd.read_sql(query, conn))

conn.close()

Drugs table:
   drug_id   drug_name
0        1      Zoloft
1        2  Lisinopril
2        3   Synthroid
3        4      Ambien
4        5      Lyrica

Conditions table:
   condition_id       condition_name
0             1           Depression
1             2                 Pain
2             3      Type 2 Diabetes
3             4             Insomnia
4             5  High Blood Pressure

Effectiveness levels:
   effectiveness_id      effectiveness_name  effectiveness_order
0                 1             Ineffective                    1
1                 2    Marginally Effective                    2
2                 3    Moderately Effective                    3
3                 4  Considerably Effective                    4
4                 5        Highly Effective                    5

Reviews count by effectiveness:
       effectiveness_name  count
0             Ineffective    317
1    Marginally Effective    400
2    Moderately Effective    791
3  Considerably Effective    8

In [7]:
# Function to load data from database
def get_dataframe_from_db(db_path):
    """Load every review from the SQLite database as one flat DataFrame.

    The ``reviews`` table is joined to its four lookup tables so that IDs are
    replaced by their names, and the columns are aliased back to the original
    dataset's camelCase names. Rows are ordered by ``review_id``.

    Args:
        db_path: Filesystem path of the SQLite database file.

    Returns:
        pandas.DataFrame with the columns ``review_id``, ``urlDrugName``,
        ``condition``, ``benefitsReview``, ``sideEffectsReview``,
        ``commentsReview``, ``rating``, ``sideEffects``, ``effectiveness``
        and ``split``.

    Raises:
        Whatever ``pd.read_sql`` raises when the query cannot be executed, for
        example when the expected tables are absent.
    """
    query = """
        SELECT
            r.review_id,
            d.drug_name as urlDrugName,
            c.condition_name as condition,
            r.benefits_review as benefitsReview,
            r.side_effects_review as sideEffectsReview,
            r.comments_review as commentsReview,
            r.rating,
            s.side_effect_name as sideEffects,
            e.effectiveness_name as effectiveness,
            r.split
        FROM reviews r
        JOIN drugs d ON r.drug_id = d.drug_id
        JOIN conditions c ON r.condition_id = c.condition_id
        JOIN side_effects s ON r.side_effect_id = s.side_effect_id
        JOIN effectiveness_levels e ON r.effectiveness_id = e.effectiveness_id
        ORDER BY r.review_id
    """
    conn = sqlite3.connect(db_path)
    try:
        return pd.read_sql(query, conn)
    finally:
        # Close the connection even when the query raises
        conn.close()

# Round-trip check: read the reviews back and report how many fall in each split
df_loaded = get_dataframe_from_db(db_path)
split_sizes = {name: int((df_loaded['split'] == name).sum()) for name in ('train', 'test')}
print(f"Loaded {len(df_loaded)} reviews from database")
print(f"Train: {split_sizes['train']}, Test: {split_sizes['test']}")
df_loaded.head()

Loaded 3000 reviews from database
Train: 2400, Test: 600


Unnamed: 0,review_id,urlDrugName,condition,benefitsReview,sideEffectsReview,commentsReview,rating,sideEffects,effectiveness,split
0,1,Zoloft,Depression,This medication helped me significantly.,Some minor side effects initially.,Overall satisfied with this treatment.,8.0,Severe Side Effects,Moderately Effective,train
1,2,Lisinopril,Pain,This medication helped me significantly.,Some minor side effects initially.,Overall satisfied with this treatment.,9.0,Moderate Side Effects,Moderately Effective,train
2,3,Synthroid,Type 2 Diabetes,This medication helped me significantly.,Some minor side effects initially.,Overall satisfied with this treatment.,10.0,Moderate Side Effects,Moderately Effective,train
3,4,Ambien,Insomnia,This medication helped me significantly.,Some minor side effects initially.,Overall satisfied with this treatment.,9.0,Severe Side Effects,Considerably Effective,train
4,5,Zoloft,Pain,This medication helped me significantly.,Some minor side effects initially.,Overall satisfied with this treatment.,10.0,Mild Side Effects,Considerably Effective,train
