In [42]:
# we are using a new database to keep things separate from the other notebooks
# Connect to the SQLite database
import sqlite3
import pandas as pd

conn = sqlite3.connect("../../database/distTrans.sqlite3")

# Create a cursor object
cursor = conn.cursor()

In [20]:

# Create tables to demonstrate distribution transparency
def create_distributed_tables():
    # Drop existing tables if they exist
    cursor.executescript('''
        DROP TABLE IF EXISTS EMPLOYEE;
        DROP TABLE IF EXISTS EMP1_NY;
        DROP TABLE IF EXISTS EMP2_ATL;
        DROP TABLE IF EXISTS EMP3_MIA;
        DROP TABLE IF EXISTS DEPT1_SMALL;
        DROP TABLE IF EXISTS DEPT2_LARGE;
    ''')

    
    
    # Create Employee tables for different locations
    cursor.executescript('''
                         

        -- Global Employee Table
        CREATE TABLE EMPLOYEE (
            EMP_NAME TEXT NOT NULL,
            EMP_DOB DATE NOT NULL,
            EMP_ADDRESS TEXT NOT NULL,
            EMP_DEPARTMENT TEXT NOT NULL,
            EMP_SALARY DECIMAL(10,2) NOT NULL,
            EMP_LOCATION TEXT NOT NULL
        );
                         
                                          
        -- New York Office Employees
        CREATE TABLE EMP1_NY (
            EMP_NAME TEXT NOT NULL,
            EMP_DOB DATE NOT NULL,
            EMP_ADDRESS TEXT NOT NULL,
            EMP_DEPARTMENT TEXT NOT NULL,
            EMP_SALARY DECIMAL(10,2) NOT NULL,
            EMP_LOCATION TEXT DEFAULT 'New York'
        );

        -- Atlanta Office Employees
        CREATE TABLE EMP2_ATL (
            EMP_NAME TEXT NOT NULL,
            EMP_DOB DATE NOT NULL,
            EMP_ADDRESS TEXT NOT NULL,
            EMP_DEPARTMENT TEXT NOT NULL,
            EMP_SALARY DECIMAL(10,2) NOT NULL,
            EMP_LOCATION TEXT DEFAULT 'Atlanta'
        );

        -- Miami Office Employees
        CREATE TABLE EMP3_MIA (
            EMP_NAME TEXT NOT NULL,
            EMP_DOB DATE NOT NULL,
            EMP_ADDRESS TEXT NOT NULL,
            EMP_DEPARTMENT TEXT NOT NULL,
            EMP_SALARY DECIMAL(10,2) NOT NULL,
            EMP_LOCATION TEXT DEFAULT 'Miami'
        );

        -- Department tables based on budget
        CREATE TABLE DEPT1_SMALL (
            DEPT_CODE TEXT PRIMARY KEY,
            DIVISION INTEGER NOT NULL,
            BUDGET DECIMAL(10,2) CHECK(BUDGET < 1000000)
        );

        CREATE TABLE DEPT2_LARGE (
            DEPT_CODE TEXT PRIMARY KEY,
            DIVISION INTEGER NOT NULL,
            BUDGET DECIMAL(10,2) CHECK(BUDGET >= 1000000)
        );
    ''')

In [21]:
create_distributed_tables()
print("Tables created successfully")

Tables created successfully


In [43]:

# Data for all employees
all_employees = [
    # New York employees
    ('Alice', '1958-05-15', '123 Main St', 'D1', 70000, 'New York'),
    ('Bob', '1970-11-20', '456 Oak Ave', 'D2', 60000, 'New York'),
    ('Carol', '1955-02-10', '789 Pine Ln', 'D3', 80000, 'New York'),
    ('Reja', '1965-03-22', '890 Tech Blvd', 'D1', 150000, 'New York'),
    ('Rahul Sharma', '1975-07-14', '234 Innovation Dr', 'D3', 95000, 'New York'),
    ('Priya Patel', '1980-12-05', '567 Startup Ln', 'D2', 85000, 'New York'),
    ('Rohan Gupta', '1985-09-18', '890 Silicon Rd', 'D1', 120000, 'New York'),
    ('Sara Johnson', '1990-06-30', '123 Tech Park Dr', 'D2', 110000, 'New York'),
    ('Tom Smith', '1995-03-15', '456 Innovation Ave', 'D3', 130000, 'New York'),
    
    # Atlanta employees
    ('David', '1962-08-03', '101 Elm Rd', 'D1', 75000, 'Atlanta'),
    ('Eve', '1959-12-28', '202 Maple Dr', 'D1', 85000, 'Atlanta'),
    ('Frank', '1975-04-12', '303 Cedar Ct', 'D3', 65000, 'Atlanta'),
    ('Sanjay Reddy', '1972-09-17', '404 Peachtree St', 'D3', 72000, 'Atlanta'),
    ('Deepa Krishnamurthy', '1978-05-30', '505 Georgia Ave', 'D3', 80000, 'Atlanta'),
    ('Vikram Iyer', '1973-11-08', '606 Tech Park Rd', 'D2', 90000, 'Atlanta'),
    ('Neha Patel', '1981-07-21', '707 Peachtree Ave', 'D1', 95000, 'Atlanta'),
    ('Rajesh Kumar', '1985-03-14', '808 Technology Dr', 'D2', 100000, 'Atlanta'),
    ('Sneha Rao', '1988-10-27', '909 Innovation Blvd', 'D3', 110000, 'Atlanta'),
    
    # Miami employees
    ('Grace', '1968-09-25', '404 Birch St', 'D1', 62000, 'Miami'),
    ('Henry', '1957-06-18', '505 Willow Pl', 'D1', 82000, 'Miami'),
    ('Ivy', '1980-01-05', '606 Spruce Wy', 'D1', 78000, 'Miami'),
    ('Ananya Gupta', '1982-02-15', '707 Ocean Dr', 'D1', 68000, 'Miami'),
    ('Rohan Mehta', '1976-08-22', '808 Sunshine Blvd', 'D1', 85000, 'Miami'),
    ('Neha Desai', '1979-04-10', '909 Beach Rd', 'D2', 75000, 'Miami'),
    ('Priya Chopra', '1983-11-03', '1010 Coral Ct', 'D3', 90000, 'Miami'),
    ('Rohan Patel', '1985-07-19', '1111 Palm Dr', 'D3', 95000, 'Miami'),
    ('Neha Shah', '1987-03-06', '1212 Coconut Wy', 'D3', 100000, 'Miami'),
    ('Priya Desai', '1989-09-12', '1313 Pine St', 'D3', 110000, 'Miami')
]

# Insert into global EMPLOYEE table
try:
    cursor.executemany('''
        INSERT INTO EMPLOYEE (EMP_NAME, EMP_DOB, EMP_ADDRESS, EMP_DEPARTMENT, EMP_SALARY, EMP_LOCATION)
        VALUES (?, ?, ?, ?, ?, ?)
    ''', all_employees)
    conn.commit()
    print("Global EMPLOYEE table populated successfully.")
except sqlite3.IntegrityError as e:
    print(f"IntegrityError in EMPLOYEE table: {e}")


Global EMPLOYEE table populated successfully.


In [41]:
# If want to delete the records then only run this function
def delete_records_sqlite():
    try:
        
        # Delete all records from Employees table
        cursor.execute('DELETE FROM Employee')
        
        # Commit the changes
        conn.commit()
        print(f"Deleted {cursor.rowcount} records from Employees table.")
    
    except sqlite3.Error as e:
        print(f"SQLite Error: {e}")
    
    

if __name__ == '__main__':
    # SQLite example
    delete_records_sqlite()           

SQLite Error: Cannot operate on a closed database.


In [44]:
# Insert into fragment tables based on location
try:
    # New York employees
    cursor.execute('''
        INSERT INTO EMP1_NY 
        SELECT * FROM EMPLOYEE WHERE EMP_LOCATION = 'New York'
    ''')
    
    # Atlanta employees
    cursor.execute('''
        INSERT INTO EMP2_ATL 
        SELECT * FROM EMPLOYEE WHERE EMP_LOCATION = 'Atlanta'
    ''')
    
    # Miami employees
    cursor.execute('''
        INSERT INTO EMP3_MIA 
        SELECT * FROM EMPLOYEE WHERE EMP_LOCATION = 'Miami'
    ''')
    
    conn.commit()
    print("Fragment tables populated successfully.")
except sqlite3.IntegrityError as e:
    print(f"IntegrityError in fragment tables: {e}")


Fragment tables populated successfully.


In [45]:
print("\nNew York Fragment (EMP1_NY):")
print(pd.read_sql("SELECT * FROM EMP1_NY", conn))

print("\nAtlanta Fragment (EMP2_ATL):")
print(pd.read_sql("SELECT * FROM EMP2_ATL", conn))

print("\nMiami Fragment (EMP3_MIA):")
print(pd.read_sql("SELECT * FROM EMP3_MIA", conn))


New York Fragment (EMP1_NY):
        EMP_NAME     EMP_DOB         EMP_ADDRESS      EMP_DEPARTMENT  \
0          Alice  1958-05-15         123 Main St               Sales   
1            Bob  1970-11-20         456 Oak Ave                  HR   
2          Carol  1955-02-10         789 Pine Ln             Finance   
3           Reja  1965-03-22       890 Tech Blvd                 CTO   
4   Rahul Sharma  1975-07-14   234 Innovation Dr         Engineering   
5    Priya Patel  1980-12-05      567 Startup Ln  Product Management   
6          Alice  1958-05-15         123 Main St                  D1   
7            Bob  1970-11-20         456 Oak Ave                  D2   
8          Carol  1955-02-10         789 Pine Ln                  D3   
9           Reja  1965-03-22       890 Tech Blvd                  D1   
10  Rahul Sharma  1975-07-14   234 Innovation Dr                  D3   
11   Priya Patel  1980-12-05      567 Startup Ln                  D2   
12   Rohan Gupta  1985-09-18      

In [46]:
# Data for small departments
small_departments = [
    ('D1', 1, 500000),
    ('D2', 4, 600000),
    ('D3', 5, 700000)
]

try:
    cursor.executemany('''
        INSERT INTO DEPT1_SMALL (DEPT_CODE, DIVISION, BUDGET)
        VALUES (?, ?, ?)
    ''', small_departments)
    conn.commit()
    print("\nSmall departments inserted successfully.")
except sqlite3.IntegrityError as e:
    print(f"IntegrityError: {e}")

# Fetch and display small departments
small_dept_result = pd.read_sql("SELECT * FROM DEPT1_SMALL", conn)
print("\nSmall Departments:")
print(small_dept_result)

IntegrityError: UNIQUE constraint failed: DEPT1_SMALL.DEPT_CODE

Small Departments:
  DEPT_CODE  DIVISION  BUDGET
0        D1         1  500000
1        D2         4  600000
2        D3         5  700000


In [27]:
# Data for large departments
large_departments = [
    ('D4', 2, 1500000),
    ('D5', 4, 2000000),
    ('D6', 5, 1800000)
]

try:
    cursor.executemany('''
        INSERT INTO DEPT2_LARGE (DEPT_CODE, DIVISION, BUDGET)
        VALUES (?, ?, ?)
    ''', large_departments)
    conn.commit()
    print("\nLarge departments inserted successfully.")
except sqlite3.IntegrityError as e:
    print(f"IntegrityError: {e}")

# Fetch and display large departments
large_dept_result = pd.read_sql("SELECT * FROM DEPT2_LARGE", conn)
print("\nLarge Departments:")
print(large_dept_result)


Large departments inserted successfully.

Large Departments:
  DEPT_CODE  DIVISION   BUDGET
0        D4         2  1500000
1        D5         4  2000000
2        D6         5  1800000



## Example queries to demonstrate different transparency levels



> **Query 1:** Get a  list of employees with salary greater than 50000

In [31]:
# Level 1: Centralized Query (Using global EMPLOYEE table)
def level1_high_salary():
    query = '''
    SELECT EMP_NAME, EMP_DEPARTMENT, EMP_SALARY, EMP_LOCATION
    FROM EMPLOYEE
    WHERE EMP_SALARY > 50000
    ORDER BY EMP_SALARY DESC;
    '''
    return pd.read_sql(query, conn)
print("Level 1 - Centralized Query Result:")
print(level1_high_salary())

Level 1 - Centralized Query Result:
               EMP_NAME      EMP_DEPARTMENT  EMP_SALARY EMP_LOCATION
0                  Reja                 CTO      150000     New York
1          Rahul Sharma         Engineering       95000     New York
2           Vikram Iyer                  IT       90000      Atlanta
3           Priya Patel  Product Management       85000     New York
4                   Eve                  IT       85000      Atlanta
5           Rohan Mehta             Finance       85000        Miami
6                 Henry             Finance       82000        Miami
7                 Carol             Finance       80000     New York
8   Deepa Krishnamurthy           Marketing       80000      Atlanta
9                   Ivy                  IT       78000        Miami
10                David           Marketing       75000      Atlanta
11           Neha Desai           Marketing       75000        Miami
12         Sanjay Reddy               Sales       72000      Atlant

In [32]:
# Level 2: Basic Distribution Transparency (Using UNION of fragments)
def level2_high_salary():
    query = '''
    SELECT EMP_NAME, EMP_DEPARTMENT, EMP_SALARY, EMP_LOCATION
    FROM (
        SELECT * FROM EMP1_NY
        UNION ALL
        SELECT * FROM EMP2_ATL
        UNION ALL
        SELECT * FROM EMP3_MIA
    )
    WHERE EMP_SALARY > 50000
    ORDER BY EMP_SALARY DESC;
    '''
    return pd.read_sql(query, conn)

print("\nLevel 2 - Basic Distribution Transparency Result:")
print(level2_high_salary())


Level 2 - Basic Distribution Transparency Result:
               EMP_NAME      EMP_DEPARTMENT  EMP_SALARY EMP_LOCATION
0                  Reja                 CTO      150000     New York
1          Rahul Sharma         Engineering       95000     New York
2           Vikram Iyer                  IT       90000      Atlanta
3           Priya Patel  Product Management       85000     New York
4                   Eve                  IT       85000      Atlanta
5           Rohan Mehta             Finance       85000        Miami
6                 Henry             Finance       82000        Miami
7                 Carol             Finance       80000     New York
8   Deepa Krishnamurthy           Marketing       80000      Atlanta
9                   Ivy                  IT       78000        Miami
10                David           Marketing       75000      Atlanta
11           Neha Desai           Marketing       75000        Miami
12         Sanjay Reddy               Sales       72

In [33]:
# Level 3: Location-Aware Distribution (Using explicit location-based fragments)
def level3_high_salary():
    query = '''
    SELECT EMP_NAME, EMP_DEPARTMENT, EMP_SALARY, 'New York' as EMP_LOCATION
    FROM EMP1_NY
    WHERE EMP_SALARY > 50000
    
    UNION ALL
    
    SELECT EMP_NAME, EMP_DEPARTMENT, EMP_SALARY, 'Atlanta' as EMP_LOCATION
    FROM EMP2_ATL
    WHERE EMP_SALARY > 50000
    
    UNION ALL
    
    SELECT EMP_NAME, EMP_DEPARTMENT, EMP_SALARY, 'Miami' as EMP_LOCATION
    FROM EMP3_MIA
    WHERE EMP_SALARY > 50000
    
    ORDER BY EMP_SALARY DESC;
    '''
    return pd.read_sql(query, conn)

print("\nLevel 3 - Location-Aware Distribution Result:")
print(level3_high_salary())


Level 3 - Location-Aware Distribution Result:
               EMP_NAME      EMP_DEPARTMENT  EMP_SALARY EMP_LOCATION
0                  Reja                 CTO      150000     New York
1          Rahul Sharma         Engineering       95000     New York
2           Vikram Iyer                  IT       90000      Atlanta
3           Priya Patel  Product Management       85000     New York
4                   Eve                  IT       85000      Atlanta
5           Rohan Mehta             Finance       85000        Miami
6                 Henry             Finance       82000        Miami
7                 Carol             Finance       80000     New York
8   Deepa Krishnamurthy           Marketing       80000      Atlanta
9                   Ivy                  IT       78000        Miami
10                David           Marketing       75000      Atlanta
11           Neha Desai           Marketing       75000        Miami
12         Sanjay Reddy               Sales       72000 

>> All three queries will return the same results, but they represent different approaches to handling distributed data. In a real distributed system, Level 3 would be most similar to how the query would actually be executed, with each location processing its part of the query independently.

> **Query 2** :   Show employees with salary > 50k along with their department's budget information.

In [None]:
# Level 1: Fragmented Transparency (Using global EMPLOYEE table)
def level1_emp_dept_query():
    query = '''
    SELECT 
        e.EMP_NAME,
        e.EMP_DEPARTMENT,
        e.EMP_SALARY,
        e.EMP_LOCATION,
        CASE 
            WHEN d1.BUDGET IS NOT NULL THEN d1.BUDGET
            WHEN d2.BUDGET IS NOT NULL THEN d2.BUDGET
        END as DEPT_BUDGET,
        CASE 
            WHEN d1.DIVISION IS NOT NULL THEN d1.DIVISION
            WHEN d2.DIVISION IS NOT NULL THEN d2.DIVISION
        END as DIVISION
    FROM EMPLOYEE e
    LEFT JOIN DEPT1_SMALL d1 ON e.EMP_DEPARTMENT = d1.DEPT_CODE
    LEFT JOIN DEPT2_LARGE d2 ON e.EMP_DEPARTMENT = d2.DEPT_CODE
    WHERE e.EMP_SALARY > 50000
    ORDER BY e.EMP_SALARY DESC;
    '''
    return pd.read_sql(query, conn)

print("Level 1 - Centralized Query Result:")
print(level1_emp_dept_query())

In [None]:
# Level 2: Locatioon Transparency
def level2_emp_dept_query():
    query = '''
    WITH combined_employees AS (
        SELECT * FROM EMP1_NY
        UNION ALL
        SELECT * FROM EMP2_ATL
        UNION ALL
        SELECT * FROM EMP3_MIA
    ),
    combined_departments AS (
        SELECT DEPT_CODE, DIVISION, BUDGET FROM DEPT1_SMALL
        UNION ALL
        SELECT DEPT_CODE, DIVISION, BUDGET FROM DEPT2_LARGE
    )
    SELECT 
        e.EMP_NAME,
        e.EMP_DEPARTMENT,
        e.EMP_SALARY,
        e.EMP_LOCATION,
        d.BUDGET as DEPT_BUDGET,
        d.DIVISION
    FROM combined_employees e
    LEFT JOIN combined_departments d ON e.EMP_DEPARTMENT = d.DEPT_CODE
    WHERE e.EMP_SALARY > 50000
    ORDER BY e.EMP_SALARY DESC;
    '''
    return pd.read_sql(query, conn)

print("\nLevel 2 - Location Transparency Result:")
print(level2_emp_dept_query())

>> I Hope these examples are self-explanatory and demonstrate the concept of distribution transparency. 
>> Make you own queries on the same dataset and see how the results . 