In [1]:
import pandas as pd

# Employee master data, stored column-wise: position i of every list
# describes the same employee.
employee_data = dict(
    id=[1, 2, 3, 4, 5],
    name=["Alice", "Bob", "Charlie", "David", "Eva"],
    department=["HR", "Finance", "IT", "Finance", "HR"],
    salary=[5000, 7000, 6000, 8000, 5500],
    hire_date=["2018-03-01", "2017-07-12", "2019-11-23", "2016-05-04", "2020-01-10"],
    country=["US", "UK", "IN", "US", "IN"],
)
employee_df = pd.DataFrame(employee_data)

# Expense transactions; emp_id holds the same values as the employee ids above.
txn_data = dict(
    txn_id=[101, 102, 103, 104, 105],
    emp_id=[1, 2, 3, 4, 5],
    amount=[200, 500, 300, 1000, 150],
    txn_type=["Travel", "Reimburse", "Travel", "Training", "Supplies"],
    txn_date=["2021-06-15", "2021-07-01", "2021-07-19", "2021-08-22", "2021-09-05"],
)
txn_df = pd.DataFrame(txn_data)

# Write each frame to its CSV file in the working directory, leaving the
# pandas row index out of the file.
for frame, csv_name in (
    (employee_df, "employee.csv"),
    (txn_df, "transactions.csv"),
):
    frame.to_csv(csv_name, index=False)

print("✅ Files created: employee.csv, transactions.csv")


✅ Files created: employee.csv, transactions.csv


Step 1: Install and Initialize PySpark in Colab

In [2]:
!pip install pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("UnityCatalogMiniProject") \
    .getOrCreate()




Step 2: Simulate a Catalog & Schema

In [3]:
# Stand-in for a Unity Catalog schema: make sure the finance_db database
# exists, then make it the current database for the statements that follow.
finance_db_ddl = "CREATE DATABASE IF NOT EXISTS finance_db"
spark.sql(finance_db_ddl)
spark.sql("USE finance_db")


DataFrame[]

Step 3: Create Sample Tables

In [5]:
# Three sample employees as (id, name, salary) tuples.
data = [
    (1, "Alice", 5000),
    (2, "Bob", 7000),
    (3, "Charlie", 6000),
]
df = spark.createDataFrame(data, schema=["id", "name", "salary"])

# Save as a table, replacing any existing finance_db.employee so the cell can
# be re-run safely.
(
    df.write
    .mode("overwrite")
    .saveAsTable("finance_db.employee")
)

# Read the table back through SQL to confirm what was stored.
employee_rows = spark.sql("SELECT * FROM finance_db.employee")
employee_rows.show()


+---+-------+------+
| id|   name|salary|
+---+-------+------+
|  2|    Bob|  7000|
|  3|Charlie|  6000|
|  1|  Alice|  5000|
+---+-------+------+



Step 4: Data Discovery

In [6]:
# Discovery: list the databases known to the session, then the tables
# registered inside finance_db.
for discovery_query in ("SHOW DATABASES", "SHOW TABLES IN finance_db"):
    spark.sql(discovery_query).show()


+----------+
| namespace|
+----------+
|   default|
|finance_db|
+----------+

+----------+---------+-----------+
| namespace|tableName|isTemporary|
+----------+---------+-----------+
|finance_db| employee|      false|
+----------+---------+-----------+



Step 5: Access Control Simulation

In [7]:
# Simulated column-level access control: the "HR" role gets only id and name,
# every other role gets all columns.
user_role = "HR"

role_query = (
    "SELECT id, name FROM finance_db.employee"
    if user_role == "HR"
    else "SELECT * FROM finance_db.employee"
)
spark.sql(role_query).show()


+---+-------+
| id|   name|
+---+-------+
|  2|    Bob|
|  3|Charlie|
|  1|  Alice|
+---+-------+



Step 6: Data Lineage Simulation

In [8]:
# Lineage example: employee_bonus is a temporary view derived from
# finance_db.employee, adding a bonus column equal to 10% of salary.
bonus_view_sql = """
    CREATE OR REPLACE TEMP VIEW employee_bonus AS
    SELECT id, name, salary, salary * 0.1 AS bonus
    FROM finance_db.employee
"""
spark.sql(bonus_view_sql)

# Query the derived view to inspect the computed column.
bonus_rows = spark.sql("SELECT * FROM employee_bonus")
bonus_rows.show()


+---+-------+------+-----+
| id|   name|salary|bonus|
+---+-------+------+-----+
|  2|    Bob|  7000|700.0|
|  3|Charlie|  6000|600.0|
|  1|  Alice|  5000|500.0|
+---+-------+------+-----+



Step 7: Audit Logging

In [9]:
import datetime

def log_action(user, action, log_path="audit_log.txt"):
    """Append one audit record to a plain-text log file.

    Each record is a single line of the form
    ``<timestamp> | <user> | <action>``, where the timestamp is the current
    local time as returned by ``datetime.datetime.now()``.

    Args:
        user: Identifier of the actor performing the action.
        action: Free-text description of what was done.
        log_path: File to append to. Defaults to ``"audit_log.txt"`` in the
            current working directory; the file is created if it is missing.

    Raises:
        OSError: If the log file cannot be opened or written.
    """
    timestamp = datetime.datetime.now()
    # Fix the encoding explicitly so the log does not depend on the platform's
    # default text encoding (user or action may contain non-ASCII characters).
    with open(log_path, "a", encoding="utf-8") as log_file:
        log_file.write(f"{timestamp} | {user} | {action}\n")

# Record two sample audit events, in order: a read by user1, then a write by user2.
for audit_user, audit_action in (
    ("user1", "SELECT on finance_db.employee"),
    ("user2", "INSERT on finance_db.employee"),
):
    log_action(audit_user, audit_action)
