In [6]:
import os
import sys
# Get the directory of the current script
# Add the parent directory to sys.path
parent_dir = '..'
sys.path.append(os.path.abspath(parent_dir))

In [7]:
from pyspark.sql import SparkSession as Spark
from transformslib.tables.metaframe import MetaFrame

In [8]:
spark = Spark.builder.appName("demo").getOrCreate() #assume databricks made this for us

In [21]:
df = MetaFrame.load("test_tables/jobs/prod/job_1/date_table.csv", format="csv", table_name="my_table", frame_type="pyspark", spark=spark)

In [13]:
print(df.meta.events)

[{
  "log_info": {
    "filepath": "test_tables/jobs/prod/job_1/date_table.csv",
    "table_name": "my_table",
    "src_format": "csv"
  },
  "uuid": "0730b1dd-4a3a-4543-962e-b8bea4c353d5",
  "timestamp": "2025-09-23T02:09:46.541840+00:00",
  "executed_user": "Daniel",
  "macro_uuid": null,
  "event_type": "load",
  "event_description": "Loaded my_table from test_tables/jobs/prod/job_1/date_table.csv",
  "meta_version": "1.0",
  "class_type": "PipelineEvent"
}]


In [15]:
df.nrow

5

In [16]:
df.nvars

3

In [17]:
df.columns

['id', 'name', 'event_date']

In [None]:
df.sort()

In [23]:
df.show()

+---+----+----------+
| id|name|event_date|
+---+----+----------+
+---+----+----------+



In [9]:
from transformslib.tables.collections import SupplyLoad

In [19]:
dfs = SupplyLoad(1, enable_schema_validation=True, use_test_path=True, spark=spark)

Using sampling input method for job_id=1 (no run_id specified)
Starting supply loading from: ../test_tables/jobs/prod/job_1/sampling_state.json
Loading supplies from new sampling_state.json format
Loading table 'positions' from ../test_tables/jobs/prod/job_1/positions.csv (format: csv)
Casting columns for table 'positions' to expected schema...
Validating schema for table 'positions'...
Expected Schema:
  age: Int64 -> Int64
  name: String -> String
  var: String -> String
  position: String -> String
  skill: String -> String
Schema check - Column 'age': expected Int64, got LongType()
Schema check - Column 'name': expected String, got StringType()
Schema check - Column 'var': expected String, got StringType()
Schema check - Column 'position': expected String, got StringType()
Schema check - Column 'skill': expected String, got StringType()
Schema validation passed for table 'positions'
Loading table 'salary' from ../test_tables/jobs/prod/job_1/salary.csv (format: csv)
Casting columns 

In [11]:
dfs.get_table_names()

['positions',
 'salary',
 'location',
 'array_like',
 'decimal_table',
 'date_table',
 'state']

In [12]:
dfs["positions"].show()

+---+-----------+---+--------+------+
|age|       name|var|position| skill|
+---+-----------+---+--------+------+
|  1|   John Doe|  b|   front|  high|
|  2| Jane Smith|  d|    back|medium|
|  3|Bob Johnson|  e|  middle|   low|
|  3|Bob Johnson|  f|   front|   low|
|  4|     Twiggy|  b|   front|   low|
+---+-----------+---+--------+------+



In [13]:
dfs["positions"].nrow

5

In [14]:
from transformslib.transforms.atomiclib import *


 Transforms Library: 18 transforms available
   Use listatomic() to see all available transforms in a table format.



In [15]:
listatomic()


 TRANSFORMS LIBRARY - Available Transform Classes
 Total Transforms: 18
Transform Name     | Description
-------------------+--------------------------------------------------------------------------------------------------
ComplexFilter      | Transform class for filtering rows in a DataFrame using a backend-specific condition.
ConcatColumns      | Transform class for concatenating multiple columns into a single column.
DistinctTable      | Transform class for removing duplicate rows from a DataFrame.
DropNAValues       | Transform class for dropping rows with NA/None/Null values in a specified column.
DropVariable       | Transform class for removing one or more variables/columns from a DataFrame.
ExplodeColumn      | Transform class for exploding a list-like column into multiple rows.
ForceCase          | Transform class to force string values in a specified column to upper or lower case.
JoinTable          | Transform class for joining two tables in a TableCollection.
PartitionByV

In [17]:
dfs = DropVariable("name").apply(dfs, df="positions")

ValueError: Variables not found in DataFrame columns: ['name']

In [18]:
dfs["positions"].show()

+---+---+--------+------+
|age|var|position| skill|
+---+---+--------+------+
|  1|  b|   front|  high|
|  2|  d|    back|medium|
|  3|  e|  middle|   low|
|  3|  f|   front|   low|
|  4|  b|   front|   low|
+---+---+--------+------+



In [20]:
coll = dfs.select_by_suffix("_table")

In [21]:
print(coll.get_table_names())

['decimal_table', 'date_table']
