77 changes: 77 additions & 0 deletions integration/gradient_test_custom.py
@@ -0,0 +1,77 @@
# Databricks notebook source
# MAGIC %md
# MAGIC This notebook generates a computationally expensive Spark job whose duration can be tuned.

# COMMAND ----------

# `spark` is provided by the Databricks runtime; no explicit SparkSession is needed

# Define the number of rows and columns
num_rows = 10000000
num_cols = 25

# Create a DataFrame with random values
df = spark.range(num_rows).selectExpr(["rand() as rand_col_" + str(i) for i in range(num_cols)])
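# selectExpr accepts a list of SQL expression strings, so the line above expands
# to num_cols independent rand() columns over the num_rows-row range.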

# Register the data as a global temporary view
df.createOrReplaceGlobalTempView("generated_table")
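# Global temp views live in the reserved `global_temp` database, which is why
# later cells query `global_temp.generated_table`.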

# Show the DataFrame
display(df)

# COMMAND ----------

# Display a filter
display(spark.sql("select rand_col_3 from global_temp.generated_table where rand_col_3 > 0.5"))

# COMMAND ----------

# Display another filter
display(spark.sql("select * from global_temp.generated_table where rand_col_3 != rand_col_4"))

# COMMAND ----------

# MAGIC %sql
# MAGIC -- Display a visualization
# MAGIC select * from global_temp.generated_table;

# COMMAND ----------

# Display a join
display(spark.sql("select a.rand_col_3 from global_temp.generated_table a, global_temp.generated_table b where a.rand_col_3 == b.rand_col_3"))

# COMMAND ----------

# Perform Computationally Expensive Work

import numpy as np

# Adjust these to control how long the job runs
i_iterations = 6
j_iterations = 500

# Function to perform a complex and time-consuming operation
def complex_operation(x):
    result = 0
    for i in range(i_iterations):
        for j in range(j_iterations):
            # x + 1 avoids log(0) when the input column rounds down to zero
            result += np.sqrt(np.log(x + 1) * np.exp(x) / (i + 1) ** 2)
    return float(result)
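# With the defaults above, each call performs i_iterations * j_iterations
# (6 * 500 = 3,000) inner operations, so per-row cost scales linearly in both knobs.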

# Register the UDF with an explicit return type so Spark receives doubles
from pyspark.sql.types import DoubleType

spark.udf.register("complex_operation", complex_operation, DoubleType())
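# A minimal alternative sketch invoking the same registered UDF through the
# DataFrame API instead of SQL; lazy, so nothing runs until df_alt is acted on:
from pyspark.sql import functions as F
df_alt = df.withColumn("factorial_value",
                       F.expr("complex_operation(CAST(rand_col_0 * 10 AS INT))"))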

# Apply the UDF to one of the columns just so this is a computationally expensive job
df0 = spark.sql("SELECT *, complex_operation(CAST((rand_col_0 * 10) as int)) as factorial_value FROM global_temp.generated_table")

# Force full computation; show() alone would only evaluate enough rows to print
print(df0.count())

# Show the DataFrame
display(df0)


# COMMAND ----------