In [0]:
# Sample billing history, grouped by customer for readability and then
# flattened into (customer_id, billing_date) tuples. Dates stay as
# ISO-formatted strings here; nothing in this cell converts them.
billing_days_by_customer = {
    "C001": ["2024-01-01", "2024-01-04", "2024-01-06"],
    "C002": ["2024-01-03", "2024-01-05"],
}
data = [
    (customer_id, billing_day)
    for customer_id, billing_days in billing_days_by_customer.items()
    for billing_day in billing_days
]

df = spark.createDataFrame(data, schema=["customer_id", "billing_date"])

df.display()

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, lag, datediff, date_add, date_sub

In [0]:
# Walk each customer's rows in billing_date order so that every row can be
# compared with the row immediately before it.
customer_window = Window.partitionBy("customer_id").orderBy("billing_date")

# lag(..., 1) yields the previous row's billing_date; it is NULL on the first
# row of each customer because nothing precedes it.
previous_billing_date = lag("billing_date", 1).over(customer_window)
prev_date_df = df.withColumn("prev_date", previous_billing_date)

prev_date_df.display()

# A difference of more than one day means at least one calendar day was
# skipped. Rows with a NULL prev_date drop out: the comparison is NULL,
# and filter() only keeps rows where the condition is true.
days_since_previous = datediff(col("billing_date"), col("prev_date"))
gap_df = prev_date_df.filter(days_since_previous > 1)

gap_df.display()

# The missing stretch starts the day after prev_date and ends the day before
# billing_date (both bounds inclusive).
result_df = (
    gap_df
    .withColumn("missing_from", date_add(col("prev_date"), 1))
    .withColumn("missing_to", date_sub(col("billing_date"), 1))
)

result_df.display()

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, sequence, to_date
# Alias the Spark aggregates. Importing them as bare `min` / `max` would
# replace Python's built-in min() and max() in the notebook namespace for
# every cell executed after this one.
from pyspark.sql.functions import max as spark_max, min as spark_min

# Initialize Spark (getOrCreate reuses an already-active session if present)
spark = SparkSession.builder.appName("MissingBillingDates").getOrCreate()

# Sample data: (customer_id, billing_date as an ISO-formatted string)
data = [
    ("C001", "2024-01-01"),
    ("C001", "2024-01-02"),
    ("C001", "2024-01-04"),
    ("C001", "2024-01-06"),
    ("C002", "2024-01-03"),
    ("C002", "2024-01-05"),
]

# Create DataFrame and parse the string column into a real date column so
# that sequence() below can enumerate calendar days.
df = (
    spark.createDataFrame(data, ["customer_id", "billing_date"])
    .withColumn("billing_date", to_date("billing_date"))
)
df.display()

# Step 1: Get the first and last billing date per customer
date_range_df = df.groupBy("customer_id").agg(
    spark_min("billing_date").alias("start_date"),
    spark_max("billing_date").alias("end_date"),
)
date_range_df.display()

# Step 2: Generate the complete date range per customer.
# sequence(start, end) builds an array containing both endpoints; explode()
# turns that array into one row per date.
full_dates_df = date_range_df.withColumn(
    "billing_date",
    explode(sequence(col("start_date"), col("end_date"))),
).select("customer_id", "billing_date")
full_dates_df.display()

# Step 3: Find missing dates. A left anti join keeps only the generated
# (customer_id, billing_date) pairs that have no matching row in df.
missing_dates_df = full_dates_df.join(
    df, on=["customer_id", "billing_date"], how="left_anti"
)

# Show results in a stable, readable order
missing_dates_df.orderBy("customer_id", "billing_date").display()

In [0]:
# Problem: given the numbers that are present, list the ones missing from
# the range 1..9.
#   input           : [(1,), (2,), (4,), (6,), (8,), (9,)]
#   expected output : [(3,), (5,), (7,)]

# Actual numbers DataFrame
data = [(1,), (2,), (4,), (6,), (8,), (9,)]
numbers_df = spark.createDataFrame(data, ["num"])

# Full range DataFrame (1 to 9). spark.range excludes its end bound, which is
# why the second argument is 10.
full_range_df = spark.range(1, 10).toDF("num")

# A left anti join keeps the rows of full_range_df whose "num" has no match in
# numbers_df. Join output order is not guaranteed, so sort explicitly to get
# the ascending order shown in the expected output.
missing_numbers = (
    full_range_df
    .join(numbers_df, on="num", how="left_anti")
    .orderBy("num")
)
missing_numbers.show()

In [0]:
# Explicit imports instead of `import *`: a wildcard import of
# pyspark.sql.functions overwrites Python built-ins such as min, max, sum,
# abs and round in the notebook namespace, silently changing what later
# cells call. Window / WindowSpec are the public names of pyspark.sql.window
# and are kept available even though this cell does not use them.
from pyspark.sql.window import Window, WindowSpec
from pyspark.sql.functions import countDistinct, explode

# Each row pairs an id with the list of projects attached to it.
data = [
    (101, ["P1", "P2"]),
    (102, ["P1"]),
    (103, ["P2", "P3"]),
]

df = spark.createDataFrame(data, ["id", "projects"])
df.display()

# Flatten the array column: one output row per (id, project) pair.
df_splitedlist = df.withColumn("projects", explode("projects"))
df_splitedlist.display()

# Count how many distinct ids reference each project.
df_result = df_splitedlist.groupBy("projects").agg(
    countDistinct("id").alias("count")
)
df_result.display()

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lag, lead
from pyspark.sql.window import Window

# Setup: obtain the Spark session
spark = SparkSession.builder.getOrCreate()

# Sample sales data: revenue per product for three consecutive months,
# flattened into (Product, Month, Revenue) tuples in product-then-month order.
sample_months = ["2024-01", "2024-02", "2024-03"]
revenue_by_product = {
    "ProductA": [100, 120, 90],
    "ProductB": [200, 210, 200],
}
data = [
    (product, month, revenue)
    for product, revenues in revenue_by_product.items()
    for month, revenue in zip(sample_months, revenues)
]

columns = ["Product", "Month", "Revenue"]
df = spark.createDataFrame(data, schema=columns)

In [0]:
# One window per product, rows ordered by month
windowSpec = Window.partitionBy("Product").orderBy("Month")

# Revenue of the neighbouring rows within the same product window:
# lag looks one row back, lead looks one row ahead.
previous_month_revenue = lag("Revenue", 1).over(windowSpec)
next_month_revenue = lead("Revenue", 1).over(windowSpec)

df_with_lag_lead = (
    df
    .withColumn("Prev_Revenue", previous_month_revenue)
    .withColumn("Next_Revenue", next_month_revenue)
)

df_with_lag_lead.display()

In [0]:
from pyspark.sql.functions import col, lit, concat

# Difference to the previous row's revenue, divided by the current Revenue
# and scaled to a percentage.
# NOTE(review): the denominator is Revenue, not Prev_Revenue — confirm this is
# the intended definition of the gap.
revenue_gap_pct = (col("Revenue") - col("Prev_Revenue")) / col("Revenue") * lit(100)

# Render the percentage as text with a trailing '%' sign.
revenue_gap_text = concat(revenue_gap_pct.cast("string"), lit('%'))

df_with_lag_lead_wri = (
    df_with_lag_lead
    .withColumn("Revenue_Gap", revenue_gap_text)
    .withColumn('Flag', lit('Flag'))  # constant marker column
)
df_with_lag_lead_wri.display()

In [0]:
from pyspark.sql.functions import col, lit, concat

# Same Revenue_Gap computation as in the previous cell; the only difference is
# that the constant marker column is named Flag1 instead of Flag.
gap_ratio = (col("Revenue") - col("Prev_Revenue")) / col("Revenue")
gap_percent_label = concat((gap_ratio * lit(100)).cast("string"), lit('%'))

with_revenue_gap = df_with_lag_lead.withColumn("Revenue_Gap", gap_percent_label)
df_with_lag_lead_wri = with_revenue_gap.withColumn('Flag1', lit('Flag'))
df_with_lag_lead_wri.display()

In [0]:
# Append the current DataFrame to the rro.sales_data table. The mergeSchema
# option is passed so that the write may extend the table schema
# (presumably a Delta table, given the version/RESTORE cells below — confirm).
# Appending is not idempotent: every run of this cell adds the rows again.
(
    df_with_lag_lead_wri.write
    .mode("append")
    .option("mergeSchema", "true")
    .saveAsTable("rro.sales_data")
)

In [0]:
%sql
-- Time travel: read rro.sales_data as of version 3 of its history.
SELECT *
FROM rro.sales_data VERSION AS OF 3;

In [0]:
%sql
-- Roll rro.sales_data back to the state recorded as version 1.
RESTORE TABLE rro.sales_data
  TO VERSION AS OF 1;

In [0]:
# NOTE(review): _sqldf is not assigned in any visible Python cell; it appears
# to be the implicit result DataFrame of the most recently executed %sql cell,
# so what this shows depends on which SQL cell ran last — confirm.
display(_sqldf)

In [0]:
%sql
-- Current contents of the table
SELECT *
FROM rro.sales_data

In [0]:
from pyspark.sql.functions import col, lit, concat

# Revenue_Gap: difference to the previous row's revenue as a percentage of the
# current Revenue, rendered as text with a trailing '%'.
pct_of_current_revenue = ((col("Revenue") - col("Prev_Revenue")) / col("Revenue")) * lit(100)
revenue_gap_label = concat(pct_of_current_revenue.cast("string"), lit('%'))

df_with_lag_lead_wri = df_with_lag_lead.withColumn("Revenue_Gap", revenue_gap_label)

# Add the three constant marker columns, in this order, each holding 'Flag'.
for flag_column in ('Flag', 'Flag1', 'Flag2'):
    df_with_lag_lead_wri = df_with_lag_lead_wri.withColumn(flag_column, lit('Flag'))

df_with_lag_lead_wri.display()

In [0]:
# Append only the columns listed here to rro.sales_data. Flag2 is not in the
# list, so it is not written even if the DataFrame carries it. Unlike the
# earlier write cell, no mergeSchema option is set on this write.
columns_to_write = [
    "Product",
    "Month",
    "Revenue",
    "Prev_Revenue",
    "Next_Revenue",
    "Revenue_Gap",
    "Flag",
    "Flag1",
]
(
    df_with_lag_lead_wri
    .select(*columns_to_write)
    .write
    .mode("append")
    .saveAsTable("rro.sales_data")
)