In [2]:
import findspark
# NOTE(review): presumably required so that the pyspark imports in the next
# cell resolve in this environment — confirm before removing.
findspark.init()

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
# Obtain the notebook's Spark session (reuses an existing one if present).
spark = (
    SparkSession.builder
    .appName("Processing_Data")
    .getOrCreate()
)

#### Q1) Suppose an orders table has the columns orderID, date, cost, and profit, where the profit column contains both positive and negative values. Find the length of the longest run of consecutive negative profits and the starting index of that run.

In [41]:
# Explicit column types for the orders CSV; every column is declared nullable.
schema = StructType(
    [
        StructField("orderID", IntegerType(), nullable=True),
        StructField("orderDate", DateType(), nullable=True),
        StructField("cost", IntegerType(), nullable=True),
        StructField("profit", DoubleType(), nullable=True),
    ]
)

In [42]:
# Load the orders CSV with the explicit schema, expose it to Spark SQL as
# the temp view "Table_1", and preview the rows.
df = (
    spark.read
    .schema(schema)
    .option("header", True)
    .csv("1.csv")
)
df.createOrReplaceTempView("Table_1")
df.show()


+-------+----------+----+------+
|orderID| orderDate|cost|profit|
+-------+----------+----+------+
|      1|2022-01-01| 100|  10.5|
|      2|2022-01-02| 150| -5.25|
|      3|2022-01-03| 120| -8.75|
|      4|2022-01-04| 200| -15.5|
|      5|2022-01-05| 180| 12.75|
|      6|2022-01-06|  90|  3.25|
|      7|2022-01-07| 110| -20.0|
|      8|2022-01-08| 130|  -5.5|
|      9|2022-01-09| 180| 12.75|
|     10|2022-01-10|  90|  4.25|
|     11|2022-01-11| 110| -20.0|
|     12|2022-01-12| 130|  -5.5|
|     13|2022-01-13| 110| -21.0|
|     14|2022-01-14| 130|  -5.5|
+-------+----------+----+------+



In [43]:
# Check that the loaded DataFrame carries the column types from the explicit schema.
df.printSchema()

root
 |-- orderID: integer (nullable = true)
 |-- orderDate: date (nullable = true)
 |-- cost: integer (nullable = true)
 |-- profit: double (nullable = true)



In [48]:
# Gaps-and-islands step: tag every row with a run identifier.
#   r1    - position of the row in the overall orderDate ordering
#   r2    - position of the row among rows of the same sign class
#           (negative profit vs. everything else), also by orderDate
#   group - r1 - r2; both counters advance together inside an unbroken run,
#           so the difference stays constant for the whole run.
# The same `group` value can appear in both sign classes, so it identifies a
# run only in combination with the sign of profit.
#
# The query is assigned first and displayed afterwards: DataFrame.show()
# returns None, so chaining it onto spark.sql() would leave df2 set to None.
# `group` is backtick-quoted because it is a SQL keyword.
df2 = spark.sql("""
                SELECT orderID, orderDate, cost, profit,
                ROW_NUMBER() OVER(ORDER BY orderDate) r1,
                ROW_NUMBER() OVER(PARTITION BY CASE WHEN profit < 0 then 1 else 0 end ORDER BY orderDate) r2,
                ROW_NUMBER() OVER(ORDER BY orderDate) -
                ROW_NUMBER() OVER(PARTITION BY CASE WHEN profit < 0 then 1 else 0 end ORDER BY orderDate) as `group`
                FROM Table_1
                """)
df2.show()

+-------+----------+----+------+---+---+-----+
|orderID| orderDate|cost|profit| r1| r2|group|
+-------+----------+----+------+---+---+-----+
|      1|2022-01-01| 100|  10.5|  1|  1|    0|
|      5|2022-01-05| 180| 12.75|  5|  2|    3|
|      6|2022-01-06|  90|  3.25|  6|  3|    3|
|      9|2022-01-09| 180| 12.75|  9|  4|    5|
|     10|2022-01-10|  90|  4.25| 10|  5|    5|
|      2|2022-01-02| 150| -5.25|  2|  1|    1|
|      3|2022-01-03| 120| -8.75|  3|  2|    1|
|      4|2022-01-04| 200| -15.5|  4|  3|    1|
|      7|2022-01-07| 110| -20.0|  7|  4|    3|
|      8|2022-01-08| 130|  -5.5|  8|  5|    3|
|     11|2022-01-11| 110| -20.0| 11|  6|    5|
|     12|2022-01-12| 130|  -5.5| 12|  7|    5|
|     13|2022-01-13| 110| -21.0| 13|  8|    5|
|     14|2022-01-14| 130|  -5.5| 14|  9|    5|
+-------+----------+----+------+---+---+-----+



In [52]:
# Longest run of consecutive negative profits.
# The inner query tags each row with a run identifier (overall row number
# minus row number within its sign class, both ordered by orderDate). The
# outer query keeps only negative-profit rows, counts the rows per run and
# returns the longest one.
#
# - The query is assigned first and displayed afterwards: DataFrame.show()
#   returns None, so chaining it onto spark.sql() would leave df3 set to None.
# - `group` is backtick-quoted because it is a SQL keyword.
# - Start_Index ASC breaks ties between runs of equal length, so the result
#   is deterministic (the run with the smallest Start_Index wins).
# - NOTE(review): MIN(orderID) is the start of the run only if orderID
#   increases with orderDate, as it does in the sample data — confirm for
#   real inputs.
df3 = spark.sql("""
                SELECT MIN(orderID) AS Start_Index, COUNT(*) AS Max_Negative_seq
                FROM (
                    SELECT orderID, profit,
                    ROW_NUMBER() OVER(ORDER BY orderDate) -
                    ROW_NUMBER() OVER(PARTITION BY CASE WHEN profit < 0 THEN 1 ELSE 0 END ORDER BY orderDate) AS `group`
                    FROM Table_1
                ) T1
                WHERE profit < 0
                GROUP BY `group`
                ORDER BY Max_Negative_seq DESC, Start_Index ASC
                LIMIT 1
                """)
df3.show()

+-----------+----------------+
|Start_Index|Max_Negative_seq|
+-----------+----------------+
|         11|               4|
+-----------+----------------+

