In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as FS
from pyspark.sql import Window as wn


In [2]:
# Start (or reuse) a local Spark session with two worker threads for this notebook.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("Consecutive Login")
    .getOrCreate()
)

In [3]:
# Sample attendance log: one (emp_id, log_date, flag) tuple per employee per day.
# log_date is kept as day-month-year text here and parsed into a date later on;
# flag is 'Y' / 'N' (the task below treats 'Y' as "logged in").
_data = [
    (101, '02-01-2024', 'N'),
    (101, '03-01-2024', 'Y'),
    (101, '04-01-2024', 'N'),
    (101, '07-01-2024', 'Y'),
    (102, '01-01-2024', 'N'),
    (102, '02-01-2024', 'Y'),
    (102, '03-01-2024', 'Y'),
    (102, '04-01-2024', 'N'),
    (102, '05-01-2024', 'Y'),
    (102, '06-01-2024', 'Y'),
    (102, '07-01-2024', 'Y'),
    (103, '01-01-2024', 'N'),
    (103, '04-01-2024', 'N'),
    (103, '05-01-2024', 'Y'),
    (103, '06-01-2024', 'Y'),
    (103, '07-01-2024', 'N'),
]
_schema = ["emp_id", "log_date", "flag"]

# Column types are inferred by Spark from the Python values.
dataframe = spark.createDataFrame(_data, _schema)

In [4]:
# Preview the raw input rows (show() defaults spelled out: first 20 rows, truncated cells).
dataframe.show(n=20, truncate=True)

+------+----------+----+
|emp_id|  log_date|flag|
+------+----------+----+
|   101|02-01-2024|   N|
|   101|03-01-2024|   Y|
|   101|04-01-2024|   N|
|   101|07-01-2024|   Y|
|   102|01-01-2024|   N|
|   102|02-01-2024|   Y|
|   102|03-01-2024|   Y|
|   102|04-01-2024|   N|
|   102|05-01-2024|   Y|
|   102|06-01-2024|   Y|
|   102|07-01-2024|   Y|
|   103|01-01-2024|   N|
|   103|04-01-2024|   N|
|   103|05-01-2024|   Y|
|   103|06-01-2024|   Y|
|   103|07-01-2024|   N|
+------+----------+----+



##### Retrieve information about consecutive login streaks for employees who have logged in for at least two consecutive days
For each employee, provide the emp_id,
the number of consecutive days logged in,<br>
the start_date of the streak, and the end_date of the streak.
##### Output:
```
 +------+---+----------+----------+---+
|emp_id|grp|start_date|  end_date|cnt|
+------+---+----------+----------+---+
|   102|  1|2024-01-02|2024-01-03|  2|
|   102|  2|2024-01-05|2024-01-07|  3|
|   103|  4|2024-01-05|2024-01-06|  2|
+------+---+----------+----------+---+

```

**Filter out absent days (keep only rows where flag is 'Y')**

In [5]:
# Keep only the days on which the employee logged in (flag == 'Y').
logged_in = FS.col('flag') == 'Y'
filter_df = dataframe.where(logged_in)
filter_df.show()

+------+----------+----+
|emp_id|  log_date|flag|
+------+----------+----+
|   101|03-01-2024|   Y|
|   101|07-01-2024|   Y|
|   102|02-01-2024|   Y|
|   102|03-01-2024|   Y|
|   102|05-01-2024|   Y|
|   102|06-01-2024|   Y|
|   102|07-01-2024|   Y|
|   103|05-01-2024|   Y|
|   103|06-01-2024|   Y|
+------+----------+----+



**Convert log_date to date format**

In [11]:
# Parse the day-month-year strings in log_date into a real DateType column.
# Spark datetime pattern letters are case-sensitive: 'MM' is month-of-year,
# while 'mm' is minute-of-hour. With 'dd-mm-yyyy' the month field is never read,
# which only looks right for this sample because every row falls in January.
filter_df = filter_df.withColumn(
    "log_date",
    FS.to_date(FS.col('log_date'), 'dd-MM-yyyy'),
)
filter_df.printSchema()

root
 |-- emp_id: long (nullable = true)
 |-- log_date: date (nullable = true)
 |-- flag: string (nullable = true)



In [12]:
# Check that log_date is now rendered as a proper date (yyyy-MM-dd).
filter_df.show(n=20, truncate=True)

+------+----------+----+
|emp_id|  log_date|flag|
+------+----------+----+
|   101|2024-01-03|   Y|
|   101|2024-01-07|   Y|
|   102|2024-01-02|   Y|
|   102|2024-01-03|   Y|
|   102|2024-01-05|   Y|
|   102|2024-01-06|   Y|
|   102|2024-01-07|   Y|
|   103|2024-01-05|   Y|
|   103|2024-01-06|   Y|
+------+----------+----+



## Assign a row number per emp_id, ordered by log_date

In [14]:
# Number each employee's login days 1, 2, 3, ... in chronological order.
login_order = wn.partitionBy('emp_id').orderBy('log_date')
partition_df = filter_df.withColumn("rn", FS.row_number().over(login_order))

In [15]:
# Inspect the per-employee row numbers.
partition_df.show(n=20, truncate=True)

+------+----------+----+---+
|emp_id|  log_date|flag| rn|
+------+----------+----+---+
|   101|2024-01-03|   Y|  1|
|   101|2024-01-07|   Y|  2|
|   102|2024-01-02|   Y|  1|
|   102|2024-01-03|   Y|  2|
|   102|2024-01-05|   Y|  3|
|   102|2024-01-06|   Y|  4|
|   102|2024-01-07|   Y|  5|
|   103|2024-01-05|   Y|  1|
|   103|2024-01-06|   Y|  2|
+------+----------+----+---+



### Get Day from date

In [16]:
# Turn each log_date into a running day number: days elapsed since a fixed anchor date.
# dayofmonth() restarts at 1 every month, so a streak that crosses a month boundary
# (e.g. 31 Jan -> 1 Feb) would be split, and days from different months could be
# grouped together by accident once the row number is subtracted.
# Any fixed anchor works; 2023-12-31 is used so that the numbers coincide with the
# day of month for the January 2024 sample data shown in this notebook.
DAY_NUMBER_ANCHOR = '2023-12-31'
partition2_df = partition_df.withColumn(
    "day",
    FS.datediff(FS.col('log_date'), FS.to_date(FS.lit(DAY_NUMBER_ANCHOR))),
)
partition2_df.show()

+------+----------+----+---+---+
|emp_id|  log_date|flag| rn|day|
+------+----------+----+---+---+
|   101|2024-01-03|   Y|  1|  3|
|   101|2024-01-07|   Y|  2|  7|
|   102|2024-01-02|   Y|  1|  2|
|   102|2024-01-03|   Y|  2|  3|
|   102|2024-01-05|   Y|  3|  5|
|   102|2024-01-06|   Y|  4|  6|
|   102|2024-01-07|   Y|  5|  7|
|   103|2024-01-05|   Y|  1|  5|
|   103|2024-01-06|   Y|  2|  6|
+------+----------+----+---+---+



### Group consecutive days (day minus row number is constant within a streak)

In [23]:
# Rows of one streak share the same (day - rn) value: both numbers grow by one per
# consecutive day, so their difference only changes when a day is skipped.
streak_key = partition2_df['day'] - partition2_df['rn']
grp_df = partition2_df.withColumn('grp', streak_key)
grp_df.show()

+------+----------+----+---+---+---+
|emp_id|  log_date|flag| rn|day|grp|
+------+----------+----+---+---+---+
|   101|2024-01-03|   Y|  1|  3|  2|
|   101|2024-01-07|   Y|  2|  7|  5|
|   102|2024-01-02|   Y|  1|  2|  1|
|   102|2024-01-03|   Y|  2|  3|  1|
|   102|2024-01-05|   Y|  3|  5|  2|
|   102|2024-01-06|   Y|  4|  6|  2|
|   102|2024-01-07|   Y|  5|  7|  2|
|   103|2024-01-05|   Y|  1|  5|  4|
|   103|2024-01-06|   Y|  2|  6|  4|
+------+----------+----+---+---+---+



## Get the Min and max date as start and end date and filter by count

In [21]:
# Collapse every (emp_id, grp) streak to its first day, last day and length,
# then keep only streaks that span more than one day.
answer_df = (
    grp_df
    .groupBy('emp_id', 'grp')
    .agg(
        FS.min('log_date').alias("start_date"),
        FS.max('log_date').alias("end_date"),
        FS.count('*').alias("cnt"),
    )
    .where(FS.col('cnt') > 1)
)
answer_df.show()

+------+---+----------+----------+---+
|emp_id|grp|start_date|  end_date|cnt|
+------+---+----------+----------+---+
|   102|  1|2024-01-02|2024-01-03|  2|
|   102|  2|2024-01-05|2024-01-07|  3|
|   103|  4|2024-01-05|2024-01-06|  2|
+------+---+----------+----------+---+

