### Find employees who logged in on at least 3 consecutive days.

#### Steps to solve the problem
- 1: Remove duplicate logins per day : This is important because emp 103 logged in twice on 2024-01-13.
- 2: Within a run of consecutive dates, `login_date - row_number()` (row numbers ordered by login_date per employee) remains constant, so it can be used as a streak key.
- 3: Pick employees with a streak ≥ 3 days

### Expected Output

In [0]:
%sql
-- Expected result for the sample data: one row per qualifying employee.
select emp_id
from values
    (101),
    (103)
as expected (emp_id)

In [0]:
%sql
-- Sample login data: one row per recorded login (an employee may appear
-- more than once on the same date).
create table emp_login (
    emp_id     int,
    login_date date
);

insert into emp_login (emp_id, login_date)
values
    -- emp 101: three consecutive days
    (101, '2024-01-01'),
    (101, '2024-01-02'),
    (101, '2024-01-03'),
    -- emp 102: no login on 2024-01-02, so the longest run is two days
    (102, '2024-01-01'),
    (102, '2024-01-03'),
    (102, '2024-01-04'),
    -- emp 103: four consecutive days, with a duplicate login on 2024-01-13
    (103, '2024-01-10'),
    (103, '2024-01-11'),
    (103, '2024-01-12'),
    (103, '2024-01-13'),
    (103, '2024-01-13');

In [0]:
%sql
-- Preview a few rows. Columns are listed explicitly and the rows are ordered
-- so the sample shown is the same on every run.
select
    emp_id,
    login_date
from emp_login
order by emp_id, login_date
limit 3;

### SQL

In [0]:
%sql
-- Employees with a login streak of at least 3 consecutive calendar days.
with distinct_login as (
    -- One row per employee per day, so repeated logins on the same date
    -- do not inflate the streak length.
    select distinct
        emp_id,
        login_date
    from emp_login
),
grp as (
    -- Subtracting the per-employee row number (as days) from the full date
    -- yields the same anchor date for every row of a consecutive run.
    -- The full date is used rather than day(login_date): the day-of-month
    -- resets at month boundaries (splitting a Jan 31 -> Feb 1 streak) and can
    -- falsely match unrelated dates such as Jan 5, Feb 6, Mar 7.
    select
        emp_id,
        login_date,
        date_sub(
            login_date,
            row_number() over (partition by emp_id order by login_date)
        ) as day_grp
    from distinct_login
),
streak as (
    -- One row per (employee, streak) that lasts 3 or more days.
    select
        emp_id,
        day_grp
    from grp
    group by emp_id, day_grp
    having count(*) >= 3
)

-- distinct: an employee with several qualifying streaks is reported once.
select distinct emp_id
from streak

In [0]:
%sql
-- Expose the login table as a temp view that the PySpark cells read by name.
create or replace temporary view emp_login_tmp as
select
    emp_id,
    login_date
from emp_login;

### Pyspark

In [0]:
# Show up to three rows of the temp view to confirm it is readable from PySpark.
(
    spark.table("emp_login_tmp")
    .limit(3)
    .display()
)

In [0]:
from pyspark.sql import functions as f
from pyspark.sql import Window as w

In [0]:
# Step 1: one row per employee per day, so repeated logins on a date count once.
distinct_val = spark.table("emp_login_tmp").select("emp_id", "login_date").distinct()

# Step 2: number each employee's login dates in chronological order.
wind_function = w.partitionBy("emp_id").orderBy(f.col("login_date"))

# Subtracting the row number (as days) from the full date gives the same anchor
# date for every row of a consecutive run. The full date is used rather than the
# day-of-month, which resets at month boundaries (splitting a Jan 31 -> Feb 1
# streak) and can falsely match unrelated dates such as Jan 5, Feb 6, Mar 7.
grp_records = distinct_val.withColumn(
    "day_grp",
    f.date_sub(f.col("login_date"), f.row_number().over(wind_function)),
)
grp_records.display()

In [0]:
# Step 3: keep (employee, streak) groups spanning at least 3 days, then report
# each employee once even if they have more than one qualifying streak.
(
    grp_records.groupBy("emp_id", "day_grp")
    .agg(f.count("*").alias("grp_count"))
    .filter(f.col("grp_count") >= 3)
    .select("emp_id")
    .distinct()
    .display()
)