In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import Window as W


In [0]:
from pyspark.sql.types import DateType,StringType

In [0]:
# Reuse the active Spark session if there is one; otherwise create one for this app.
session_builder = SparkSession.builder.appName("Luxsoft")
spark = session_builder.getOrCreate()

### SQL

**Q1. What would be the answer of the query below?**

In [0]:
# Q1. What would be the answer of the query below?
# UNION drops duplicate rows, UNION ALL keeps them, and the operators are applied
# left to right: the four UNION branches collapse to {10, 20}, then UNION ALL
# appends one more 10 -> three rows (10, 20, 10; row order is not guaranteed).
sql = """
select 10
union select 10
union select 20
union  select 20
union all select 10
"""
q1_result = spark.sql(sql)
q1_result.show()

In [0]:
# Three-row sample table; the last row has a NULL id so the NULL-handling
# questions below have something to demonstrate.
schema = "id int, cd string"
data = [
    (10, 'abc'),
    (20, 'abc'),
    (None, 'abc'),
]
df = spark.createDataFrame(data, schema)
df.show()

**Q3. What would be the output of the query below?**


In [0]:
# Expose df to SQL under the name T1.
df.createOrReplaceTempView('T1')
sql = """
select id, cd  from T1 where ID = Null;
"""
# `ID = Null` evaluates to NULL (never TRUE) for every row, so no rows come back.
equals_null_result = spark.sql(sql)
equals_null_result.show()

## You cannot use `= NULL` or `<> NULL`, because NULL is neither equal nor unequal to anything: NULL means "unknown".
## Use `IS NULL` or `IS NOT NULL` instead:

In [0]:
# Keep only the rows whose id is NULL (DataFrame-API form of `id IS NULL`).
# Equivalent SQL-expression form: df.filter("id is null").show()
null_id_rows = df.filter(F.col('id').isNull())
null_id_rows.show()

In [0]:
# Correct NULL test in SQL: IS NULL matches the row whose id is missing.
sql = """
select id, cd  from T1 where ID is null;
"""
is_null_result = spark.sql(sql)
is_null_result.show()


In [0]:
# Same NULL filter, referencing the column through the DataFrame itself.
# Complementary rows: df.filter(df.id.isNotNull()).show()
df.filter(df["id"].isNull()).show()

**Q4. What would be the output of the query below?**


In [0]:
# count(*) counts every row, count(<col>) skips NULLs, and sum ignores NULLs.
# For the three T1 rows (ids 10, 20, NULL; cd always 'abc') this gives
# count(*)=3, count(ID)=2, count(cd)=3, count(distinct cd)=1, sum(ID)=30.
sql = """
SELECT count(*),count(ID),count(cd),count(distinct cd),sum(ID) from T1
"""
q4_result = spark.sql(sql)
q4_result.show()

In [0]:
# DataFrame-API equivalent of:
#   SELECT count(*), count(ID), count(cd), count(distinct cd), sum(ID) FROM T1
# count('*') counts every row, count(<col>) skips NULLs, and sum ignores NULLs.
df.agg(
    F.count('*').alias('cnt'),
    F.count('id').alias('id_cnt'),
    F.count('cd').alias('cd_cnt'),
    F.countDistinct('cd').alias('dcd_cnt'),
    F.sum('id').alias('sum_cnt'),
).show()

**Q5. What would be the output of the query below?**

In [0]:
# T1 has a single cd group ('abc') holding two non-NULL ids, so
# HAVING count(id) = 1 filters that group out and the result is empty.
SQL = """
SELECT count(id) from T1
group by cd having count(id) = 1
"""
q5_result = spark.sql(SQL)
q5_result.show()

In [0]:
# Same check with the DataFrame API: count non-NULL ids per cd, then keep
# only the groups whose count is exactly one.
id_counts_by_cd = df.groupBy("cd").agg(F.count('id').alias("cnt_id"))
id_counts_by_cd.filter(F.col('cnt_id') == 1).show()

In [0]:
# Sample sales rows as (id, notes, sales_date, amount).
# sales_date is loaded as a plain string here and cast to a date in the next cell.
data_schema = "id int, notes string,sales_date string,amount int"
dataset = [
    (10, 'mango', '2024-01-01', 100),
    (10, 'orange', '2024-01-02', 120),
    (11, 'jeans', '2024-01-03', 200),
    (11, 'jeans', '2024-01-03', 250),
    (11, 'T-shirt', '2024-01-04', 200),
    (12, 'Banana', '2024-01-04', 50),
]
dataframe = spark.createDataFrame(dataset, data_schema)
dataframe.printSchema()

In [0]:
# Convert sales_date from string to a real date column.
# DateType is already imported in the import cells at the top of the notebook,
# so it is not re-imported here.
dataframe2 = dataframe.withColumn("sales_date", dataframe['sales_date'].cast(DateType()))
# Alternative with an explicit pattern. Note the uppercase `MM` (month);
# lowercase `mm` means minute-of-hour and would not parse the month:
# dataframe3 = dataframe.withColumn("sales_date", F.to_date('sales_date', 'yyyy-MM-dd'))
dataframe2.printSchema()
dataframe2.show()

**Q6. Fetch the max sale amount for each date, per ID; also get the sum of the sales amount per ID.**

In [0]:
# Window-function note: SUM(...) OVER a window without ORDER BY totals the whole
# partition; once ORDER BY is present it becomes a cumulative (running) sum.

# Expose the date-typed sales data to SQL under the name `sales`.
dataframe2.createOrReplaceTempView('sales')
sql = """
WITH data_query as 
(
SELECT id,notes,sales_date,amount,
DENSE_RANK() OVER (PARTITION BY id,sales_date order by amount desc) as rnk,
sum(amount) OVER (PARTITION BY id order by amount asc) as commulative_sum
from sales
)
select id,notes,sales_date,amount,commulative_sum from data_query
where rnk = 1

"""
# rnk = 1 keeps the highest-amount row(s) within each (id, sales_date) pair.
top_sale_per_id_and_date = spark.sql(sql)
top_sale_per_id_and_date.show()

**Q7. Produce the output below:**
```
id		note
10	"mango,orange"
11	"jeans,T-shirt"
12	"Banana"
```

In [0]:
# Dialect-specific answers to Q7, kept for reference as plain strings.
# PostgreSQL flavour (STRING_AGG):
postgres_sql = (
    "\n"
    "SELECT id,STRING_AGG(DISTINCT notes,',') as note from sales group by id"
    "\n"
)
# Snowflake flavour (LISTAGG):
snow_sql = (
    "SELECT id,listagg(DISTINCT notes,',') as note "
    "from sales group by id"
)
# NOTE(review): left disabled, presumably because STRING_AGG is not available
# in the Spark SQL version in use. Confirm before enabling.
# spark.sql(postgres_sql).show()

In [0]:
# Q7 with built-ins: collect_set gathers the distinct notes per id, and
# concat_ws flattens that array into one comma-separated string.
# The order of elements produced by collect_set is not guaranteed.
distinct_notes_by_id = dataframe2.groupBy('id').agg(F.collect_set('notes').alias('note'))
distinct_notes_by_id.withColumn('note', F.concat_ws(',', F.col('note'))).show()

In [0]:
# Q7 variant using array_join: the distinct-notes array stays in `notes`, and
# the comma-joined string is added next to it as `array_compact`.
# Uses .show() like the other cells; DataFrame.display() is a Databricks
# notebook extension and is not available in plain PySpark.
(
    dataframe2.groupBy("id")
    .agg(F.collect_set("notes").alias("notes"))
    .withColumn("array_compact", F.array_join("notes", ","))
).show()

In [0]:
@F.udf(StringType())
def concate_str(arr):
    """Join the elements of an array column into one comma-separated string.

    Args:
        arr: the row's array value as a Python list, or None when the
            column value is NULL.

    Returns:
        The non-NULL elements joined with ',', or None when the whole
        array is NULL.
    """
    # A Python UDF receives None for NULL values; ','.join(None) would raise.
    if arr is None:
        return None
    # str.join also rejects None items, so NULL elements are skipped.
    return ','.join(item for item in arr if item is not None)


# Q7 again, this time flattening the collected set with the UDF above.
dataframe2.groupBy('id').agg(F.collect_set('notes').alias('note')).withColumn('note',concate_str(F.col('note'))).show()

### Using the DataFrame API
#### Method-1

In [0]:
# Rank the rows of each id by amount (largest first), attach a running sum of
# amount within the id (smallest first), then keep only the top-ranked rows.
# NOTE(review): the SQL cell for Q6 ranks within (id, sales_date), whereas this
# window partitions by id only. Confirm which grouping is intended.
per_id = W.partitionBy("id")
ranked_sales = (
    dataframe2.select("id", "notes", "sales_date", "amount")
    .withColumn("DenseRank", F.dense_rank().over(per_id.orderBy(F.col("amount").desc())))
    .withColumn("Sum", F.sum('amount').over(per_id.orderBy(F.col("amount").asc())))
)
ranked_sales.filter(F.col("DenseRank") == 1).show()

#### Method-2

In [0]:
# Per-id total and maximum sale amount.
agg_data = dataframe2.groupBy("id").agg(
    F.sum("amount").alias("sum_amount"),
    F.max("amount").alias("max_amont"),
)
# Attach the per-id aggregates to every sale, then keep the rows that hold the
# id's maximum amount. Joining on the column *name* yields a single `id`
# column; an expression join (dataframe2.id == agg_data.id) would keep both
# copies and make later references to `id` ambiguous.
(
    dataframe2.join(agg_data, on="id", how="inner")
    .filter(F.col("amount") == F.col("max_amont"))
    .show()
)

### Collect List

In [0]:
# collect_list keeps duplicate notes (collect_set would drop them); the
# comma-joined form of the list is added alongside it as `_str`.
notes_by_id = dataframe2.groupBy('id').agg(F.collect_list("notes").alias("notes"))
notes_by_id.withColumn("_str", F.concat_ws(",", F.col("notes"))).show()

**Q8. Number of records returned by a right join and by a full outer join between two tables**<br>
**See reference for similar question**:<br>
https://github.com/tauovir/pyspark/blob/master/src_notebok/vpropel/spark/Spark_Joins.ipynb

### Python

In [0]:
# 1: Which is faster for data access: a dictionary or a tuple?
# 2: Threading vs. multiprocessing
# 3: What would be the output for l1 and l2?
# 4: How do you get the next value from l2?
#
# Square brackets build the whole list eagerly: [1, 2, 3, 4].
l1 = list(range(1, 5))
l1

In [0]:
# Parentheses instead of brackets create a lazy generator: nothing is computed
# yet, and displaying l2 shows a generator object rather than the values.
l2 = (value for value in range(1, 5))
l2

In [0]:
# Advance the generator by one step with the next() builtin (the idiomatic
# spelling of l2.__next__()); it raises StopIteration once l2 is exhausted.
next(l2)

### Pyspark

In [0]:
# Every day you receive a large data file, but only 5-10% of it is new or updated data. How do you access only those records?