In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import Window as W


In [2]:
spark = SparkSession.builder.appName("Luxsoft").getOrCreate()

### SQL

**Q1. What would be the answer of below query **

In [5]:
## Q1. What would be the answer of below query
sql = """
select 10
union select 10
union select 20
union  select 20
union all select 10
"""
spark.sql(sql).show()

+---+
| 10|
+---+
| 10|
| 20|
| 10|
+---+



In [7]:
data = [(10,'abc'),(20,'abc'),(None,'abc')]
schema = "id int, cd string"
df = spark.createDataFrame(data = data, schema = schema)
df.show()

+----+---+
|  id| cd|
+----+---+
|  10|abc|
|  20|abc|
|NULL|abc|
+----+---+



**Q3. What would be the output of below query **


In [12]:
df.createOrReplaceTempView('T1')
sql = """
select id, cd  from T1 where ID = Null;
"""
spark.sql(sql).show()

##You cannot use = NULL or <> NULL because NULL is not equal or unequal to anything. NULL means unknown.
## So use IS NULL or IS NOT NULL:

+---+---+
| id| cd|
+---+---+
+---+---+



In [14]:
sql = """
select id, cd  from T1 where ID is null;
"""
spark.sql(sql).show()


+----+---+
|  id| cd|
+----+---+
|NULL|abc|
+----+---+



**Q4. What would be the output of below query **


In [15]:
sql = """
SELECT count(*),count(ID),count(cd),count(distinct cd),sum(ID) from T1
"""
spark.sql(sql).show()

+--------+---------+---------+------------------+-------+
|count(1)|count(ID)|count(cd)|count(DISTINCT cd)|sum(ID)|
+--------+---------+---------+------------------+-------+
|       3|        2|        3|                 1|     30|
+--------+---------+---------+------------------+-------+



**Q5. What would be the output of below query **

In [17]:
SQL = """
SELECT count(id) from T1
group by cd having count(id) = 1
"""
spark.sql(SQL).show()

+---------+
|count(id)|
+---------+
+---------+



In [27]:
dataset = [(10,'mango','2024-01-01',100),
(10,'orange','2024-01-02',120),
(11,'jeans','2024-01-03',200),
(11,'jeans','2024-01-03',250),
(11,'T-shirt','2024-01-04',200),
(12,'Banana','2024-01-04',50)]
data_schema = "id int, notes string,sales_date string,amount int"
dataframe = spark.createDataFrame(data = dataset, schema = data_schema)
dataframe.printSchema()

root
 |-- id: integer (nullable = true)
 |-- notes: string (nullable = true)
 |-- sales_date: string (nullable = true)
 |-- amount: integer (nullable = true)



In [33]:
# Convert sales_date data types
from pyspark.sql.types import DateType
dataframe2 = dataframe.withColumn("sales_date",dataframe['sales_date'].cast(DateType()))
# dataframe3 = dataframe.withColumn("sales_date",F.to_date('sales_date','yyyy-mm-dd'))
dataframe2.printSchema()
dataframe2.show()

root
 |-- id: integer (nullable = true)
 |-- notes: string (nullable = true)
 |-- sales_date: date (nullable = true)
 |-- amount: integer (nullable = true)

+---+-------+----------+------+
| id|  notes|sales_date|amount|
+---+-------+----------+------+
| 10|  mango|2024-01-01|   100|
| 10| orange|2024-01-02|   120|
| 11|  jeans|2024-01-03|   200|
| 11|  jeans|2024-01-03|   250|
| 11|T-shirt|2024-01-04|   200|
| 12| Banana|2024-01-04|    50|
+---+-------+----------+------+



**Q6--- fetch the max sale amount for each date ID wise, also get sum of sales amount id wise ---**

In [34]:
dataframe2.createOrReplaceTempView('sales')
sql = """
WITH data_query as 
(
SELECT id,notes,sales_date,amount,
DENSE_RANK() OVER (PARTITION BY id,sales_date order by amount desc) as rnk,
sum(amount) OVER (PARTITION BY id order by amount asc) as commulative_sum
from sales
)
select id,notes,sales_date,amount,commulative_sum from data_query
where rnk = 1

"""
spark.sql(sql).show()

+---+-------+----------+------+---------------+
| id|  notes|sales_date|amount|commulative_sum|
+---+-------+----------+------+---------------+
| 10|  mango|2024-01-01|   100|            100|
| 10| orange|2024-01-02|   120|            220|
| 11|  jeans|2024-01-03|   250|            650|
| 11|T-shirt|2024-01-04|   200|            400|
| 12| Banana|2024-01-04|    50|             50|
+---+-------+----------+------+---------------+



**Q:7:Produce Below output **
```
id		note
10	"mango,orange"
11	"jeans,T-shirt"
12	"Banana"
```

In [40]:
postgres_sql = """
SELECT id,STRING_AGG(DISTINCT notes,',') as note from sales group by id
"""
snow_sql = "SELECT id,listagg(DISTINCT notes,',') as note from sales group by id"
# spark.sql(postgres_sql).show()

**Q:8 => Number of records with Right join, full outer join between two table **<br>
**See reference for similar question**:<br>
https://github.com/tauovir/pyspark/blob/master/src_notebok/vpropel/spark/Spark_Joins.ipynb

### Python

In [46]:
#1: which is fast dictionary or tuple in term of data access
#2: threading vs multi-processing
#3: What would be output for l1,l2
#4: how do you get next value from l2
#
l1 = [ele for ele in range(1,5)]
l1

[1, 2, 3, 4]

In [45]:
l2 = (ele for ele in range(1,5))
l2

<generator object <genexpr> at 0x0000026922DF9CB0>

In [47]:
l2.__next__()

1

### Pyspark

In [49]:
# Every day you are getting large amount data file but it could have 5-10% new or updated data, how acces those data only