In [2]:
from pyspark.sql import SparkSession
from  pyspark.sql import functions as FS
from  pyspark.sql import Window  as WN
from pyspark.sql import types as TYP

from pyspark.conf import SparkConf

In [4]:
spark = SparkSession.builder.appName("Test").config("spark.driver.memory","1G").getOrCreate()
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", 104857600)
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

In [3]:
# spark.sparkContext.getConf().getAll()


In [5]:
spark

In [6]:
employee_date = [
(1,'JOE',85000,1),
(2,'Henry',80000,2),
(3,'Sam',60000,2),
(4,'Max',90000,1),
(5,'Janet',69000,1),
(6,'Randy',85000,1),
(7,'Will',70000,1)
]
emp_schema = [ 'ID', 'NAME', 'SALARY','DEPARTMENTID']
dept_data = [
(1,'IT'),
(2,'Sales')
]
dept_schema = [ 'ID', 'NAME']

In [7]:
emp_df = spark.createDataFrame(data =employee_date,schema = emp_schema)
emp_df.show()

+---+-----+------+------------+
| ID| NAME|SALARY|DEPARTMENTID|
+---+-----+------+------------+
|  1|  JOE| 85000|           1|
|  2|Henry| 80000|           2|
|  3|  Sam| 60000|           2|
|  4|  Max| 90000|           1|
|  5|Janet| 69000|           1|
|  6|Randy| 85000|           1|
|  7| Will| 70000|           1|
+---+-----+------+------------+



In [9]:
dept_df = spark.createDataFrame(data =dept_data,schema = dept_schema)
dept_df.show()

+---+-----+
| ID| NAME|
+---+-----+
|  1|   IT|
|  2|Sales|
+---+-----+



In [16]:
join_data = emp_df.join(dept_df, emp_df.DEPARTMENTID ==dept_df.ID).\
select(emp_df.ID.alias("emp_id"),emp_df.NAME,emp_df.SALARY,dept_df.ID.alias("dept_id"),dept_df.NAME.alias("dpt"))
join_data.show()

+------+-----+------+-------+-----+
|emp_id| NAME|SALARY|dept_id|  dpt|
+------+-----+------+-------+-----+
|     1|  JOE| 85000|      1|   IT|
|     4|  Max| 90000|      1|   IT|
|     5|Janet| 69000|      1|   IT|
|     6|Randy| 85000|      1|   IT|
|     7| Will| 70000|      1|   IT|
|     2|Henry| 80000|      2|Sales|
|     3|  Sam| 60000|      2|Sales|
+------+-----+------+-------+-----+



In [26]:
rnk_data = join_data.withColumn("rnk",FS.dense_rank().over(WN.partitionBy('dept_id').orderBy(FS.desc('SALARY'))))

In [27]:
rnk_data.filter("rnk == 1").select("*").show()

+------+-----+------+-------+-----+---+
|emp_id| NAME|SALARY|dept_id|  dpt|rnk|
+------+-----+------+-------+-----+---+
|     4|  Max| 90000|      1|   IT|  1|
|     2|Henry| 80000|      2|Sales|  1|
+------+-----+------+-------+-----+---+



In [28]:
## Second Method
join_data = emp_df.join(dept_df, emp_df.DEPARTMENTID ==dept_df.ID).\
select(emp_df.ID.alias("emp_id"),emp_df.NAME,emp_df.SALARY,dept_df.ID.alias("dept_id"),dept_df.NAME.alias("dpt"))
join_data.show()

+------+-----+------+-------+-----+
|emp_id| NAME|SALARY|dept_id|  dpt|
+------+-----+------+-------+-----+
|     1|  JOE| 85000|      1|   IT|
|     4|  Max| 90000|      1|   IT|
|     5|Janet| 69000|      1|   IT|
|     6|Randy| 85000|      1|   IT|
|     7| Will| 70000|      1|   IT|
|     2|Henry| 80000|      2|Sales|
|     3|  Sam| 60000|      2|Sales|
+------+-----+------+-------+-----+



In [30]:
max_sal_data = emp_df.groupBy("DEPARTMENTID").agg(FS.max("SALARY").alias("MAX_SAL"))
max_sal_data.show()

+------------+-------+
|DEPARTMENTID|MAX_SAL|
+------------+-------+
|           1|  90000|
|           2|  80000|
+------------+-------+



In [31]:
join_data.join(FS.broadcast(max_sal_data),join_data.dept_id == max_sal_data.DEPARTMENTID,"inner").\
filter("SALARY == MAX_SAL").select("emp_id","NAME","SALARY","dpt").show()

+------+-----+------+-----+
|emp_id| NAME|SALARY|  dpt|
+------+-----+------+-----+
|     4|  Max| 90000|   IT|
|     2|Henry| 80000|Sales|
+------+-----+------+-----+



In [32]:
data_dict = [
{ "x": "y"},
{ "1": "2"},
{ "t": "y"}
]

# Output: {"x": "y", "1": "2", "t": "y"}
temp_dict = {}
for row in data_dict:
    for key,val in row.items():
        temp_dict[key] = val
print(temp_dict)  


{'x': 'y', '1': '2', 't': 'y'}


In [42]:
dataset = [
    {"id":"abc","amount":10},
    {"id":"def","amount":20}
]
df = spark.createDataFrame(dataset)
df.show()

+------+---+
|amount| id|
+------+---+
|    10|abc|
|    20|def|
+------+---+



In [53]:
df.collect()[0].asDict()

{'amount': 10, 'id': 'abc'}

In [44]:
df.toJSON().collect()

['{"amount":10,"id":"abc"}', '{"amount":20,"id":"def"}']

In [45]:
ids = df.rdd.map(lambda x: x[0]).collect()
ids

[10, 20]

In [46]:
amount = df.rdd.map(lambda x: x[0]).collect()
amount

[10, 20]

In [50]:
df.withColumn("ids",FS.lit(ids)).withColumn("amount2",FS.lit(amount)).drop("amount","id").collect()[0].asDict()

{'ids': [10, 20], 'amount2': [10, 20]}

In [63]:
dataset = [
    {"month":"Apr-2020","sales":10000},
     {"month":"May-2020","sales":12000},
     {"month":"Jun-2020","sales":11400},
]
sale_df = spark.createDataFrame(dataset)
sale_df.show()

+--------+-----+
|   month|sales|
+--------+-----+
|Apr-2020|10000|
|May-2020|12000|
|Jun-2020|11400|
+--------+-----+



In [69]:
def create_date_format(date_str):
    from datetime import datetime
    l1 = date_str.split("-")
    date_str1 = f"{l1[-1]}-{l1[0]}-01"
    return  datetime.strptime(date_str1,"%Y-%b-%d")

create_date_format_udf = FS.udf(lambda x: create_date_format(x),TYP.DateType())



In [81]:
sale_df.withColumn("date1",create_date_format_udf("month")).\
withColumn("previous_sale",FS.lag(FS.col('sales')).over((WN.partitionBy().orderBy("date1")))).\
withColumn("sales_growtn",FS.when(FS.col('previous_sale') == None,'NA').\
           otherwise((FS.col("sales") - FS.col('previous_sale'))/FS.col("previous_sale") * 100)).show()



+--------+-----+----------+-------------+------------+
|   month|sales|     date1|previous_sale|sales_growtn|
+--------+-----+----------+-------------+------------+
|Apr-2020|10000|2020-04-01|         NULL|        NULL|
|May-2020|12000|2020-05-01|        10000|        20.0|
|Jun-2020|11400|2020-06-01|        12000|        -5.0|
+--------+-----+----------+-------------+------------+



In [82]:
data = [(1, [1,2,3,4,5,6]), (2, [7,8,9,10,11,12,13])]
df_arr = spark.createDataFrame(data, ["dno", "eno"])
df_arr.show()

+---+--------------------+
|dno|                 eno|
+---+--------------------+
|  1|  [1, 2, 3, 4, 5, 6]|
|  2|[7, 8, 9, 10, 11,...|
+---+--------------------+



In [85]:
cricket_data = [("Virat Kohli", 85, 100, 75),
        ("Steve Smith", 90, 105, 80),
        ("Kane Williamson", 88, 95, 70)]
columns = ["Player", "Match1", "Match2", "Match3"]

In [86]:
cricket_df = spark.createDataFrame(data = cricket_data,schema = columns)
cricket_df.show()

+---------------+------+------+------+
|         Player|Match1|Match2|Match3|
+---------------+------+------+------+
|    Virat Kohli|    85|   100|    75|
|    Steve Smith|    90|   105|    80|
|Kane Williamson|    88|    95|    70|
+---------------+------+------+------+



In [88]:
stack_expr = "stack(3,'Match111', Match1,'Match2',Match2,'Match3',Match3) as (Match, Score)"
stack_df1 = cricket_df.select("Player",FS.expr(stack_expr))
stack_df1.show()

+---------------+--------+-----+
|         Player|   Match|Score|
+---------------+--------+-----+
|    Virat Kohli|Match111|   85|
|    Virat Kohli|  Match2|  100|
|    Virat Kohli|  Match3|   75|
|    Steve Smith|Match111|   90|
|    Steve Smith|  Match2|  105|
|    Steve Smith|  Match3|   80|
|Kane Williamson|Match111|   88|
|Kane Williamson|  Match2|   95|
|Kane Williamson|  Match3|   70|
+---------------+--------+-----+

