In [1]:
# Locate the local Spark installation and make `pyspark` importable.
# This has to run before the `pyspark` imports in the next cell.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.streaming import *
from pyspark.sql.types import *

# Create (or reuse) a local-mode Spark session named "Pivot" and keep a
# handle to its SparkContext for context-level settings such as logging.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Pivot")
    .getOrCreate()
)
sc = spark.sparkContext

In [53]:
# Keep Spark's console logging down to errors only.
sc.setLogLevel("Error")

# Use a small, fixed number of shuffle partitions.
# (To inspect the value, put spark.conf.get("spark.sql.shuffle.partitions")
# as the last expression of a cell; a read in the middle of a cell is discarded.)
spark.conf.set("spark.sql.shuffle.partitions",3)

# Adaptive query execution, through adaptive.coalescePartitions.enabled, would
# make the number of shuffle partitions dynamic. Both settings are switched off
# here so the partition count configured above is used as-is.
spark.conf.set("spark.sql.adaptive.enabled","false")
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled","false")

"""
Pivot (transpose) in a Spark DataFrame
To perform a pivot operation, we need at least one numerical column to aggregate.
Two ways to get a per-group total alongside the pivoted columns:
1. Apply group by, apply the pivot function, then add up all the pivoted columns.
2. Apply group by and compute the total, then merge the two DataFrames with a join.
"""
# Directory that holds the sample data.
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
filepath = "file:///C:/Users/venka/PycharmProjects/pythonProject/dataset/"

# Read the pipe-delimited file with a header row and let Spark infer column types.
df = (
    spark.read
    .option("delimiter", "|")
    .option("header", True)
    .option("inferSchema", True)
    .csv(filepath + "IntPivot.csv")
)
df.show()

+-------+-------+-----+
|ROLL_NO|SUBJECT|MARKS|
+-------+-------+-----+
|   1001|English|   84|
|   1001|Physics|   55|
|   1001|  Maths|   45|
|   1001|Science|   35|
|   1001|History|   32|
|   1002|English|   84|
|   1002|Physics|   62|
|   1002|  Maths|   78|
|   1002|Science|   96|
|   1002|History|   32|
+-------+-------+-----+



In [35]:
# Show the column names and types of the loaded DataFrame.
df.printSchema()

root
 |-- ROLL_NO: integer (nullable = true)
 |-- SUBJECT: string (nullable = true)
 |-- MARKS: integer (nullable = true)



In [31]:
# Build a copy of df whose numeric columns are explicitly cast to integer:
# MarksInt and RollInt replace the original Marks and Roll_NO columns.
# cast("int") and cast(IntegerType()) are interchangeable.
# NOTE(review): an earlier remark here said inferSchema produced double, but the
# df.printSchema() output saved in this notebook shows integer for both columns,
# so these casts may be redundant. Confirm against the current data file.
int_df = (
    df.withColumn("MarksInt", col("Marks").cast("int"))
    .withColumn("RollInt", col("Roll_NO").cast("int"))
    .drop("Roll_no", "Marks")
)

In [32]:
# Check the column types after the explicit casts.
int_df.printSchema()

root
 |-- SUBJECT: string (nullable = true)
 |-- MarksInt: integer (nullable = true)
 |-- RollInt: integer (nullable = true)



In [36]:
# Display the rows of the cast DataFrame.
# NOTE(review): the saved output below pads some SUBJECT values with spaces,
# unlike the df.show() output earlier in the notebook. It may come from an
# older run; re-execute to refresh.
int_df.show()

+---------+--------+-------+
|  SUBJECT|MarksInt|RollInt|
+---------+--------+-------+
| English |      84|   1001|
|  Physics|      55|   1001|
|   Maths |      45|   1001|
|  Science|      35|   1001|
| History |      32|   1001|
| English |      84|   1002|
|  Physics|      62|   1002|
|   Maths |      78|   1002|
| Science |      96|   1002|
|  History|      32|   1002|
+---------+--------+-------+



In [57]:
# Pivot to one row per roll number and one column per subject; each cell holds
# the maximum mark found for that roll number / subject pair.
# groupBy(...).pivot(...) on its own is a GroupedData object. Applying an
# aggregation such as max() is what turns it into a DataFrame.
tbl = (
    df.groupBy("Roll_No")
    .pivot("Subject")
    .max("Marks")
)

In [58]:
# Check the type of the pivot result (DataFrame once an aggregation is applied).
type(tbl)

pyspark.sql.dataframe.DataFrame

In [59]:
# Display the pivoted table.
tbl.show()

+-------+-------+-------+-----+-------+-------+
|Roll_No|English|History|Maths|Physics|Science|
+-------+-------+-------+-----+-------+-------+
|   1002|     84|     32|   78|     62|     96|
|   1001|     84|     32|   45|     55|     35|
+-------+-------+-------+-----+-------+-------+



In [68]:
def _with_row_total(pivoted, key_col="Roll_No", total_col="Total"):
    """Return `pivoted` with an extra column holding the row-wise sum.

    Every column except `key_col` (matched case-insensitively) is added up, so
    the subject list does not have to be hard-coded. A null cell (for example
    a student with no row for some subject) is counted as 0 instead of turning
    the whole total into null.

    Parameters:
        pivoted: the pivoted DataFrame (one key column plus value columns).
        key_col: name of the grouping column to leave out of the sum.
        total_col: name of the column to add.

    Returns:
        A new DataFrame; `pivoted` itself is not modified.
    """
    value_cols = [c for c in pivoted.columns if c.lower() != key_col.lower()]
    # Built with an explicit loop: the wildcard import of pyspark.sql.functions
    # in this notebook shadows Python's built-in sum().
    total = lit(0)
    for name in value_cols:
        total = total + coalesce(pivoted[name], lit(0))
    return pivoted.withColumn(total_col, total)


tbl_tot = _with_row_total(tbl)

In [70]:
tbl_tot.show()

+-------+-------+-------+-----+-------+-------+-----+
|Roll_No|English|History|Maths|Physics|Science|Total|
+-------+-------+-------+-----+-------+-------+-----+
|   1002|     84|     32|   78|     62|     96|  352|
|   1001|     84|     32|   45|     55|     35|  251|
+-------+-------+-------+-----+-------+-------+-----+



In [None]:
# Second approach: group the original (long-format) table by roll number and
# sum the marks. The result is meant to be joined with the pivot table on the
# roll number.

df2 = (
    df.groupBy("Roll_NO")
    .agg({"Marks": "sum"})
    .withColumnRenamed("sum(Marks)", "Total")
)