In [1]:
# Let findspark locate the Spark installation and make PySpark importable
# before any `pyspark` import is attempted in the cells that follow.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Obtain the notebook's SparkSession (reusing one if it already exists),
# requesting Kryo as the value of the `spark.serializer` setting.
spark = (
    SparkSession.builder
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .getOrCreate()
)

Joins
Merging different datasets is a very common requirement in most data-processing pipelines in the big data world. PySpark offers a convenient way to join and pivot your DataFrames, as required.

In [3]:
import pyspark.sql.functions as F
from pyspark.sql.types import *

In [4]:
# Explicit schema for the sample user rows: four nullable string columns
# followed by a nullable integer `age` column.
schema = StructType(
    [
        StructField("user_id", StringType(), nullable=True),
        StructField("country", StringType(), nullable=True),
        StructField("browser", StringType(), nullable=True),
        StructField("OS", StringType(), nullable=True),
        StructField("age", IntegerType(), nullable=True),
    ]
)

In [5]:
# Sample user rows, one tuple per user, in the column order of `schema`:
# (user_id, country, browser, OS, age).
data2 = [
    ("A203", "India", "Chrome", "WIN", 33),
    ("A201", "China", "Safari", "MacOS", 35),
    ("A205", "UK", "Mozilla", "Linux", 25),
]

In [6]:
# Build the users DataFrame from the in-memory rows using the explicit schema.
df = spark.createDataFrame(
    data=data2,
    schema=schema,
)

In [8]:
# Small lookup table pairing each country with a region; both columns are
# declared as strings via the StructType builder.
region_data = spark.createDataFrame(
    data=[
        ('UK', 'Big London'),
        ('China', 'Yunnan'),
        ('India', 'Bihar'),
    ],
    schema=StructType().add("Country", "string").add("Region", "string"),
)

In [9]:
# Print the region lookup table to check its contents before joining.
region_data.show()

+-------+----------+
|Country|    Region|
+-------+----------+
|     UK|Big London|
|  China|    Yunnan|
|  India|     Bihar|
+-------+----------+



In [10]:
 # Attach each user's region with an inner equi-join on the country column.
 # `df` names the key `country` while `region_data` names it `Country`; the
 # original `on='Country'` only resolved on both sides because Spark matches
 # column names case-insensitively by default (spark.sql.caseSensitive=false).
 # Renaming the right-hand key makes the join independent of that setting and
 # still yields a single `country` column followed by the remaining columns.
 new_df = df.join(
     region_data.withColumnRenamed("Country", "country"),
     on="country",
     how="inner",
 )

In [11]:
# Print the joined result: the user columns plus the matching `Region`.
new_df.show()

+-------+-------+-------+-----+---+----------+
|country|user_id|browser|   OS|age|    Region|
+-------+-------+-------+-----+---+----------+
|  China|   A201| Safari|MacOS| 35|    Yunnan|
|  India|   A203| Chrome|  WIN| 33|     Bihar|
|     UK|   A205|Mozilla|Linux| 25|Big London|
+-------+-------+-------+-----+---+----------+



Pivoting
We can use the pivot function in PySpark to create a pivot view of the DataFrame for specific columns.

In [12]:
# Read the airline CSV with a comma delimiter, a header row, and schema
# inference enabled.
# NOTE: this rebinds `df`, replacing the users DataFrame created earlier in
# the notebook; cells below operate on the airline data.
df = (
    spark.read
    .options(delimiter=',', inferSchema='True', header='True')
    .csv("data/Invistico_Airline.csv")
)

In [14]:
# One row per `Gender`, one column per distinct `Class` value, each cell the
# summed `Flight Distance`; nulls in the pivoted result are replaced with 0.
(
    df.groupBy('Gender')
    .pivot('Class')
    .sum('Flight Distance')
    .fillna(0)
    .show()
)

+------+--------+--------+--------+
|Gender|Business|     Eco|Eco Plus|
+------+--------+--------+--------+
|Female|67308124|47526594| 7694611|
|  Male|66823534|58843684| 9148861|
+------+--------+--------+--------+



In [15]:
# One row per `Customer Type`, one column per distinct `satisfaction` value,
# each cell the average `Age`; nulls in the pivoted result are replaced with 0.
(
    df.groupBy('Customer Type')
    .pivot('satisfaction')
    .avg('Age')
    .fillna(0)
    .show()
)

+-----------------+------------------+------------------+
|    Customer Type|      dissatisfied|         satisfied|
+-----------------+------------------+------------------+
|   Loyal Customer| 40.28649325768182|  42.1939376328628|
|disloyal Customer|31.116924778761064|27.928070175438595|
+-----------------+------------------+------------------+

