In [1]:
from pyspark.context import SparkContext
from pyspark.conf import SparkConf

from pyspark.sql import SparkSession, HiveContext
from pyspark.sql import Window

import pyspark.sql.functions as f

#from pyspark.sql.functions import avg, mean, stddev, udf, col, min, max, round
from pyspark.sql.types import BooleanType, BinaryType, DoubleType, FloatType, IntegerType

from copy import copy, deepcopy

import pandas as pd
import numpy as np


## Get/Set spark attributes

In [2]:
# NOTE(review): `spark` is not created in this notebook; it is presumably the
# SparkSession pre-defined by the PySpark kernel/shell -- confirm.
# Read the current settings through the public accessor instead of the private
# `_conf` attribute (the result is not displayed because it is not the last expression).
spark.sparkContext.getConf().getAll()

# Rename the application: take a copy of the live configuration, override the
# name, then restart the context so a session is built from the new settings.
conf_list = [('spark.app.name', 'Spark_uPdAtEd')]
conf = spark.sparkContext.getConf().setAll(conf_list)
spark.sparkContext.stop()
spark = SparkSession.builder.config(conf=conf).getOrCreate()
spark

In [3]:
filepath = "./telco-customer-churn.csv"

# No schema is supplied or inferred here, so every column is loaded as a string
# (see the printed schema); the charge columns are cast in a later cell.
df = spark.read.csv(filepath, header=True)

print(df.columns)
# printSchema() writes the schema to stdout itself and returns None, so wrapping
# it in print() only appended a stray "None" line after the schema.
df.printSchema()

['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']
root
 |-- customerID: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- SeniorCitizen: string (nullable = true)
 |-- Partner: string (nullable = true)
 |-- Dependents: string (nullable = true)
 |-- tenure: string (nullable = true)
 |-- PhoneService: string (nullable = true)
 |-- MultipleLines: string (nullable = true)
 |-- InternetService: string (nullable = true)
 |-- OnlineSecurity: string (nullable = true)
 |-- OnlineBackup: string (nullable = true)
 |-- DeviceProtection: string (nullable = true)
 |-- TechSupport: string (nullable = true)
 |-- StreamingTV: string (nullable = true)
 |-- StreamingMovies: string (nullable = true)
 |-- Contract: strin

In [6]:
def change_type(type_, *cols, frame=None):
    """Cast the named columns to ``type_`` and return the resulting DataFrame.

    Parameters
    ----------
    type_ : the target type handed to ``Column.cast`` (e.g. ``FloatType()``).
    *cols : names of the columns to cast; each one is replaced in place
        (same name, new type).
    frame : optional DataFrame to operate on. When omitted, the function keeps
        its original behaviour: it reads the notebook-level ``df`` and rebinds
        it to the result. When given, the global ``df`` is left untouched.

    Returns
    -------
    The DataFrame with the requested columns cast.
    """
    global df

    use_global = frame is None
    result = df if use_global else frame

    for name in cols:
        result = result.withColumn(name, f.col(name).cast(type_))

    if use_global:
        # Rebind only after every cast succeeded, so a failure halfway through
        # does not leave the global half-converted.
        df = result

    return result


In [7]:
# The CSV reader loaded these columns as strings; cast them to float so the
# numeric aggregations further down work on numbers.
columns = ["MonthlyCharges", "TotalCharges"]
target_type = FloatType()
df = change_type(target_type, *columns)

# Queries using the Spark Syntax

In [8]:
# Sample standard deviation of MonthlyCharges over the whole table.
monthly_stddev = df.select(f.stddev("MonthlyCharges"))
monthly_stddev.show()

+---------------------------+
|stddev_samp(MonthlyCharges)|
+---------------------------+
|          30.09004712627172|
+---------------------------+



In [9]:
# Mean of MonthlyCharges and TotalCharges per gender, ordered by total_avg ascending.
by_gender = df.groupBy("gender")
avg_df = by_gender.agg(
    f.mean("MonthlyCharges").alias("monthly_avg"),
    f.mean("TotalCharges").alias("total_avg"),
).orderBy("total_avg", ascending=True)
avg_df.show()

+------+-----------------+------------------+
|gender|      monthly_avg|         total_avg|
+------+-----------------+------------------+
|Female|65.20424314321728|2283.1909871540843|
|  Male|64.32748243651142| 2283.407860119869|
+------+-----------------+------------------+



In [11]:
# Per gender: the smallest TotalCharges (rounded to one decimal) and the largest
# TotalCharges, listed in descending gender order.
rounded_min = f.round(f.min("TotalCharges"), 1).alias("minimumTotalCharges")
raw_max = f.max("TotalCharges").alias("maximum")

(
    df.groupBy("gender")
    .agg(rounded_min, raw_max)
    .sort("gender", ascending=False)
    .show()
)

+------+-------------------+-------+
|gender|minimumTotalCharges|maximum|
+------+-------------------+-------+
|  Male|               18.8| 8684.8|
|Female|               18.9|8672.45|
+------+-------------------+-------+



# Queries using the SQL Syntax

In [12]:
# Expose the DataFrame to Spark SQL under the view name "telco", then list the
# catalog's tables to confirm the registration.
df.createOrReplaceTempView("telco")
spark.catalog.listTables()

[Table(name='telco', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]

In [13]:
# Row count per gender, expressed in SQL against the "telco" view.
query = "SELECT gender, COUNT(gender) AS number_of_people FROM telco GROUP BY gender"
gender_counts = spark.sql(query)
gender_counts.show()

+------+----------------+
|gender|number_of_people|
+------+----------------+
|Female|            3488|
|  Male|            3555|
+------+----------------+



In [14]:
# SQL counterpart of the min/max aggregation done earlier with the DataFrame API.
query = "SELECT gender, ROUND(MIN(TotalCharges), 1) AS rounded, MAX(TotalCharges) AS maximum FROM telco GROUP BY gender"
charge_bounds = spark.sql(query)
charge_bounds.show()

+------+-------+-------+
|gender|rounded|maximum|
+------+-------+-------+
|Female|   18.9|8672.45|
|  Male|   18.8| 8684.8|
+------+-------+-------+



#  Fetching only the rows where column B holds the maximum value within each partition of A (`PARTITION BY A`)


In [15]:
# Small hand-written table used by the window-function examples below.
column_names = ["A", "B", "C"]
data = [
    ('a', 5, 'v'),
    ('a', 5, 'v'),
    ('a', 8, 'v'),
    ('e', 7, 'v'),
    ('b', 1, "c"),
    ('b', 3, "c"),
    ('c', 3, "c"),
]
df2 = spark.createDataFrame(data, schema=column_names)
df2.show()

+---+---+---+
|  A|  B|  C|
+---+---+---+
|  a|  5|  v|
|  a|  5|  v|
|  a|  8|  v|
|  e|  7|  v|
|  b|  1|  c|
|  b|  3|  c|
|  c|  3|  c|
+---+---+---+



In [19]:
# One window per distinct value of A.
w = Window.partitionBy("A")

# Attach each group's maximum B to every row, keep only the rows that hold that
# maximum, then drop the helper column and present the result ordered by A.
with_group_max = df2.withColumn("maxB", f.max("B").over(w))
holders_of_max = with_group_max.where(f.col("maxB") == f.col("B"))
holders_of_max.drop(f.col("maxB")).sort(f.col("A"), ascending=True).show()

+---+---+---+
|  A|  B|  C|
+---+---+---+
|  a|  8|  v|
|  b|  3|  c|
|  c|  3|  c|
|  e|  7|  v|
+---+---+---+



## SQL statement

In [20]:
# registerTempTable() is deprecated in favour of createOrReplaceTempView(),
# which is also what the "telco" view earlier in this notebook uses.
df2.createOrReplaceTempView("example")

# The inner query tags every row with the maximum B of its A-partition; the outer
# query keeps only the rows whose B equals that maximum.
query = "SELECT * FROM (SELECT *, MAX(B) OVER (PARTITION BY A) AS maxB FROM example) WHERE B=maxB"
spark.sql(query).show()

+---+---+---+----+
|  A|  B|  C|maxB|
+---+---+---+----+
|  e|  7|  v|   7|
|  c|  3|  c|   3|
|  b|  3|  c|   3|
|  a|  8|  v|   8|
+---+---+---+----+



## Spark Statement

In [22]:
# DataFrame-API version of the SQL query above: tag each row with the maximum B
# of its A-partition and keep the rows that reach it (maxB stays in the output).
w = Window.partitionBy("A")
group_max = f.max("B").over(w)
tagged = df2.withColumn("maxB", group_max)
tagged.where(f.col("B") == f.col("maxB")).show()


+---+---+---+----+
|  A|  B|  C|maxB|
+---+---+---+----+
|  e|  7|  v|   7|
|  c|  3|  c|   3|
|  b|  3|  c|   3|
|  a|  8|  v|   8|
+---+---+---+----+



## Register a custom function to use in SQL statement

In [33]:
def square(x):
    """Return ``x`` squared."""
    return x**2


def _square_as_float(x):
    """NULL-safe adapter around ``square`` that always yields a Python float.

    The UDF is declared as FloatType. A Python UDF whose return value does not
    match its declared type yields NULL, so the int that ``square`` returns for
    an integer column has to be converted to float explicitly.
    """
    return None if x is None else float(square(x))


spark.udf.register("SQUARE", _square_as_float, returnType=FloatType())


# Call the function through a SQL expression so the registered UDF is what
# actually runs (previously the plain Python `square` was applied to the Column,
# bypassing the registration entirely).
df2.withColumn('squared_B', f.expr("SQUARE(B)")).show()

+---+---+---+---------+
|  A|  B|  C|squared_B|
+---+---+---+---------+
|  a|  5|  v|     25.0|
|  a|  5|  v|     25.0|
|  a|  8|  v|     64.0|
|  e|  7|  v|     49.0|
|  b|  1|  c|      1.0|
|  b|  3|  c|      9.0|
|  c|  3|  c|      9.0|
+---+---+---+---------+



# `Crosstab`

In [23]:
# Contingency table: one row per distinct A, one column per distinct B, cells are pair counts.
a_by_b_counts = df2.crosstab('A', 'B')
a_by_b_counts.show()

+---+---+---+---+---+---+
|A_B|  1|  3|  5|  7|  8|
+---+---+---+---+---+---+
|  e|  0|  0|  0|  1|  0|
|  b|  1|  1|  0|  0|  0|
|  a|  0|  0|  2|  0|  1|
|  c|  0|  1|  0|  0|  0|
+---+---+---+---+---+---+



## `Coalesce` / `Repartition`

- Coalesce can **only** reduce the number of partitions, whereas repartition can either increase or decrease it.

In [24]:
# Tiny (age, color) table used to illustrate partitioning.
# NOTE: this rebinds `df`, which until now referred to the telco data.
data = [
    (10, "blue"),
    (13, "red"),
    (15, "blue"),
    (99, "red"),
    (67, "blue"),
]
df = spark.createDataFrame(data, schema=['age', 'color'])
df.show()

+---+-----+
|age|color|
+---+-----+
| 10| blue|
| 13|  red|
| 15| blue|
| 99|  red|
| 67| blue|
+---+-----+



In [25]:
# glom() groups the rows of each partition into a list, showing how the data is laid out.
rows_per_partition = df.rdd.glom().collect()
rows_per_partition

[[Row(age=10, color='blue')],
 [Row(age=13, color='red')],
 [Row(age=15, color='blue')],
 [Row(age=99, color='red'), Row(age=67, color='blue')]]

In [26]:
# Merge the existing partitions down to two and confirm the new count.
div_df = df.coalesce(2)
partition_count = div_df.rdd.getNumPartitions()
partition_count

2

In [27]:
# Same per-partition view as before, now for the coalesced DataFrame.
coalesced_layout = div_df.rdd.glom().collect()
coalesced_layout

[[Row(age=10, color='blue'), Row(age=13, color='red')],
 [Row(age=15, color='blue'),
  Row(age=99, color='red'),
  Row(age=67, color='blue')]]

In [28]:
# Repartitioning by a column without an explicit count uses the default number of
# shuffle partitions (200), so with only two colors almost every partition is empty.
by_color = df.repartition('color')
print(by_color.rdd.glom().collect())

[[], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [Row(age=13, color='red'), Row(age=99, color='red')], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [Row(age=10, color='blue'), Row(age=15, color='blue'), Row(age=67, color='blue')], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]


# Real world example


Suppose we have $2\times 10^9$ rows split into $13,000$ partitions, but we need to take only $2,000$ random rows. If we sample, we will still have $13,000$ partitions, so most of the partitions will be empty. Thus, we need to repartition the sample.

As a rule of thumb:
`number_of_partitions = number_of_CPUs in the cluster * k`, where `k` is 2, 3 or 4.

**Why did we use the repartition method instead of coalesce?**

A full data shuffle is an expensive operation for large data sets, but our data puddle is only 2,000 rows, so the shuffle is cheap here. In return, the repartition method produces roughly equal-sized partitions (and therefore equal-sized output files), which are more efficient for downstream consumers.