## Data Partition in PySpark

In [97]:
# findspark locates the local Spark installation and makes `pyspark`
# importable; it must run before any `pyspark` import below.
import findspark
findspark.init()
print("Done")

Done


## import 

In [98]:
import os
import glob

from pyspark import SparkContext
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import asc, desc

## Create Spark

In [5]:
# Start (or reuse) a Spark session running in local mode.
# `SparkSession.builder` is the documented entry point; instantiating the
# nested `Builder` class directly relies on an implementation detail.
spark = SparkSession.builder.master('local').getOrCreate()

## Get song_data files

In [8]:
# Collect the absolute path of every *.json file found anywhere under
# data/song_data (the root directory itself plus all nested directories).
song_files = []
# Only the directory path is needed from each walk step; the unused tuple
# members get underscore names so the builtin `dir` is not rebound at
# notebook scope and no loop variable is reused for two different things.
for root, _subdirs, _filenames in os.walk('data/song_data'):
    json_pattern = os.path.abspath(os.path.join(root, '*.json'))
    song_files.extend(glob.glob(json_pattern))

len(song_files)

70

## Read song_data to DataFrame song_df

In [9]:
# Load all collected JSON files into one DataFrame.
song_df = spark.read.json(song_files)

In [11]:
# List the column names of song_df.
song_df.columns

['artist_id',
 'artist_latitude',
 'artist_location',
 'artist_longitude',
 'artist_name',
 'duration',
 'num_songs',
 'song_id',
 'title',
 'year']

In [73]:
# Count the distinct values in the `year` column.
# NOTE(review): the alias is misspelled ('dictinct'); renaming it also changes
# the column header shown in the output.
song_df.select(F.countDistinct('year').alias('dictinct_year')).show()

+-------------+
|dictinct_year|
+-------------+
|           21|
+-------------+



In [68]:
# Display id, title and year for ten songs, ordered by year from newest
# to oldest.
(
    song_df
    .select('song_id', 'title', 'year')
    .sort('year', ascending=False)
    .show(10)
)

+------------------+--------------------+----+
|           song_id|               title|year|
+------------------+--------------------+----+
|SOINLJW12A8C13314C|       City Slickers|2008|
|SOFSOCN12A8C143F5D|      Face the Ashes|2007|
|SONYPOM12A8C13B2D7|I Think My Wife I...|2005|
|SOGVQGJ12AB017F169|           Ten Tonne|2005|
|SOBBUGU12A8C13E95D|Setting Fire to S...|2004|
|SORRZGD12A6310DBC3|      Harajuku Girls|2004|
|SOXILUQ12A58A7C72A|   Jenny Take a Ride|2004|
|SOXLBJT12A8C140925|   Caught In A Dream|2004|
|SONSKXP12A8C13A2C9|         Native Soul|2003|
|SOGDBUF12A8C140FAA|               Intro|2003|
+------------------+--------------------+----+
only showing top 10 rows



## Repartition

### DataFrames Repartition

In [114]:
# Number of partitions backing song_df as it was loaded.
song_df.rdd.getNumPartitions()

3

In [122]:
# Redistribute song_df into 10 partitions and confirm the new count.
# The original second line referenced `song_df_repartion` (typo), which
# raises NameError on a fresh kernel.
song_df_repartition = song_df.repartition(10)
song_df_repartition.rdd.getNumPartitions()

10

In [153]:
# Count the rows whose `year` is non-null.
song_df.select(F.count('year')).show()

+-----------+
|count(year)|
+-----------+
|         70|
+-----------+



In [123]:
# Request 100 range partitions keyed on song_id. The recorded output is 70,
# presumably because song_df holds only 70 rows (TODO confirm).
song_df_repartitionByRange = song_df.repartitionByRange(100, 'song_id')
song_df_repartitionByRange.rdd.getNumPartitions()

70

In [124]:
# Request 50 range partitions keyed on song_id; this rebinds the name
# assigned in the previous cell.
song_df_repartitionByRange = song_df.repartitionByRange(50, 'song_id')
song_df_repartitionByRange.rdd.getNumPartitions()

50

In [155]:
# Count the distinct values in the `year` column (the candidate range key below).
song_df.select(F.countDistinct('year')).show()

+--------------------+
|count(DISTINCT year)|
+--------------------+
|                  21|
+--------------------+



### Because `year` has only 21 distinct values, `repartitionByRange(50, ['year'])` produces just 22 partitions even though 50 were requested

In [159]:
# Range-partition on `year` alone, requesting 50 partitions; the recorded
# output reports 22.
song_df_repartitionByRange2 = song_df.repartitionByRange(50, ['year'])
song_df_repartitionByRange2.rdd.getNumPartitions()

22

In [158]:
# Range-partition on (`year`, `song_id`), requesting 50 partitions; with the
# extra key column the recorded output reports all 50.
song_df_repartitionByRange3 = song_df.repartitionByRange(50, ['year', 'song_id'])
song_df_repartitionByRange3.rdd.getNumPartitions()

50

### RDD Repartition

In [133]:
# View the DataFrame as an RDD of Row objects.
song_rdd = song_df.rdd

In [31]:
# Inspect the first Row. `first()` fetches only that element instead of
# collecting the entire RDD to the driver just to index into it.
song_rdd.first()

Row(artist_id='ARDR4AC1187FB371A1', artist_latitude=None, artist_location='', artist_longitude=None, artist_name='Montserrat Caballé;Placido Domingo;Vicente Sardinero;Judith Blegen;Sherrill Milnes;Georg Solti', duration=511.16363, num_songs=1, song_id='SOBAYLL12A8C138AF9', title='Sono andati? Fingevo di dormire', year=0)

In [138]:
# A Row field can be read by name. `first()` avoids collecting the whole RDD
# to the driver just to look at one element.
song_rdd.first()['artist_id']

'ARDR4AC1187FB371A1'

In [139]:
# A Row field can also be read by position. `first()` avoids collecting the
# whole RDD to the driver just to look at one element.
song_rdd.first()[0]

'ARDR4AC1187FB371A1'

In [140]:
# Keep only the songs released in 2004. The field is addressed by name rather
# than by the positional index 9, so the filter does not silently break if the
# column order of the inferred schema changes.
song_rdd_filter = song_rdd.filter(lambda row: row['year'] == 2004)

In [141]:
# Bring the filtered rows back to the driver for display.
song_rdd_filter.collect()

[Row(artist_id='ARMAC4T1187FB3FA4C', artist_latitude=40.82624, artist_location='Morris Plains, NJ', artist_longitude=-74.47995, artist_name='The Dillinger Escape Plan', duration=207.77751, num_songs=1, song_id='SOBBUGU12A8C13E95D', title='Setting Fire to Sleeping Giants', year=2004),
 Row(artist_id='ARP6N5A1187B99D1A3', artist_latitude=None, artist_location='Hamtramck, MI', artist_longitude=None, artist_name='Mitch Ryder', duration=207.43791, num_songs=1, song_id='SOXILUQ12A58A7C72A', title='Jenny Take a Ride', year=2004),
 Row(artist_id='ARVBRGZ1187FB4675A', artist_latitude=None, artist_location='', artist_longitude=None, artist_name='Gwen Stefani', duration=290.55955, num_songs=1, song_id='SORRZGD12A6310DBC3', title='Harajuku Girls', year=2004),
 Row(artist_id='ARYKCQI1187FB3B18F', artist_latitude=None, artist_location='', artist_longitude=None, artist_name='Tesla', duration=290.29832, num_songs=1, song_id='SOXLBJT12A8C140925', title='Caught In A Dream', year=2004)]

In [142]:
# Number of partitions of the RDD before any repartitioning.
song_rdd.getNumPartitions()

3

In [143]:
# Redistribute the RDD into 23 partitions and confirm the new count.
song_rdd_repartition1 = song_rdd.repartition(23)
song_rdd_repartition1.getNumPartitions()

23

In [152]:
# `RDD.partitionBy` operates on (key, value) pair RDDs, and its second
# argument is a partition *function*, not a column name. The original call
# passed the string 'song_id' on an RDD of plain Rows: it reported a partition
# count because the transformation is lazy, but any action on it would fail.
# Key each row by song_id and let the default hash partitioner place the keys.
# Elements of the result are (song_id, Row) pairs.
song_rdd_repartition2 = (
    song_rdd
    .keyBy(lambda row: row['song_id'])
    .partitionBy(song_rdd.count() // 5)
)
song_rdd_repartition2.getNumPartitions()

14

In [149]:
# Total number of rows in the RDD.
song_rdd.count()

70

### Unlike DataFrame `repartitionByRange`, RDD `partitionBy` always creates the requested number of partitions, regardless of how many distinct key values exist

In [161]:
# `RDD.partitionBy` operates on (key, value) pair RDDs, and its second
# argument is a partition *function*, not a column name. The original call
# passed the string 'year' on an RDD of plain Rows, which only appeared to
# work because the transformation is lazy. Key each row by year and use the
# default hash partitioner instead.
# Elements of the result are (year, Row) pairs.
song_rdd_repartition3 = (
    song_rdd
    .keyBy(lambda row: row['year'])
    .partitionBy(1000)
)
song_rdd_repartition3.getNumPartitions()

1000