In [2]:
import findspark
findspark.init()

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.streaming import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

spark = SparkSession.builder.master("local").appName("PandastoSpark").getOrCreate()
sc = spark.sparkContext

In [8]:
# sql adaptive query execution adaptive.coalescePartitions.enabled will makr paritions dynamic
sc.setLogLevel("Error")
spark.conf.set("spark.sql.shuffle.partitions",3)
spark.conf.get("spark.sql.shuffle.partitions")
spark.conf.set("spark.sql.adaptive.enabled","false")
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled","false")
spark.conf.set("spark.sql.execution.arrow.enabled",True)
spark.conf.set("spark.sql.execution.arrow.fallback.enabled",True)

"""
Pandas DF to spark DF
use mapParititonsWithIndex
"""

#WithoutInferSchema
#headerTrue will read first row and assign column names but type is String for all
#spark job created to read first column
filepath = "file:///C:/Users/venka/PycharmProjects/pythonProject/dataset/"
df = sc.textFile(filepath + "IntPageview.csv").map(lambda x: x.split(","))
df.collect()

[['Site', 'www.learntospark.com'],
 ['Desccription', '"Complete guide to learn Spark', 'AI', 'ML"'],
 ['Page Views of each blog'],
 ['20200817-20200817'],
 [''],
 ['Total data in page', '12'],
 [''],
 ['Page', 'Date', 'Pageviews', 'Unique_Pageviews', 'Sessions'],
 ['Guide to Install Spark', '2020-08-17', '1140', '986', '800'],
 ['Spark MAP vs FlatMap', '2020-08-17', '836', '800', '128'],
 ['Spark Architechture', '2020-08-17', '1569', '1345', '1400'],
 ['Azure Function for Mp3 to text', '2020-08-17', '350', '245', '234'],
 ['Scala Vs Python', '2020-08-17', '200', '150', '130'],
 ['Spark Window Function', '2020-08-17', '789', '546', '560'],
 ['Natural Language Processing', '2020-08-17', '467', '456', '100'],
 ['Spark Linear Interpolation - Time Series',
  '2020-08-17',
  '698',
  '345',
  '349'],
 ['Spark case statement', '2020-08-17', '234', '196', '120'],
 ['Spark Scenario Based Questions', '2020-08-17', '712', '329', '137'],
 ['Spark v3.0 Delta Lake', '2020-08-17', '333', '198', '39']

In [7]:
df.getNumPartitions()

1

In [10]:
#Difference between map(row by row basis) and mapPartitionsWithIndex(parition by partition)
#Apply index to partition wise and omit if the partition index is zero
#Here iter[8:] is remove lines upto 8 from zero

rdd_drop = df.mapPartitionsWithIndex(lambda idx,iter: list(iter)[8:] if (idx==0) else iter)
rdd_drop.collect()

[['Guide to Install Spark', '2020-08-17', '1140', '986', '800'],
 ['Spark MAP vs FlatMap', '2020-08-17', '836', '800', '128'],
 ['Spark Architechture', '2020-08-17', '1569', '1345', '1400'],
 ['Azure Function for Mp3 to text', '2020-08-17', '350', '245', '234'],
 ['Scala Vs Python', '2020-08-17', '200', '150', '130'],
 ['Spark Window Function', '2020-08-17', '789', '546', '560'],
 ['Natural Language Processing', '2020-08-17', '467', '456', '100'],
 ['Spark Linear Interpolation - Time Series',
  '2020-08-17',
  '698',
  '345',
  '349'],
 ['Spark case statement', '2020-08-17', '234', '196', '120'],
 ['Spark Scenario Based Questions', '2020-08-17', '712', '329', '137'],
 ['Spark v3.0 Delta Lake', '2020-08-17', '333', '198', '39'],
 ['Screen Recorder App using Python', '2020-08-17', '766', '567', '344'],
 ['Spark trick with Show()', '2020-08-17', '108', '35', '24'],
 ['Spark Cache() Vs Persist', '2020-08-17', '587', '432', '300'],
 ['Image Processing text to audio', '2020-08-17', '384', '1

In [42]:
schema = ["Page", "Date", "PageViews", "UniqueViews","Session"]

In [43]:
df2 = spark.createDataFrame(rdd_drop,schema)

In [44]:
df2.show()

+--------------------+----------+---------+-----------+-------+
|                Page|      Date|PageViews|UniqueViews|Session|
+--------------------+----------+---------+-----------+-------+
|Guide to Install ...|2020-08-17|     1140|        986|    800|
|Spark MAP vs FlatMap|2020-08-17|      836|        800|    128|
| Spark Architechture|2020-08-17|     1569|       1345|   1400|
|Azure Function fo...|2020-08-17|      350|        245|    234|
|     Scala Vs Python|2020-08-17|      200|        150|    130|
|Spark Window Func...|2020-08-17|      789|        546|    560|
|Natural Language ...|2020-08-17|      467|        456|    100|
|Spark Linear Inte...|2020-08-17|      698|        345|    349|
|Spark case statement|2020-08-17|      234|        196|    120|
|Spark Scenario Ba...|2020-08-17|      712|        329|    137|
|Spark v3.0 Delta ...|2020-08-17|      333|        198|     39|
|Screen Recorder A...|2020-08-17|      766|        567|    344|
|Spark trick with ...|2020-08-17|      1

In [49]:
sc.textFile(filepath + "IntPageview.csv").zipWithIndex().filter(lambda x: x[1]>= 8).collect()

[('Guide to Install Spark,2020-08-17,1140,986,800', 8),
 ('Spark MAP vs FlatMap,2020-08-17,836,800,128', 9),
 ('Spark Architechture,2020-08-17,1569,1345,1400', 10),
 ('Azure Function for Mp3 to text,2020-08-17,350,245,234', 11),
 ('Scala Vs Python,2020-08-17,200,150,130', 12),
 ('Spark Window Function,2020-08-17,789,546,560', 13),
 ('Natural Language Processing,2020-08-17,467,456,100', 14),
 ('Spark Linear Interpolation - Time Series,2020-08-17,698,345,349', 15),
 ('Spark case statement,2020-08-17,234,196,120', 16),
 ('Spark Scenario Based Questions,2020-08-17,712,329,137', 17),
 ('Spark v3.0 Delta Lake,2020-08-17,333,198,39', 18),
 ('Screen Recorder App using Python,2020-08-17,766,567,344', 19),
 ('Spark trick with Show(),2020-08-17,108,35,24', 20),
 ('Spark Cache() Vs Persist,2020-08-17,587,432,300', 21),
 ('Image Processing text to audio,2020-08-17,384,123,84', 22)]

In [47]:
#(As zipWithIndex starts with 0) , here index column is present at last and is removed with second map transformation
#the map transformation splits each line by a comma (,), converting it into a list of strings. 
zipdf = sc.textFile(filepath + "IntPageview.csv").zipWithIndex().filter(lambda x: x[1]>= 8).map(lambda x: x[0].split(","))
df3 = spark.createDataFrame(zipdf,schema)
df3.show()

+--------------------+----------+---------+-----------+-------+
|                Page|      Date|PageViews|UniqueViews|Session|
+--------------------+----------+---------+-----------+-------+
|Guide to Install ...|2020-08-17|     1140|        986|    800|
|Spark MAP vs FlatMap|2020-08-17|      836|        800|    128|
| Spark Architechture|2020-08-17|     1569|       1345|   1400|
|Azure Function fo...|2020-08-17|      350|        245|    234|
|     Scala Vs Python|2020-08-17|      200|        150|    130|
|Spark Window Func...|2020-08-17|      789|        546|    560|
|Natural Language ...|2020-08-17|      467|        456|    100|
|Spark Linear Inte...|2020-08-17|      698|        345|    349|
|Spark case statement|2020-08-17|      234|        196|    120|
|Spark Scenario Ba...|2020-08-17|      712|        329|    137|
|Spark v3.0 Delta ...|2020-08-17|      333|        198|     39|
|Screen Recorder A...|2020-08-17|      766|        567|    344|
|Spark trick with ...|2020-08-17|      1