### Initializing a SparkContext

In [1]:
from pyspark import SparkConf, SparkContext

# Configure the application step by step: local master, application name "My App".
conf = SparkConf()
conf.setMaster("local")
conf.setAppName("My App")

# Start the SparkContext that the RDD cells below use.
sc = SparkContext(conf=conf)

In [6]:
# Evaluate the context as the cell's last expression so the notebook displays it.
sc

In [7]:
# Read the project readme as an RDD with one element per line of text,
# then count how many lines it has.
lines = sc.textFile('../Readme.md')
lines.count()

6

In [9]:
# Return the first line of the file.
lines.first()

'### Exploration using Spark on interactive analysis'

In [14]:
# Keep only the lines that contain the substring "Docker" (case-sensitive),
# then count how many lines matched.
filterLines = lines.filter(lambda line: "Docker" in line)
filterLines.count()

1

In [14]:
# Read ../data/README.md as an RDD with one element per line of text.
myTextFile = sc.textFile("../data/README.md")

In [15]:
# Drop the empty lines: keep every line that is not the empty string.
non_emptyLines = myTextFile.filter(lambda text: text != "")

In [16]:
# Build the RDD "words": split every line into tokens, then flatten the per-line
# lists into one RDD of words.
# split() with no argument splits on any run of whitespace, so indented lines,
# tabs and double spaces do not produce empty-string "words" that would then be
# counted (split(' ') yields '' for every extra space).
words = non_emptyLines.flatMap(lambda line: line.split())

In [17]:
# Count the lines that remain after the empty ones were removed.
non_emptyLines.count()

65

In [20]:
# Return the first word of the flattened RDD.
words.first()

'#'

In [21]:
# Word count as a chain of four steps:
#   1. map()         - pair every word with a count of one, e.g. ('apple', 1)
#   2. reduceByKey() - merge the counts of each distinct word with an associative
#                      and commutative function, e.g. ('apple', 5)
#   3. map()         - swap each tuple so the count becomes the key, e.g. (5, 'apple')
#   4. sortByKey()   - sort by that key in descending order (ascending=False)
wordCount = (
    words
    .map(lambda word: (word, 1))
    .reduceByKey(lambda left, right: left + right)
    .map(lambda pair: (pair[1], pair[0]))
    .sortByKey(ascending=False)
)

In [22]:
# Save this RDD as text, using the string representation of each element.
# Note: the output is a directory of part-00000, part-00001, ... files, one per
# partition of the RDD, which shows how the job was split across partitions.
wordCount.saveAsTextFile("wordCountResult")

In [23]:
# To get a single output file, reduce the RDD to one partition with coalesce() before saving.
# coalesce(numPartitions) returns a new RDD that is reduced into `numPartitions` partitions.
wordCount.coalesce(1).saveAsTextFile("wordCountResult2")

In [24]:
# Shut down this SparkContext; the Transformation section below creates a new one.
sc.stop()

### Transformation

In [25]:
# Create the Spark configuration and a new SparkContext for the DataFrame examples.

from pyspark import SparkContext, SparkConf

conf = SparkConf()
conf.setAppName("My Dataframe")
sc = SparkContext(conf=conf)

In [26]:
from pyspark.sql import SparkSession  # DataFrames live in the pyspark.sql module

# Obtain the session through the builder, the documented entry point, instead of
# calling the SparkSession constructor directly. getOrCreate() attaches to the
# SparkContext that is already running (`sc`) and returns the existing session
# when this cell is run again.
spark = SparkSession.builder.getOrCreate()

In [27]:
# myRange is a Spark DataFrame with a single column, named "number", holding
# 1,000 rows with the values 0 to 999.
# When run on a cluster, each part of this range of numbers exists on a different executor.
myRange = spark.range(1000).toDF("number")

In [28]:
# Transformation: keep only the rows whose number is even.
# filter() is the method that `where` is an alias for, so the result is the same.
divisBy2 = myRange.filter("number % 2 = 0")

In [34]:
# Print the first two records of the DataFrame as a text table.
myRange.show(2)

+------+
|number|
+------+
|     0|
|     1|
+------+
only showing top 2 rows



In [38]:
# Return the total number of records in the filtered DataFrame (the even numbers).
divisBy2.count()

500

In [39]:
# Return all the records as a Python list of Row objects.
divisBy2.collect()

[Row(number=0),
 Row(number=2),
 Row(number=4),
 Row(number=6),
 Row(number=8),
 Row(number=10),
 Row(number=12),
 Row(number=14),
 Row(number=16),
 Row(number=18),
 Row(number=20),
 Row(number=22),
 Row(number=24),
 Row(number=26),
 Row(number=28),
 Row(number=30),
 Row(number=32),
 Row(number=34),
 Row(number=36),
 Row(number=38),
 Row(number=40),
 Row(number=42),
 Row(number=44),
 Row(number=46),
 Row(number=48),
 Row(number=50),
 Row(number=52),
 Row(number=54),
 Row(number=56),
 Row(number=58),
 Row(number=60),
 Row(number=62),
 Row(number=64),
 Row(number=66),
 Row(number=68),
 Row(number=70),
 Row(number=72),
 Row(number=74),
 Row(number=76),
 Row(number=78),
 Row(number=80),
 Row(number=82),
 Row(number=84),
 Row(number=86),
 Row(number=88),
 Row(number=90),
 Row(number=92),
 Row(number=94),
 Row(number=96),
 Row(number=98),
 Row(number=100),
 Row(number=102),
 Row(number=104),
 Row(number=106),
 Row(number=108),
 Row(number=110),
 Row(number=112),
 Row(number=114),
 Row(number