# Spark using Python

In [None]:
!pip install pyspark
!pip install findspark

In [None]:
import findspark

In [None]:
# Locate the Spark installation and make pyspark importable.
# init() sets SPARK_HOME and adds pyspark to sys.path, whereas find() only
# returns the installation path without configuring anything.
findspark.init()

In [None]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

## create the spark session and context

In [None]:
# Reuse the running SparkContext if there is one. Calling SparkContext() a
# second time (e.g. when this cell is re-run) raises a ValueError because
# only one context may be active at a time.
sc = SparkContext.getOrCreate()

# Build the SparkSession, the entry point for DataFrames and Spark SQL.
# getOrCreate() returns the existing session when one is already active.
spark = (
    SparkSession.builder
    .appName("Python Spark DataFrames basic example")
    .config("spark.some.config.option", "some-value")  # placeholder option
    .getOrCreate()
)

## Verify the Spark session

In [None]:
# Evaluate the session object so the notebook shows it as the cell output.
spark

## RDDs : Resilient Distributed Datasets

### Create an RDD containing the integers from 1 to 30

Create the RDD by "sc.parallelize()"

In [None]:
# range() excludes its stop value, so stop at 31 to include 30.
data = range(1, 31)
# Distribute the data across 4 partitions.
xRDD = sc.parallelize(data, 4)

In [None]:
# Evaluate the RDD object so the notebook shows it as the cell output.
xRDD

### Transformation

Subtract 1 from each element of the RDD with `map`, then keep only the elements that are less than 10 with `filter`.

lambda is an inline function in Python

In [None]:
# Transformations are lazy: nothing is computed until an action is called.
# Subtract 1 from every element.
newRDD1 = xRDD.map(lambda x: x - 1)
# Keep only the elements smaller than 10. The filter must be applied to the
# mapped RDD, newRDD1 (the name `newRDD` is never defined).
filteredRDD = newRDD1.filter(lambda x: x < 10)


### Actions

In [None]:
# collect() brings every element back to the driver as a Python list.
collected = filteredRDD.collect()
print(collected)
# count() returns the number of elements; as the last expression of the
# cell, its value is shown as the cell output.
filteredRDD.count()

## DataFrames and SparkSQL

In [None]:
# Load the JSON file into a DataFrame, then mark it to be cached.
df = spark.read.json("filename.json")
df = df.cache()

In [None]:
# Print the DataFrame's rows first, then its schema.
df.show()
df.printSchema()

In [None]:
# Register the DataFrame as a SQL temporary view named "people".
# createOrReplaceTempView() lets this cell be re-run safely; createTempView()
# raises an error when a view with that name already exists.
df.createOrReplaceTempView("people")

In [None]:
# Run each SQL query against the "people" view and print its result table.
queries = (
    "SELECT name FROM people",
    "SELECT age, name FROM people WHERE age > 21",
    "SELECT age, COUNT(age) as count FROM people GROUP BY age",
)
for query in queries:
    spark.sql(query).show()