## Setting Environment variables

In [1]:
import os
import sys
# Make the PySpark workers and the driver use the same interpreter that is
# running this notebook.
os.environ.update(
    {
        "PYSPARK_PYTHON": sys.executable,
        "PYSPARK_DRIVER_PYTHON": sys.executable,
    }
)

## Create a Spark Session

In [2]:
from pyspark.sql import SparkSession
# Obtain the session used by every cell below (an existing session is reused
# if one is already active).
spark = (
    SparkSession.builder
    .appName("Map and FlatMap")
    .getOrCreate()
)

## Operations over RDD

### Create RDD

In [3]:
# Six short phrases (with repeats) used as the input for the RDD examples.
data = [
    "Project Gutenberg’s",
    "Alice’s Adventures",
    "in Wonderland",
    "Project Gutenberg’s Adventures",
    "in Wonderland",
    "Project Gutenberg’s",
]

In [4]:
# Turn the local `data` list into an RDD through the session's SparkContext.
rdd = spark.sparkContext.parallelize(data)

### Map()

In [5]:
# map() is one-to-one: every phrase becomes a (phrase, PHRASE) tuple.
rdd2 = rdd.map(lambda phrase: (phrase, phrase.upper()))
for pair in rdd2.collect():
    print(pair)

('Project Gutenberg’s', 'PROJECT GUTENBERG’S')
('Alice’s Adventures', 'ALICE’S ADVENTURES')
('in Wonderland', 'IN WONDERLAND')
('Project Gutenberg’s Adventures', 'PROJECT GUTENBERG’S ADVENTURES')
('in Wonderland', 'IN WONDERLAND')
('Project Gutenberg’s', 'PROJECT GUTENBERG’S')


### Map()

In [6]:
# Same one-to-one mapping, this time keeping only the upper-cased phrase.
rdd2 = rdd.map(lambda phrase: phrase.upper())
for upper_phrase in rdd2.collect():
    print(upper_phrase)

PROJECT GUTENBERG’S
ALICE’S ADVENTURES
IN WONDERLAND
PROJECT GUTENBERG’S ADVENTURES
IN WONDERLAND
PROJECT GUTENBERG’S


### FlatMap()

In [7]:
# flatMap() flattens the list returned for each phrase, so every word ends up
# as its own element of the resulting RDD.
rdd2 = rdd.flatMap(lambda phrase: phrase.split(" "))
for word in rdd2.collect():
    print(word)

Project
Gutenberg’s
Alice’s
Adventures
in
Wonderland
Project
Gutenberg’s
Adventures
in
Wonderland
Project
Gutenberg’s


## Operations over DataFrame

### Creating DataFrame

In [8]:
# Column names, in the same order as the fields of each row tuple below.
columns = ["First_name", "Last_name", "gender", "Salary"]

# One tuple per person: (first name, last name, gender, salary).
data = [
    ("James", "Smith", "M", 30),
    ("Anna", "Jones", "F", 41),
    ("Robert", "Williams", "M", 60),
]

# Only column names are supplied, so Spark infers each column's type from the
# Python values; printSchema() shows the result.
df = spark.createDataFrame(data, schema=columns)
df.printSchema()

root
 |-- First_name: string (nullable = true)
 |-- Last_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- Salary: long (nullable = true)



### Map()

In [9]:
# Access Row fields by position: 0 = First_name, 1 = Last_name, 2 = gender,
# 3 = Salary. The name parts are joined with a space and the salary doubled.
rdd2 = df.rdd.map(
    lambda row: (row[0] + " " + row[1], row[2], row[3] * 2)
)
df2 = rdd2.toDF(schema=["full_name", "gender", "salary"])
df2.show(truncate=False)

+---------------+------+------+
|full_name      |gender|salary|
+---------------+------+------+
|James Smith    |M     |60    |
|Anna Jones     |F     |82    |
|Robert Williams|M     |120   |
+---------------+------+------+



In [10]:
# Keep a handle on the DataFrame's underlying RDD of Row objects.
rdd3 = df.rdd

In [11]:
# Row fields can also be looked up by column name instead of by position.
for row in rdd3.collect():
    print(row["First_name"])

James
Anna
Robert


In [12]:
# By calling the function
def funct1(x):
    """Build a ``(name, gender, salary)`` tuple from one employee row.

    Args:
        x: Row-like object exposing ``First_name``, ``Last_name``,
            ``gender`` and ``Salary`` as attributes.

    Returns:
        A 3-tuple of: the first and last name concatenated with no
        separator, the gender lower-cased, and the salary multiplied by 2.
    """
    return (
        x.First_name + x.Last_name,  # note: no space between the two parts
        x.gender.lower(),
        x.Salary * 2,
    )

In [13]:
# Apply funct1 to every Row; the function is passed directly, no lambda needed.
rdd2 = df.rdd.map(funct1)

In [14]:
# Name the three tuple positions produced by funct1 and display the result.
df2 = rdd2.toDF(schema=["full_name", "gender", "sal"])
df2.show()

+--------------+------+---+
|     full_name|gender|sal|
+--------------+------+---+
|    JamesSmith|     m| 60|
|     AnnaJones|     f| 82|
|RobertWilliams|     m|120|
+--------------+------+---+



In [15]:
# Bring the mapped tuples back to the driver and display them as a Python list.
rdd2.collect()

[('JamesSmith', 'm', 60), ('AnnaJones', 'f', 82), ('RobertWilliams', 'm', 120)]

### FlatMap()

In [16]:
# Sample rows of (name, known languages, properties). They mix None values,
# empty strings and an empty dict so the effect of explode() on such values
# can be seen below.
arrayData = [
    ("James", ["Java", "Scala"], {"hair": "black", "eye": "brown"}),
    ("Michael", ["Spark", "Java", None], {"hair": "brown", "eye": None}),
    ("Robert", ["CSharp", ""], {"hair": "red", "eye": ""}),
    ("Washington", None, None),
    ("Jefferson", ["1", "2"], {}),
]
df = spark.createDataFrame(
    arrayData,
    ["name", "knownLanguages", "properties"],
)

In [18]:
from pyspark.sql.functions import explode
# explode() on the map column yields one output row per map entry, exposed as
# `key` and `value` columns. Rows whose map is null or empty yield no rows,
# which is why Washington and Jefferson are absent from the last table.
df2 = df.select(df["name"], explode(df["properties"]))
df.printSchema()
df.show()
df2.show()

root
 |-- name: string (nullable = true)
 |-- knownLanguages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)

+----------+-------------------+--------------------+
|      name|     knownLanguages|          properties|
+----------+-------------------+--------------------+
|     James|      [Java, Scala]|{eye -> brown, ha...|
|   Michael|[Spark, Java, null]|{eye -> null, hai...|
|    Robert|         [CSharp, ]|{eye -> , hair ->...|
|Washington|               null|                null|
| Jefferson|             [1, 2]|                  {}|
+----------+-------------------+--------------------+

+-------+----+-----+
|   name| key|value|
+-------+----+-----+
|  James| eye|brown|
|  James|hair|black|
|Michael| eye| null|
|Michael|hair|brown|
| Robert| eye|     |
| Robert|hair|  red|
+-------+----+-----+

