https://runawayhorse001.github.io/LearningApacheSpark/rdd.html

https://github.com/spark-examples/pyspark-examples.git

In [1]:
import os

import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import *

# Obtain the session for this notebook via the builder's getOrCreate().
spark = (
    SparkSession.builder
    .appName("chapter-12-RDD-loop")
    .getOrCreate()
)

# Location of the book's data, taken from the environment.
# A missing variable raises KeyError right here.
SPARK_BOOK_DATA_PATH = os.environ['SPARK_BOOK_DATA_PATH']

In [17]:
# Column names, in the same order as the fields of each tuple in `data`.
columns = ["firstname", "lastname", "gender", "age", "salary"]

# Sample records: (firstname, lastname, gender, age, salary).
# NOTE(review): the lowercase "m" and the None gender look deliberate, since
# func1 further down upper-cases the code and maps unknown values.
data = [
    ("James", "Smith", "M", 30, 55000.0),
    ("Anna", "Rose", "F", 41, 95000.0),
    ("Robert", "Williams", "m", 62, 75000.0),
    ("Frank", "Coleman", None, 22, 45000.0),
]

In [12]:
# Build the DataFrame straight from the local list of tuples, naming the
# columns after `columns`.
df = spark.createDataFrame(data, schema=columns)
df.show()

+---------+--------+------+---+-------+
|firstname|lastname|gender|age| salary|
+---------+--------+------+---+-------+
|    James|   Smith|     M| 30|55000.0|
|     Anna|    Rose|     F| 41|95000.0|
|   Robert|Williams|     M| 62|75000.0|
|    Frank| Coleman|  null| 22|45000.0|
+---------+--------+------+---+-------+



In [4]:
# Alternative route: distribute the local list as an RDD first, then convert
# the RDD to a DataFrame using the same column names.
rdd = spark.sparkContext.parallelize(data)
df2 = rdd.toDF(schema=columns)
df2.show()

+---------+--------+------+---+-------+
|firstname|lastname|gender|age| salary|
+---------+--------+------+---+-------+
|    James|   Smith|     M| 30|55000.0|
|     Anna|    Rose|     F| 41|95000.0|
|   Robert|Williams|     M| 62|75000.0|
+---------+--------+------+---+-------+



In [5]:
# Map over the DataFrame's underlying RDD, reading each Row's fields by
# position and adding two derived values: "first,last" and salary * 1.2.
rdd = df.rdd.map(
    lambda row: (
        row[0],
        row[1],
        row[0] + "," + row[1],
        row[2],
        row[3],
        row[4],
        row[4] * 1.2,
    )
)
df2 = rdd.toDF(
    ["firstname", "lastname", "name", "gender", "age", "salary", "new_salary"]
)
df2.show()

+---------+--------+---------------+------+---+-------+----------+
|firstname|lastname|           name|gender|age| salary|new_salary|
+---------+--------+---------------+------+---+-------+----------+
|    James|   Smith|    James,Smith|     M| 30|55000.0|   66000.0|
|     Anna|    Rose|      Anna,Rose|     F| 41|95000.0|  114000.0|
|   Robert|Williams|Robert,Williams|     M| 62|75000.0|   90000.0|
+---------+--------+---------------+------+---+-------+----------+



In [7]:
# Referring to columns by name instead of by position.
# Attribute access (row.firstname) is used here; key access
# (row["firstname"]) is the equivalent alternative spelling.
rdd2 = df.rdd.map(
    lambda row: (
        row.firstname + "," + row.lastname,
        row.gender,
        row.salary * 1.2,
    )
)
df2 = rdd2.toDF(["name", "gender", "new_salary"])
df2.show()

+---------------+------+----------+
|           name|gender|new_salary|
+---------------+------+----------+
|    James,Smith|     M|   66000.0|
|      Anna,Rose|     F|  114000.0|
|Robert,Williams|     M|   90000.0|
+---------------+------+----------+



In [18]:
def func1(x):
    """Turn one record into a ``(name, gender, salary)`` tuple.

    Args:
        x: Object exposing ``firstname``, ``lastname``, ``gender`` and
            ``salary`` attributes.

    Returns:
        A 3-tuple of:
        - ``"<firstname>,<lastname>"``;
        - ``"Male"`` / ``"Female"`` for a gender code of M / F in any case,
          otherwise ``"Unknown"`` (this includes ``None`` and ``""``);
        - the salary multiplied by 2.
    """
    labels = {"M": "Male", "F": "Female"}
    full_name = x.firstname + "," + x.lastname
    # Upper-case only truthy codes; None / "" pass through unchanged and then
    # fall back to "Unknown" in the lookup.
    code = x.gender.upper() if x.gender else x.gender
    return (full_name, labels.get(code, "Unknown"), x.salary * 2)

# Apply func1 to every Row; the function is passed directly, no wrapper needed.
rdd2 = df.rdd.map(func1)
df2 = rdd2.toDF(["name", "gender", "new_salary"])
df2.show()

+---------------+-------+----------+
|           name| gender|new_salary|
+---------------+-------+----------+
|    James,Smith|   Male|  110000.0|
|      Anna,Rose| Female|  190000.0|
|Robert,Williams|   Male|  150000.0|
|  Frank,Coleman|Unknown|   90000.0|
+---------------+-------+----------+



In [22]:
# foreach() runs the function on the executors, so print() writes there and
# the text is not shown as output of this cell.
df.rdd.foreach(
    lambda row: print(
        "Data ==>" + ",".join(
            [
                row["firstname"],
                row["lastname"],
                str(row["gender"]),
                str(row["salary"] * 1.2),
            ]
        )
    )
)

In [27]:
# Iterate over the rows on the driver side, so print() output appears in the
# notebook.
dataCollect = df.rdd.toLocalIterator()
for row in dataCollect:
    fields = [
        row["firstname"],
        row["lastname"],
        str(row["gender"]),
        str(row["salary"] * 1.2),
    ]
    print("Row => " + ", ".join(fields))

Row => James, Smith, M, 66000.0
Row => Anna, Rose, F, 114000.0
Row => Robert, Williams, M, 90000.0
Row => Frank, Coleman, None, 54000.0


In [28]:
import pandas as pd

# Convert the Spark DataFrame to a pandas DataFrame and loop over its rows
# with iterrows(), which yields (index label, row Series) pairs.
pandasDF = df.toPandas()
for row_label, record in pandasDF.iterrows():
    print(record['firstname'], record['gender'])

James M
Anna F
Robert M
Frank None
