In [8]:
# RDDs are the main logical data units in Spark. They are a distributed collection of objects, 
# which are stored in memory or on disks of different machines of a cluster. A single RDD can be
# divided into multiple logical partitions so that these partitions can be stored
# and processed on different machines of a cluster.

In [9]:
# Resilience: RDDs track data lineage information to recover lost data, automatically on failure. 
# It is also called fault tolerance.

# Distributed: Data present in an RDD resides on multiple nodes. It is distributed across different nodes of a cluster.

# Lazy evaluation: Data does not get loaded in an RDD even if you define it. Transformations are actually computed 
# when you call an action, 
# such as count or collect, or save the output to a file system.

In [1]:
import pyspark

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('jvt').getOrCreate()

# Dats structure 1 : List 

data = [  ("James","Smith","USA","CA"),
          ("Michael","Rose","USA","NY"),
          ("Robert","Williams","USA","CA"),
           ("Maria","Jones","USA","FL")
  ]

print ( data[0:2] )

columns = ["firstname","lastname","country","state"]

# Dats structure 2 : Data frame 

df = spark.createDataFrame(data = data, schema = columns)

df.show(truncate=False)

df.head()


[('James', 'Smith', 'USA', 'CA'), ('Michael', 'Rose', 'USA', 'NY')]
+---------+--------+-------+-----+
|firstname|lastname|country|state|
+---------+--------+-------+-----+
|James    |Smith   |USA    |CA   |
|Michael  |Rose    |USA    |NY   |
|Robert   |Williams|USA    |CA   |
|Maria    |Jones   |USA    |FL   |
+---------+--------+-------+-----+



Row(firstname='James', lastname='Smith', country='USA', state='CA')

In [2]:
df.select("firstname","lastname").show()

+---------+--------+
|firstname|lastname|
+---------+--------+
|    James|   Smith|
|  Michael|    Rose|
|   Robert|Williams|
|    Maria|   Jones|
+---------+--------+



In [None]:
from pyspark.sql.functions import col

df.select(col("firstname"),col("lastname")).show()

In [13]:

data = [
        ( ("James",None,"Smith"),"OH","M"),
        (("Anna","Rose",""),"NY","F"),
        (("Julia","","Williams"),"OH","F"),
        (("Maria","Anne","Jones"),"NY","M"),
        (("Jen","Mary","Brown"),"NY","M"),
        (("Mike","Mary","Williams"),"OH","M")
        ]

from pyspark.sql.types import StructType,StructField, StringType 


schema = StructType([
    StructField('name', StructType([
         StructField('firstname', StringType(), True),
         StructField('middlename', StringType(), True),
         StructField('lastname', StringType(), True)
         ])),
     StructField('state', StringType(), True),
     StructField('gender', StringType(), True)
     ])


df2 = spark.createDataFrame(data = data, schema = schema)

df2.printSchema()

df2.show(truncate=False) # shows all columns


root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- state: string (nullable = true)
 |-- gender: string (nullable = true)

+----------------------+-----+------+
|name                  |state|gender|
+----------------------+-----+------+
|[James,, Smith]       |OH   |M     |
|[Anna, Rose, ]        |NY   |F     |
|[Julia, , Williams]   |OH   |F     |
|[Maria, Anne, Jones]  |NY   |M     |
|[Jen, Mary, Brown]    |NY   |M     |
|[Mike, Mary, Williams]|OH   |M     |
+----------------------+-----+------+



In [14]:
df2.select("name").show(truncate=False)

+----------------------+
|name                  |
+----------------------+
|[James,, Smith]       |
|[Anna, Rose, ]        |
|[Julia, , Williams]   |
|[Maria, Anne, Jones]  |
|[Jen, Mary, Brown]    |
|[Mike, Mary, Williams]|
+----------------------+



In [14]:
df2.select("name.firstname","name.lastname").show(truncate=False)

+---------+--------+
|firstname|lastname|
+---------+--------+
|James    |Smith   |
|Anna     |        |
|Julia    |Williams|
|Maria    |Jones   |
|Jen      |Brown   |
|Mike     |Williams|
+---------+--------+



In [16]:
df2.select("name.*").show(truncate=False)

+---------+----------+--------+
|firstname|middlename|lastname|
+---------+----------+--------+
|James    |null      |Smith   |
|Anna     |Rose      |        |
|Julia    |          |Williams|
|Maria    |Anne      |Jones   |
|Jen      |Mary      |Brown   |
|Mike     |Mary      |Williams|
+---------+----------+--------+



In [None]:
map()	Returns a new RDD by applying the function on each data element

filter()	Returns a new RDD formed by selecting those elements of the source on which the function returns true

reduceByKey()	Aggregates the values of a key using a function

groupByKey()	Converts a (key, value) pair into a (key, <iterable value>) pair

union()	Returns a new RDD that contains all elements and arguments from the source RDD

intersection()	Returns a new RDD that contains an intersection of the elements in the datasets

In [None]:

count()	Gets the number of data elements in an RDD

collect()	Gets all the data elements in an RDD as an array

reduce()	Aggregates data elements into an RDD by taking two arguments and returning one

take(n)	Fetches the first n elements of an RDD

foreach(operation)	Executes the operation for each data element in an RDD

first()	Retrieves the first data element of an RDD


In [15]:
import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('gogetit').getOrCreate()

simpleData = [("James",34,"2006-01-01","true","M",3000.60),
    ("Michael",33,"1980-01-10","true","F",3300.80),
    ("Robert",37,"06-01-1992","false","M",5000.50)
  ]

columns = ["firstname","age","jobStartDate","isGraduated","gender","salary"]
df = spark.createDataFrame(data = simpleData, schema = columns)
df.printSchema()
df.show(truncate=False)

root
 |-- firstname: string (nullable = true)
 |-- age: long (nullable = true)
 |-- jobStartDate: string (nullable = true)
 |-- isGraduated: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: double (nullable = true)

+---------+---+------------+-----------+------+------+
|firstname|age|jobStartDate|isGraduated|gender|salary|
+---------+---+------------+-----------+------+------+
|James    |34 |2006-01-01  |true       |M     |3000.6|
|Michael  |33 |1980-01-10  |true       |F     |3300.8|
|Robert   |37 |06-01-1992  |false      |M     |5000.5|
+---------+---+------------+-----------+------+------+



In [2]:
from pyspark.sql.functions import col
from pyspark.sql.types import StringType,BooleanType,DateType

df2 = df.withColumn("age",col("age").cast(StringType())) \
    .withColumn("isGraduated",col("isGraduated").cast(BooleanType())) \
    .withColumn("jobStartDate",col("jobStartDate").cast(DateType()))
df2.printSchema()

df3 = df2.selectExpr("cast(age as int) age",
    "cast(isGraduated as string) isGraduated",
    "cast(jobStartDate as string) jobStartDate")
df3.printSchema()
df3.show(truncate=False)

df3.createOrReplaceTempView("CastExample")
df4 = spark.sql("SELECT STRING(age),BOOLEAN(isGraduated),DATE(jobStartDate) from CastExample")
df4.printSchema()
df4.show(truncate=False)

root
 |-- firstname: string (nullable = true)
 |-- age: string (nullable = true)
 |-- jobStartDate: date (nullable = true)
 |-- isGraduated: boolean (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: double (nullable = true)

root
 |-- age: integer (nullable = true)
 |-- isGraduated: string (nullable = true)
 |-- jobStartDate: string (nullable = true)

+---+-----------+------------+
|age|isGraduated|jobStartDate|
+---+-----------+------------+
|34 |true       |2006-01-01  |
|33 |true       |1980-01-10  |
|37 |false      |null        |
+---+-----------+------------+

root
 |-- age: string (nullable = true)
 |-- isGraduated: boolean (nullable = true)
 |-- jobStartDate: date (nullable = true)

+---+-----------+------------+
|age|isGraduated|jobStartDate|
+---+-----------+------------+
|34 |true       |2006-01-01  |
|33 |true       |1980-01-10  |
|37 |false      |null        |
+---+-----------+------------+



In [None]:
.ArrayType()
.IntegerType()
.LongType()
.StructField()
.StructType()
.StringType()
.DoubleType()
.Row()
.FloatType()
.BinaryType()
.TimestampType()
.DataType()

In [4]:
from pyspark.sql.session import SparkSession
from pyspark.sql.types import ArrayType, StructField, StructType, StringType, IntegerType

appName = "PySpark Example - JSON file to Spark Data Frame"
master = "local"

# Create Spark session
spark = SparkSession.builder \
    .appName(appName) \
    .master(master) \
    .config("spark.sql.shuffle.partitions", "4") \
    .getOrCreate()


schema = StructType([
    StructField('ID', IntegerType(), True),
    StructField('name', StringType(), True),
    StructField('DOB', StringType(), True),
    StructField('Gender', StringType(), True),
    StructField('Age', IntegerType(), True)
])


json_file_path = 'C:/Users/jvt/PYSPARKTRAINING/24septsolutions/file.txt'
json_file_path1 = 'C:/Users/jvt/PYSPARKTRAINING/24septsolutions/jsaonoutput.json'

df1 = spark.read.json(json_file_path, schema, multiLine=True)
df1.show(4)


df2 = spark.read.json(json_file_path1, schema, multiLine=True)
d=df1.join(df2, df1.ID == df2.ID).filter(df1.ID==1000)

d.show()

+----+----+----+------+----+
|  ID|name| DOB|Gender| Age|
+----+----+----+------+----+
|null|null|null|  null|null|
+----+----+----+------+----+

+---+----+---+------+---+---+----+---+------+---+
| ID|name|DOB|Gender|Age| ID|name|DOB|Gender|Age|
+---+----+---+------+---+---+----+---+------+---+
+---+----+---+------+---+---+----+---+------+---+



In [18]:
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, StructField, StructType, StringType, IntegerType

appName = "PySpark Example - JSON file to Spark Data Frame"
master = "local"

# Create Spark session
spark = SparkSession.builder \
    .appName(appName) \
    .master(master) \
    .config("spark.sql.shuffle.partitions", "4") \
    .getOrCreate()

schema = StructType([
    StructField('ID', StringType(), True),
    StructField('name', StringType(), True),
    StructField('DOB', StringType(), True),
    StructField('Gender', StringType(), True),
    StructField('Age', StringType(), True)
])


json_file_path = 'C:/Users/jvt/PYSPARKTRAINING/24septsolutions/data.json'
df1 = spark.read.json(json_file_path, schema, multiLine=True)
print(df1)
# data = json.load(df1)
df_json = df1.toJSON()
print(df_json)

for row in df1.rdd.collect():
    print(row.name,row.Age)





DataFrame[ID: string, name: string, DOB: string, Gender: string, Age: string]
MapPartitionsRDD[110] at toJavaRDD at <unknown>:0
John Smith  20
Jim McDonald  25
Jim McDonald 25


In [1]:
from collections import namedtuple

user_row = namedtuple('user_row', 'dob age is_fan'.split())

data = [
    user_row('1990-05-03', 29, True),
    user_row('1994-09-23', 25, False)
]

In [2]:
user_row

__main__.user_row

In [3]:
data

[user_row(dob='1990-05-03', age=29, is_fan=True),
 user_row(dob='1994-09-23', age=25, is_fan=False)]

In [4]:
from pyspark.context import SparkContext

from pyspark.sql.session import SparkSession

sc = SparkContext('local')

spark = SparkSession(sc)

In [6]:

user_df = spark.createDataFrame(data)

user_df.printSchema()

root
 |-- dob: string (nullable = true)
 |-- age: long (nullable = true)
 |-- is_fan: boolean (nullable = true)



In [7]:
user_df.show()

+----------+---+------+
|       dob|age|is_fan|
+----------+---+------+
|1990-05-03| 29|  true|
|1994-09-23| 25| false|
+----------+---+------+



In [19]:
data_list = [
    ('1990-05-03', 29, True),
    ('1994-09-23', 25, False)
]

data = [ {'dob': r[0], 'age': r[1], 'is_fan': r[2]} for r in data_list  ]

user_df = spark.createDataFrame(data)



In [20]:
user_df

DataFrame[age: bigint, dob: string, is_fan: boolean]

In [21]:
user_df.printSchema()

root
 |-- age: long (nullable = true)
 |-- dob: string (nullable = true)
 |-- is_fan: boolean (nullable = true)



In [22]:
user_df.show()

+---+----------+------+
|age|       dob|is_fan|
+---+----------+------+
| 29|1990-05-03|  true|
| 25|1994-09-23| false|
+---+----------+------+



In [14]:
from pyspark.sql import Row

user_row = Row("dob", "age", "is_fan")

data = [
    user_row('1990-05-03', 29, True),
    user_row('1994-09-23', 25, False)
]

In [15]:
data

[Row(dob='1990-05-03', age=29, is_fan=True),
 Row(dob='1994-09-23', age=25, is_fan=False)]

In [16]:
user_df = spark.createDataFrame(data)
user_df.show()

+----------+---+------+
|       dob|age|is_fan|
+----------+---+------+
|1990-05-03| 29|  true|
|1994-09-23| 25| false|
+----------+---+------+



In [17]:
data = [
    ('1990-05-03', 29, True),
    ('1994-09-23', 25, False)
]
df = spark.createDataFrame(data, ['dob', 'age', 'is_fan'])
df.show()

+----------+---+------+
|       dob|age|is_fan|
+----------+---+------+
|1990-05-03| 29|  true|
|1994-09-23| 25| false|
+----------+---+------+



In [7]:
import pyspark.sql.types as st

data = [
    ('1990-05-03', 29, True),
    ('1994-09-23', 25, False)
]

user_schema = st.StructType([
    st.StructField('dob', st.StringType(), True),
    st.StructField('age', st.IntegerType(), True),
    st.StructField('is_fan', st.BooleanType(), True)
])

user_df = spark.createDataFrame(data, user_schema)
user_df.show()

+----------+---+------+
|       dob|age|is_fan|
+----------+---+------+
|1990-05-03| 29|  true|
|1994-09-23| 25| false|
+----------+---+------+



In [19]:
data = [
    ('1990-05-03', 29, True),
    ('1994-09-23', 25, False)
]

user_schema = "dob:string, age:int, is_fan: boolean"

user_df = spark.createDataFrame(data, user_schema)
user_df.show()

+----------+---+------+
|       dob|age|is_fan|
+----------+---+------+
|1990-05-03| 29|  true|
|1994-09-23| 25| false|
+----------+---+------+



In [23]:
import pandas as pd

iris_data = pd.read_json("https://raw.githubusercontent.com/domoritz/maps/master/data/iris.json")

iris_data.head(2)

Unnamed: 0,petalLength,petalWidth,sepalLength,sepalWidth,species
0,1.4,0.2,5.1,3.5,setosa
1,1.4,0.2,4.9,3.0,setosa


In [24]:
import json
import pyspark.sql.types as st

schema_json_str = """
{
  "type": "struct",
  "fields": [
    {
      "name": "dob",
      "type": "string",
      "nullable": true,
      "metadata": {}
    },
    {
      "name": "age",
      "type": "integer",
      "nullable": true,
      "metadata": {}
    },
    {
      "name": "is_fan",
      "type": "boolean",
      "nullable": true,
      "metadata": {}
    }
  ]
}
"""

# Parse JSON string into python dictionary
schema_dict = json.loads(schema_json_str)

print(schema_dict)

# Create StructType from python dictionary
schema = st.StructType.fromJson(schema_dict)


data = [
    ('1990-05-03', 29, True),
    ('1994-09-23', 25, False)
]

user_df = spark.createDataFrame(data, schema)
user_df.show()



{'type': 'struct', 'fields': [{'name': 'dob', 'type': 'string', 'nullable': True, 'metadata': {}}, {'name': 'age', 'type': 'integer', 'nullable': True, 'metadata': {}}, {'name': 'is_fan', 'type': 'boolean', 'nullable': True, 'metadata': {}}]}
+----------+---+------+
|       dob|age|is_fan|
+----------+---+------+
|1990-05-03| 29|  true|
|1994-09-23| 25| false|
+----------+---+------+

