https://runawayhorse001.github.io/LearningApacheSpark/rdd.html

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

# Obtain the session for this notebook; getOrCreate() hands back an already
# running session when one exists instead of building a second one.
spark = (
    SparkSession.builder
    .appName("chapter-12-RDD")
    .getOrCreate()
)

import os

# Location of the book's data files, taken from the environment.
# Indexing os.environ raises KeyError when the variable is not set.
SPARK_BOOK_DATA_PATH = os.environ["SPARK_BOOK_DATA_PATH"]

In [25]:
# Sample records: each one pairs an integer with a {int: float} mapping.
data = [
    (10, {3: 3.616726727464709, 4: 2.9996439803387602, 5: 1.6767412921625855}),
    (1, {3: 2.016527311459324, 4: -1.5271512313750577, 5: 1.9665475696370045}),
    (2, {3: 6.230272144805092, 4: 4.033642544526678, 5: 3.1517805604906313}),
    (3, {3: -0.3924680103722977, 4: 2.9757316477407443, 5: -1.5689126834176417}),
]

# Distribute the local list as an RDD.
rdd = spark.sparkContext.parallelize(data)

# Name the two tuple positions to get a DataFrame.
df = rdd.toDF(["CId", "Values"])

# explode() on the map column emits one row per entry; the two generated
# columns (key, value) are renamed to IID and Score.
df.select(
    "CId",
    F.explode("Values").alias("IID", "Score"),
).show()

+---+---+-------------------+
|CId|IID|              Score|
+---+---+-------------------+
| 10|  3|  3.616726727464709|
| 10|  4| 2.9996439803387602|
| 10|  5| 1.6767412921625855|
|  1|  3|  2.016527311459324|
|  1|  4|-1.5271512313750577|
|  1|  5| 1.9665475696370045|
|  2|  3|  6.230272144805092|
|  2|  4|  4.033642544526678|
|  2|  5| 3.1517805604906313|
|  3|  3|-0.3924680103722977|
|  3|  4| 2.9757316477407443|
|  3|  5|-1.5689126834176417|
+---+---+-------------------+



In [26]:
# Show the Python class of the object returned by parallelize().
type(rdd)

pyspark.rdd.RDD

In [19]:
# Build an RDD from a local Python list with parallelize(), then pull the
# elements back to the driver to inspect them.
myData = spark.sparkContext.parallelize(list(range(10)))
myData.collect()

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [20]:
# Show the Python class of myData.
type(myData)

pyspark.rdd.RDD

In [21]:
# myData holds bare ints, not rows, so toDF() cannot infer a schema from it.
# The failure is what this cell demonstrates: catch and display it instead of
# letting it abort a "Restart & Run All".
# NOTE(review): the exception type appears to depend on the PySpark version
# (ValueError when the leading 0 is treated as an empty first row, TypeError
# when the int type itself is rejected) -- confirm on the version in use.
# To convert successfully, wrap each value in a 1-tuple before calling
# toDF(), or start from spark.range() as the next cell does.
try:
    myData.toDF(["id"]).show()
except (TypeError, ValueError) as err:
    print(f"{type(err).__name__}: {err}")

ValueError: The first row in RDD is empty, can not infer schema

In [15]:
# spark.range(10) produces a DataFrame with an `id` column; going through
# .rdd and back with toDF() works because the records carry that field name.
df3 = (
    spark.range(10)
    .rdd
    .toDF()
)
df3.show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
+---+



In [5]:
# Build a four-column DataFrame from an RDD of tuples; toDF() supplies the
# column names. Note that this rebinds `df`, which an earlier cell also used.
df = spark.sparkContext.parallelize(
    [
        (1, 2, 3, "a b c"),
        (4, 5, 6, "d e f"),
        (7, 8, 9, "g h i"),
    ]
).toDF(["col1", "col2", "col3", "col4"])

In [6]:
# Print the DataFrame as a text table.
df.show()

+----+----+----+-----+
|col1|col2|col3| col4|
+----+----+----+-----+
|   1|   2|   3|a b c|
|   4|   5|   6|d e f|
|   7|   8|   9|g h i|
+----+----+----+-----+



In [7]:
# Bring every row back to the driver as a list of Row objects.
df.collect()

[Row(col1=1, col2=2, col3=3, col4='a b c'),
 Row(col1=4, col2=5, col3=6, col4='d e f'),
 Row(col1=7, col2=8, col3=9, col4='g h i')]

In [8]:
# Build a DataFrame straight from local rows with createDataFrame().
# Every field, including the numeric-looking ones, is supplied as a string.
# NOTE(review): 'Sallary' looks like a misspelling of 'Salary'; it is kept
# verbatim because it is the column name.
Employee = spark.createDataFrame(
    data=[
        ("1", "Joe", "70000", "1"),
        ("2", "Henry", "80000", "2"),
        ("3", "Sam", "60000", "2"),
        ("4", "Max", "90000", "1"),
    ],
    schema=["Id", "Name", "Sallary", "DepartmentId"],
)

In [9]:
# Print the column names and types of Employee.
Employee.printSchema()

root
 |-- Id: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sallary: string (nullable = true)
 |-- DepartmentId: string (nullable = true)



In [10]:
# Print the Employee rows as a text table.
Employee.show()

+---+-----+-------+------------+
| Id| Name|Sallary|DepartmentId|
+---+-----+-------+------------+
|  1|  Joe|  70000|           1|
|  2|Henry|  80000|           2|
|  3|  Sam|  60000|           2|
|  4|  Max|  90000|           1|
+---+-----+-------+------------+



In [27]:
# Row-oriented sample data: one inner list per row, one label per column.
my_list = [
    ["a", 1, 2],
    ["b", 2, 3],
    ["c", 3, 4],
]
col_name = list("ABC")

In [37]:
import pandas as pd
import numpy as np

In [29]:
# pandas version: rows from my_list, column labels from col_name.
pdf = pd.DataFrame(data=my_list, columns=col_name)

In [30]:
# Display the first rows of the pandas DataFrame.
pdf.head()

Unnamed: 0,A,B,C
0,a,1,2
1,b,2,3
2,c,3,4


In [32]:
# Spark version of the same table: same rows, same column labels.
sdf = spark.createDataFrame(data=my_list, schema=col_name)
sdf.show()

+---+---+---+
|  A|  B|  C|
+---+---+---+
|  a|  1|  2|
|  b|  2|  3|
|  c|  3|  4|
+---+---+---+



In [33]:
# Column-oriented sample data: each key is a column label mapped to that
# column's values.
d = dict(
    A=[0, 1, 0],
    B=[1, 0, 1],
    C=[1, 0, 0],
)

In [35]:
# pandas accepts the column-oriented dict directly: keys become columns.
pdf = pd.DataFrame(data=d)
pdf.head()

Unnamed: 0,A,B,C
0,0,1,1
1,1,0,0
2,0,1,0


In [43]:
# Turn the column-oriented dict into a list of rows. zip(*columns) transposes
# in plain Python, so every value keeps its own type; routing the columns
# through a NumPy array would coerce them all to one dtype (for example,
# mixed str/int columns would come back as all strings).
# Caveat: zip stops at the shortest column, so columns are expected to have
# equal length.
[list(row) for row in zip(*d.values())]

[[0, 1, 1], [1, 0, 0], [0, 1, 0]]

In [38]:
# Spark needs row-oriented input, so transpose the dict's columns into row
# tuples with zip(); the dict keys supply the column names. A pure-Python
# transpose keeps each value's type, whereas a NumPy array would force all
# columns to a single dtype before Spark infers the schema.
# Caveat: zip stops at the shortest column, so columns are expected to have
# equal length.
sdf = spark.createDataFrame(list(zip(*d.values())), list(d.keys()))
sdf.show()

+---+---+---+
|  A|  B|  C|
+---+---+---+
|  0|  1|  1|
|  1|  0|  0|
|  0|  1|  0|
+---+---+---+



In [44]:
# Put the pandas and Spark column types side by side.
pdf.dtypes, sdf.dtypes

(A    int64
 B    int64
 C    int64
 dtype: object,
 [('A', 'bigint'), ('B', 'bigint'), ('C', 'bigint')])

In [45]:
# Put the pandas and Spark column labels side by side.
pdf.columns, sdf.columns

(Index(['A', 'B', 'C'], dtype='object'), ['A', 'B', 'C'])