In [3]:
# Importing the packages
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
import pyspark.sql.functions as func

In [2]:
# Build the notebook's Spark session under the application name "FirstApp";
# getOrCreate() hands back an already-running session if there is one.
spark = (
    SparkSession.builder
    .appName("FirstApp")
    .getOrCreate()
)

In [4]:
# Explicit schema for the CSV read below: four nullable columns, declared in
# the order they are expected to appear in the file.
mySchema = (
    StructType()
    .add("userID", IntegerType(), True)
    .add("name", StringType(), True)
    .add("age", IntegerType(), True)
    .add("friends", IntegerType(), True)
)

In [5]:
# Load fakefriends.csv into a DataFrame, applying the explicit schema rather
# than letting Spark infer column types.
people = (
    spark.read
    .format("csv")
    .option("path", "fakefriends.csv")
    .schema(mySchema)
    .load()
)

In [6]:
# Print each column's name, type and nullability to confirm the schema was applied
people.printSchema()

root
 |-- userID: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- friends: integer (nullable = true)



In [7]:
# Keep only people younger than 30, stamp every row with the current
# timestamp, sort by userID, and cache the result because the cells below
# reuse it for the SQL view and for the writes.
output = (
    people
    .select(people.userID, people.name, people.age, people.friends)
    .filter(people.age < 30)
    .withColumn('insert_ts', func.current_timestamp())
    .orderBy(people.userID)
    .cache()
)

In [8]:
# Register `output` as the temporary view "peoples" (replacing any existing
# view of that name) so it can be queried through spark.sql
output.createOrReplaceTempView("peoples")

In [9]:
# Query the "peoples" temp view with Spark SQL and print the first 20 rows
people_names = spark.sql("SELECT userID, name FROM peoples")
people_names.show()

+------+--------+
|userID|    name|
+------+--------+
|     1|Jean-Luc|
|     9|    Hugh|
|    16|  Weyoun|
|    21|   Miles|
|    24|  Julian|
|    25|     Ben|
|    26|  Julian|
|    32|     Nog|
|    35| Beverly|
|    46|    Morn|
|    47|   Brunt|
|    48|     Nog|
|    52| Beverly|
|    54|   Brunt|
|    60|  Geordi|
|    66|  Geordi|
|    72|  Kasidy|
|    73|   Brunt|
|    84|     Ben|
|    89|    Worf|
+------+--------+
only showing top 20 rows



In [None]:
# Write `output` as Parquet, one sub-directory per distinct age value.
# mode("overwrite") replaces whatever already exists at the target path, so
# this format gets its own directory instead of sharing one with the other
# formats' writes (which would leave only the last write on disk).
# NOTE(review): the absolute, user-specific base path is kept from the
# original; consider moving it to a configurable output directory.
(
    output.write
    .format("parquet")
    .mode("overwrite")
    .option("path", "C:/Users/shanb/SparkTraining/op_parquet")
    .partitionBy("age")
    .save()
)

In [None]:
# Write `output` as CSV, one sub-directory per distinct age value.
# mode("overwrite") replaces whatever already exists at the target path, so
# this format gets its own directory instead of sharing one with the other
# formats' writes (which would leave only the last write on disk).
# NOTE(review): the absolute, user-specific base path is kept from the
# original; consider moving it to a configurable output directory.
(
    output.write
    .format("csv")
    .mode("overwrite")
    .option("path", "C:/Users/shanb/SparkTraining/op_csv")
    .partitionBy("age")
    .save()
)

In [None]:
# Write `output` as JSON, one sub-directory per distinct age value.
# mode("overwrite") replaces whatever already exists at the target path, so
# this format gets its own directory instead of sharing one with the other
# formats' writes (which would leave only the last write on disk).
# NOTE(review): the absolute, user-specific base path is kept from the
# original; consider moving it to a configurable output directory.
(
    output.write
    .format("json")
    .mode("overwrite")
    .option("path", "C:/Users/shanb/SparkTraining/op_json")
    .partitionBy("age")
    .save()
)

In [None]:
# Write `output` as Avro, one sub-directory per distinct age value.
# mode("overwrite") replaces whatever already exists at the target path, so
# this format gets its own directory instead of sharing one with the other
# formats' writes (which would leave only the last write on disk).
# NOTE(review): the "avro" data source ships as the external spark-avro
# module, and the SparkSession in this notebook is built without any package
# configuration -- confirm spark-avro is supplied when the kernel is launched
# (e.g. via spark.jars.packages), otherwise this cell will fail to find the
# data source.
# NOTE(review): the absolute, user-specific base path is kept from the
# original; consider moving it to a configurable output directory.
(
    output.write
    .format("avro")
    .mode("overwrite")
    .option("path", "C:/Users/shanb/SparkTraining/op_avro")
    .partitionBy("age")
    .save()
)