In [1]:
import pyspark
from pyspark.sql import SparkSession
# Reuse the active Spark session if one exists, otherwise start one named 'DataFrame'
spark = (
    SparkSession.builder
    .appName('DataFrame')
    .getOrCreate()
)

In [2]:
# Create a DataFrame through an RDD

columns = ['currency', 'value']
inputdata = [
    ('Euro', 90),
    ('Pound', 100),
    ('Yuan', 11),
    ('Yen', 2),
    ('US Dollar', 74),
    ('K Dinar', 242),
]

# Distribute the local list as an RDD
rdd = spark.sparkContext.parallelize(inputdata)

# Option 1: toDF() with no names, then rename each column explicitly.
# Both columns are renamed (not only '_1') and the names match `columns`;
# the result gets its own name so it is not silently overwritten below.
rddDF = rdd.toDF()
renamedDF = (
    rddDF
    .withColumnRenamed('_1', 'currency')
    .withColumnRenamed('_2', 'value')
)
#renamedDF.show()

# Option 2: build the DataFrame from the RDD and apply all column names at once
df = spark.createDataFrame(rdd).toDF(*columns)
#df.show()

In [3]:
# Create the DataFrame directly from the local list, using `columns` as the schema

df = spark.createDataFrame(inputdata, columns)
#df.show()

In [4]:
# Write the DataFrame content to a CSV directory.
# mode('overwrite') keeps this cell re-runnable: with the default save mode the
# write raises an error when 'D:/test' already exists (e.g. on a second run).

df.write.format('csv').mode('overwrite').save('D:/test')

In [5]:
# Write a single CSV part file with a header row: repartition(1) collapses the
# data into one partition before writing.
# mode('overwrite') keeps this cell re-runnable: with the default save mode the
# write raises an error when 'D:/csv' already exists.
(
    df.repartition(1)
    .write.format('csv')
    .mode('overwrite')
    .save('D:/csv', header=True)
)

In [6]:
# Write DF content to a text file through the RDD API:
# each row becomes one "<first field>,<second field>" line, saved from a single partition
csv_lines = df.rdd.map(lambda row: row[0] + "," + str(row[1]))
csv_lines.repartition(1).saveAsTextFile('D:/Text')

In [7]:
# Read a CSV data source into a DataFrame; the first line is treated as the
# header and column types are inferred from the data
df = (
    spark.read.format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .load('D:/Dataset/flight-data/csv/2015-summary.csv')
)
#df.show()

In [8]:
from pyspark.sql.functions import col, expr, column, udf, date_sub, date_add, col, datediff, regexp_extract
from pyspark.sql.types import StringType, IntegerType

# various ways of selecting columns

#df.select(col('DEST_COUNTRY_NAME')).show(5, False)

#df.select(column('DEST_COUNTRY_NAME')).show(5, False)

#df.select('DEST_COUNTRY_NAME').show(5, False)

#df.select('DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME').show(5, False)

#df.select(expr('DEST_COUNTRY_NAME AS Destination')) .show(5, False)
#df.select('DEST_COUNTRY_NAME').show(5, False)


In [9]:
# Add a boolean column that is true when origin and destination country match
same_country = expr('ORIGIN_COUNTRY_NAME == DEST_COUNTRY_NAME')
df.withColumn('withinCountry', same_country).show(5, truncate=False)

+-----------------+-------------------+-----+-------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|withinCountry|
+-----------------+-------------------+-----+-------------+
|United States    |Romania            |15   |false        |
|United States    |Croatia            |1    |false        |
|United States    |Ireland            |344  |false        |
|Egypt            |United States      |15   |false        |
|United States    |India              |62   |false        |
+-----------------+-------------------+-----+-------------+
only showing top 5 rows



In [10]:
# Adding column by user defined function

def computeGroup(count):
    """Bucket a flight count into a frequency label.

    Args:
        count: Numeric count, or None for a null value.

    Returns:
        'Min' for count < 2, 'Normal' for count < 20, 'More' for count < 100,
        'Busy' otherwise. None is returned for a None input so that a null
        value does not raise a TypeError when this runs as a UDF.
    """
    if count is None:
        return None
    if count < 2:
        return 'Min'
    elif count < 20:
        return 'Normal'
    elif count < 100:
        return 'More'
    else:
        return 'Busy'

# Wrap computeGroup as a string-returning UDF and apply it to the 'count' column
group_udf = udf(computeGroup, returnType=StringType())
frequency_col = group_udf(col('count'))
df.withColumn('Frequency', frequency_col).show(5, truncate=False)

+-----------------+-------------------+-----+---------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|Frequency|
+-----------------+-------------------+-----+---------+
|United States    |Romania            |15   |Normal   |
|United States    |Croatia            |1    |Min      |
|United States    |Ireland            |344  |Busy     |
|Egypt            |United States      |15   |Normal   |
|United States    |India              |62   |More     |
+-----------------+-------------------+-----+---------+
only showing top 5 rows



In [11]:
# Movie Dataset

# Load the movie CSV with a header row and inferred column types
movieDf = (
    spark.read.format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .load('D:/Dataset/movie.csv')
)
#movieDf.show()

# Extract the first run of four digits from the title and store it as an integer Year.
# NOTE(review): a title containing an earlier four-digit number would yield that
# number rather than a year in a later position — confirm against the data.
year_text = regexp_extract(col('title'), r"(\d\d\d\d)", 1)
movieYear = movieDf.withColumn('Year', year_text.cast(IntegerType()))

# NOTE(review): na.drop() with no arguments removes rows that have a null in any
# column, not only in Year — confirm that this is intended.
cleanedMovie = movieYear.na.drop()


In [12]:
def calDecade(years):
    """Round a year down to the start of its decade (e.g. 1995 -> 1990).

    Args:
        years: Numeric year, or None for a null value.

    Returns:
        The year minus its remainder modulo 10, or None when `years` is None
        so that a null value does not raise a TypeError when this runs as a UDF.
    """
    if years is None:
        return None
    return years - years % 10

# Wrap calDecade as an integer-returning UDF and derive a Decade column from Year
decadeudf = udf(calDecade, returnType=IntegerType())
decade_col = decadeudf(col('Year')).cast(IntegerType())
movieDecade = cleanedMovie.withColumn('Decade', decade_col)
#movieDecade.show()

#movieDecade.filter('Decade == 1990').show()