In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.streaming import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
import os

spark = SparkSession.builder.master("local").appName("Online2").getOrCreate()
sc = spark.sparkContext

In [3]:
# sql adaptive query execution adaptive.coalescePartitions.enabled will makr paritions dynamic
sc.setLogLevel("Error")
spark.conf.set("spark.sql.shuffle.partitions",3)
spark.conf.get("spark.sql.shuffle.partitions")
spark.conf.set("spark.sql.adaptive.enabled","false")
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled","false")
spark.conf.set("spark.sql.execution.arrow.enabled",True)
spark.conf.set("spark.sql.execution.arrow.fallback.enabled",True)

"""
Online coding question
1.Read the file as spark RDD not DF
2.Filter header from RDD
3.Calcualte final price (price = size * price persqft)
4.save the file to textfile with header as pipe delimited

#Since we need a single text file as O/P we use coalesce of 1 , which merges all part files

"""

#WithoutInferSchema
#headerTrue will read first row and assign column names but type is String for all
#spark job created to read first column
filepath = "file:///C:/Users/venka/PycharmProjects/pythonProject/dataset/"
df = spark.read.option("header",True) \
            .option("inferSchema",True) \
            .option("delimiter",",") \
            .csv(filepath + "IntPersonal_transactions.csv")

In [12]:
df1 = sc.textFile(filepath + "IntRealEstate.txt") \
        .mapPartitionsWithIndex(lambda idx, iter: list(iter)[1:] if (idx==0) else iter) \
        .map(lambda x: x.split("|"))

In [14]:
df1.count()

18

In [32]:
df2 = sc.textFile(filepath + "IntRealEstate.txt") \
        .zipWithIndex() \
        .filter(lambda x: x[1] > 0) \
        .map(lambda x: x[0]) \
        .map(lambda x: x.split("|"))

df2.take(5)

[['1461262',
  'Arroyo Grande',
  '795000',
  '3',
  '3',
  '2371',
  '365.3',
  'Short Sale'],
 ['1478004',
  'Paulo Pablo',
  '399000',
  '4',
  '3',
  '2818',
  '163.59',
  'Short Sale'],
 ['1486551',
  'Paulo Pablo',
  '545000',
  '4',
  '3',
  '3032',
  '179.75',
  'Short Sale'],
 ['1492832', 'Santa Bay', '909000', '4', '4', '3540', '286.78', 'Short Sale'],
 ['1499102',
  'Thomas Country',
  '109900',
  '3',
  '1',
  '1249',
  '98.99',
  'Short Sale']]

In [22]:
#Apply filter to get header and data
rdd_in = sc.textFile(filepath + "IntRealEstate.txt")

rdd = rdd_in.filter(lambda x: not x.startswith("Property_ID"))
header = rdd_in.filter(lambda x: x.startswith("Property_ID"))

rdd.take(5)

['1461262|Arroyo Grande|795000|3|3|2371|365.3|Short Sale',
 '1478004|Paulo Pablo|399000|4|3|2818|163.59|Short Sale',
 '1486551|Paulo Pablo|545000|4|3|3032|179.75|Short Sale',
 '1492832|Santa Bay|909000|4|4|3540|286.78|Short Sale',
 '1499102|Thomas Country|109900|3|1|1249|98.99|Short Sale']

In [33]:
#Apply flatmap and map
rdd1 = rdd.flatMap(lambda x: x.split(",")).map(lambda x: x.split("|"))
rdd1.take(5)

[['1461262',
  'Arroyo Grande',
  '795000',
  '3',
  '3',
  '2371',
  '365.3',
  'Short Sale'],
 ['1478004',
  'Paulo Pablo',
  '399000',
  '4',
  '3',
  '2818',
  '163.59',
  'Short Sale'],
 ['1486551',
  'Paulo Pablo',
  '545000',
  '4',
  '3',
  '3032',
  '179.75',
  'Short Sale'],
 ['1492832', 'Santa Bay', '909000', '4', '4', '3540', '286.78', 'Short Sale'],
 ['1499102',
  'Thomas Country',
  '109900',
  '3',
  '1',
  '1249',
  '98.99',
  'Short Sale']]

In [39]:
#Get the index of each column
col_list = header.first().split("|")
col_list

['Property_ID',
 'Location',
 'Price',
 'Bedrooms',
 'Bathrooms',
 'Size',
 'Price_SQ_FT',
 'Status']

In [41]:
f1 = col_list.index("Property_ID")
f2 = col_list.index("Location")
f3 = col_list.index("Price")
f4 = col_list.index("Price_SQ_FT")

In [48]:
def mul_price(d1,d2):
    res = float(d1) * float(d2)
    return str(res)

header_out = header.map(lambda x: x.split("|")[f1] + "|" + x.split("|")[f2] + "|FinalPrice")
rdd2 = rdd1.map(lambda x: x[f1] + "|" + x[f2] + "|" + mul_price(x[f3],x[f4]) )

rdd_out = header_out.union(rdd2)
rdd_out.collect()

['Property_ID|Location|FinalPrice',
 '1461262|Arroyo Grande|290413500.0',
 '1478004|Paulo Pablo|65272410.0',
 '1486551|Paulo Pablo|97963750.0',
 '1492832|Santa Bay|260683019.99999997',
 '1499102|Thomas Country|10879001.0',
 '1489132|Thomas Country|10244910.0',
 '1467262|Fort Worth|459251100.0',
 '1478114|Paulo Pablo|91284710.0',
 '1402551|Nashville|92513750.0',
 '1405832|San Jose|285160400.0',
 '1493302|Fort Worth|23858901.0',
 '1412332|Thomas Country|9544710.0',
 '1469062|Arroyo Grande|188248200.0',
 '1498004|Nashville|207972509.99999997',
 '1586751|Nashville|178976000.0',
 '1433232|Glendale|213961860.0',
 '1495502|Fort Worth|44197701.0',
 '1489100|San Jose|8467728.0']

In [50]:
#Since we need a single text file as O/P we use coalesce of 1 , which merges all part files
rdd_out.coalesce(1).saveAsTextFile(filepath + "Online/realestate.txt")