# Loading csv file from Amazon S3 into iguazio File system

In [1]:
%%sh
mkdir /v3io/bigdata/examples/
curl -L "deutsche-boerse-xetra-pds.s3.amazonaws.com/2018-03-26/2018-03-26_BINS_XETR07.csv" > /v3io/bigdata/examples/stocks_example.csv

mkdir: cannot create directory '/v3io/bigdata/examples/': File exists
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  975k  100  975k    0     0  6782k      0 --:--:-- --:--:-- --:--:-- 6822k


# Read and write the file using Spark DF

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Iguazio Integration demo").getOrCreate()

# Read the sample stocks.csv file into a Spark DataFrame, and let Spark infer the schema of the CSV file
myDF = spark.read.format("io.iguaz.v3io.spark.sql.kv").option("header", "true").option("inferSchema", "true").csv("v3io://bigdata/examples/stocks_example.csv")

# Write the DataFrame data to a stocks_nosql table under "bigdata" container and define "ISIN" column as a key
myDF.write.format("io.iguaz.v3io.spark.sql.kv").mode("append").option("key", "ISIN").save("v3io://bigdata/examples/stocks_tab/")

# Read iguazio table and writing it back as a CSV 

In [3]:
myDF2 = spark.read.format("io.iguaz.v3io.spark.sql.kv").load("v3io://bigdata/examples/stocks_tab").where("TradedVolume>20000")

# myDF2.write.csv('v3io://bigdata/examples/stocks_high_volume.csv')
myDF2.coalesce(1).write.csv('v3io://bigdata/examples/stocks_high_volume.csv')

# note that using coalesce(1) is for storing the output as a single file


## Remove files

In [5]:
!rm -rf /v3io/bigdata/examples/stock*