# Loading CSV file from Amazon S3 into iguazio file system or database

In [1]:
import pandas as pd
import v3io_frames as v3f
import os
# Frames client bound to the 'users' container at the v3io-framesd service address
client = v3f.Client(address='v3io-framesd:8081', container='users')

## Import sample file from S3 into iguazio file system (v3io)

In [3]:
%%sh
# Create the directory that the download below writes into.
# -p keeps the cell re-runnable when the directory already exists.
mkdir -p "$HOME/examples"
# Fetch the sample CSV from S3, following redirects (-L).
# -f makes curl exit with an error on an HTTP failure instead of saving the error page as the CSV.
curl -fL "deutsche-boerse-xetra-pds.s3.amazonaws.com/2018-03-26/2018-03-26_BINS_XETR07.csv" -o "$HOME/examples/stocks_example.csv"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  975k  100  975k    0     0  9104k      0 --:--:-- --:--:-- --:--:-- 9117k


## Read the file into a pandas DataFrame
Note: the file can also be read directly over HTTP into a DataFrame by passing the full URL, e.g. `pd.read_csv('http://deutsche-boerse...')`.

In [2]:
# Load the downloaded CSV from $HOME/examples, index the rows by the ISIN column,
# then preview the first rows of the resulting DataFrame
df = (
    pd.read_csv(os.path.join(os.environ['HOME'], 'examples/stocks_example.csv'))
    .set_index('ISIN')
)
df.head()

Unnamed: 0_level_0,Mnemonic,SecurityDesc,SecurityType,Currency,SecurityID,Date,Time,StartPrice,MaxPrice,MinPrice,EndPrice,TradedVolume,NumberOfTrades
ISIN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AT0000A0E9W5,SANT,S+T AG (Z.REG.MK.Z.)O.N.,Common stock,EUR,2504159,2018-03-26,07:00,20.56,20.56,20.56,20.56,1115,5
DE000A0WMPJ6,AIXA,AIXTRON SE NA O.N.,Common stock,EUR,2504428,2018-03-26,07:00,17.035,17.08,16.92,16.98,2892,11
DE000A0Z2XN6,RIB,RIB SOFTWARE SE NA EO 1,Common stock,EUR,2504436,2018-03-26,07:00,24.02,24.18,23.94,24.12,5721,34
DE000A0Z2ZZ5,FNTN,FREENET AG NA O.N.,Common stock,EUR,2504438,2018-03-26,07:00,24.72,24.72,24.7,24.72,315,2
DE000A1EWWW0,ADS,ADIDAS AG NA O.N.,Common stock,EUR,2504471,2018-03-26,07:00,196.35,196.4,195.6,195.9,5616,27


## Write file into iguazio database as key value table using v3io frames

In [3]:
# Target table path, passed to the frames client together with the 'kv' backend
tablename = 'iguazio/examples/stocks_example_tab'
client.write(backend='kv', table=tablename, dfs=df)

## Read and write the file using Spark DF

In [4]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Iguazio Integration demo").getOrCreate()

# Read the sample stocks_example.csv file into a Spark DataFrame, using the first line as the
# header and letting Spark infer the column types.
# The .csv() reader selects the CSV data source itself, so no format() call is needed here.
myDF = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("v3io://users/iguazio/examples/stocks_example.csv")
)

# Write the DataFrame through the KV data source to the stocks_tab_by_spark table
# under v3io://users/iguazio/examples/, with the "ISIN" column as the key
myDF.write.format("io.iguaz.v3io.spark.sql.kv").mode("append").option("key", "ISIN").save("v3io://users/iguazio/examples/stocks_tab_by_spark/")

## Read an iguazio table and write it back as a CSV

In [5]:
# Load the stocks_tab_by_spark table through the KV data source and keep only
# the rows whose TradedVolume is above 20000
myDF2 = spark.read.format("io.iguaz.v3io.spark.sql.kv").load("v3io://users/iguazio/examples/stocks_tab_by_spark").where("TradedVolume>20000")

# coalesce(1) stores the output as a single file.
# mode("overwrite") replaces a previous export, so re-running the cell does not fail
# on the default "error if exists" save mode.
myDF2.coalesce(1).write.mode("overwrite").csv('v3io://users/iguazio/examples/stocks_high_volume.csv')


## Viewing files 
Note: the table will appear as a directory under the v3io file system

In [6]:
# Long-format listing of the examples directory under $HOME
!ls -l $HOME/examples/

total 0
-rw-r--r--. 1 50 nogroup 999016 Jan 13 12:40 stocks.csv
-rw-r--r--. 1 50 nogroup 999016 Jan 13 12:42 stocks_example.csv
drwxrwxrwx. 2 50 nogroup      0 Jan 13 13:01 stocks_example_tab
drwxr-xr-x. 2 50 nogroup      0 Jan 13 13:07 stocks_high_volume.csv
drwxr-xr-x. 2 50 nogroup      0 Jan 13 12:41 stocks_prqt
drwxrwxrwx. 2 50 nogroup      0 Jan 13 12:40 stocks_tab
drwxrwxrwx. 2 50 nogroup      0 Jan 13 13:04 stocks_tab_by_spark


## Remove all files and tables

In [7]:
# Clean up: uncomment the next line to delete everything matching stock* under $HOME/examples (destructive)
#!rm -rf $HOME/examples/stock*