# Loading CSV file from Amazon S3 into iguazio file system or database

In [11]:
import pandas as pd
import sys
sys.path.append('../')
import v3io_frames_local as v3f
import os
client = v3f.Client('framesd:8081', container='users')

## Import sample file from S3 into iguazio file system (v3io)

In [12]:
%%sh
mkdir -p /v3io/${V3IO_HOME}/examples

# Download a sample stocks file from Iguazio demo bucket in S3
curl -L "iguazio-sample-data.s3.amazonaws.com/2018-03-26_BINS_XETR08.csv" > /v3io/${V3IO_HOME}/examples/stocks_example.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  861k  100  861k    0     0   716k      0  0:00:01  0:00:01 --:--:--  716k


## Read the file using into a pandas DataFrame
Note the file can be read directly from HTTP into a DataFrame (if placing the full URL i.e. `pd.read_csv('http://deutsche-boerse...')`

In [13]:
# read a csv file into a data frame 
df = pd.read_csv(os.path.join('/v3io/users/'+os.getenv('V3IO_USERNAME')+'/examples/stocks_example.csv'))

df.set_index('ISIN', inplace=True)
df.head()

Unnamed: 0_level_0,Mnemonic,SecurityDesc,SecurityType,Currency,SecurityID,Date,Time,StartPrice,MaxPrice,MinPrice,EndPrice,TradedVolume,NumberOfTrades
ISIN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
CH0038389992,BBZA,"BB BIOTECH NAM. SF 0,20",Common stock,EUR,2504244,2018-03-26,08:00,56.4,56.4,56.4,56.4,320,4
CH0038863350,NESR,"NESTLE NAM. SF-,10",Common stock,EUR,2504245,2018-03-26,08:00,63.04,63.06,63.0,63.06,314,3
LU0378438732,C001,COMSTAGE-DAX UCITS ETF I,ETF,EUR,2504271,2018-03-26,08:00,113.42,113.42,113.42,113.42,100,1
LU0411075020,DBPD,XTR.SHORTDAX X2 DA.SW. 1C,ETF,EUR,2504272,2018-03-26,08:00,4.1335,4.1335,4.1295,4.13,102993,8
LU0838782315,XDDX,XTR.DAX INCOME 1D,ETF,EUR,2504277,2018-03-26,08:00,105.14,105.2,105.14,105.2,239,3


## Write file into iguazio database as key value table using v3io frames

In [14]:
tablename = os.path.join(os.getenv('V3IO_USERNAME')+'/stocks_example_tab')
client.delete('kv', tablename)
client.write('kv', tablename, df)

## Read and write the file using Spark DF

In [15]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Iguazio file access notebook").getOrCreate()

file_path=os.path.join(os.getenv('V3IO_HOME_URL')+'/examples')

# Read the sample stocks.csv file into a Spark DataFrame, and let Spark infer the schema of the CSV file
df = spark.read.option("header", "true").csv(os.path.join(file_path)+'/stocks_example.csv')

# Show the DataFrame data
df.show()

# Write the DataFrame data to a stocks_tab table under "users" container and define "ISIN" column as a key
df.write.format("io.iguaz.v3io.spark.sql.kv").mode("append").option("key", "ISIN").option("allow-overwrite-schema", "true").save(os.path.join(file_path)+'/stocks_tab_spark/')

+------------+--------+--------------------+------------+--------+----------+----------+-----+----------+--------+--------+--------+------------+--------------+
|        ISIN|Mnemonic|        SecurityDesc|SecurityType|Currency|SecurityID|      Date| Time|StartPrice|MaxPrice|MinPrice|EndPrice|TradedVolume|NumberOfTrades|
+------------+--------+--------------------+------------+--------+----------+----------+-----+----------+--------+--------+--------+------------+--------------+
|CH0038389992|    BBZA|BB BIOTECH NAM.  ...|Common stock|     EUR|   2504244|2018-03-26|08:00|      56.4|    56.4|    56.4|    56.4|         320|             4|
|CH0038863350|    NESR|NESTLE NAM.      ...|Common stock|     EUR|   2504245|2018-03-26|08:00|     63.04|   63.06|      63|   63.06|         314|             3|
|LU0378438732|    C001|COMSTAGE-DAX UCIT...|         ETF|     EUR|   2504271|2018-03-26|08:00|    113.42|  113.42|  113.42|  113.42|         100|             1|
|LU0411075020|    DBPD|XTR.SHORTDA

## Read iguazio table and writing it back as a CSV 

In [16]:
#myDF2 = spark.read.format("io.iguaz.v3io.spark.sql.kv").load("v3io://users/iguazio/examples/stocks_tab_by_spark").where("TradedVolume>20000")
myDF2 = spark.read.format("io.iguaz.v3io.spark.sql.kv").load(os.path.join(file_path)+'/stocks_tab_spark').where("TradedVolume>20000")

# myDF2.write.csv('v3io://bigdata/examples/stocks_high_volume.csv')
myDF2.coalesce(1).write.mode('overwrite').csv(os.path.join(file_path)+'/stocks_high_volume.csv')

# note that using coalesce(1) is for storing the output as a single file


## Viewing files 
Note: the table will apear as a directory under v3io file system

In [17]:
!ls -l /v3io/${V3IO_HOME}/examples/

total 0
-rw-r--r-- 1 root nogroup 882055 Jan 10 19:12 stocks_example.csv
drwxr-xr-x 2 root nogroup      0 Jan 10 19:12 stocks_high_volume.csv
drwxrwxrwx 2 root nogroup      0 Jan 10 19:12 stocks_tab_spark


## Remove all files and tables

In [18]:
# clean data
!rm -rf /v3io/${V3IO_HOME}/examples/*

In order to release compute and memory resources taken by spark we recommend running the following command 

In [19]:
spark.stop()