# Applied Project in Big Data on Industrial Dataset

## DATA COLLECTION TECHNIQUES
## Part V. Load from object storage and preprocessing (options project)

### 1. Libraries and credentials

About the [boto3](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html) library.

In [None]:
import os
import sys
import json
import boto3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Do not cap the number of columns pandas shows when displaying a DataFrame.
pd.options.display.max_columns = None

In [None]:
def access_data(file_path):
    """Load the content of a JSON file.

    Args:
        file_path: Path to the JSON file to read.

    Returns:
        The deserialized JSON content (a dict when the file holds a JSON
        object).

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # Read as UTF-8 explicitly instead of relying on the platform's default
    # encoding, so non-ASCII content loads the same way on every OS.
    with open(file_path, encoding='utf-8') as file:
        return json.load(file)

# Read the bucket credentials from the local JSON file and print which
# fields it provides (key names only, not their values).
creds = access_data('access_bucket.json')
print(creds.keys())

### 2. Session and client for loading

In [None]:
# Build a boto3 session and an S3 client that talks to the
# storage.yandexcloud.net endpoint using the key pair loaded into `creds`.
session = boto3.session.Session()
s3 = session.client(
    's3',
    endpoint_url='https://storage.yandexcloud.net',
    aws_access_key_id=creds['aws_access_key_id'],
    aws_secret_access_key=creds['aws_secret_access_key'],
)

In [None]:
# Name of the bucket the options data is read from.
OPTS_DATA_BUCKET = "apid-data-options"

In [None]:
# Collect every object key in the bucket. A single list_objects call returns
# at most 1000 keys, so walk all result pages with a paginator. A page of an
# empty bucket carries no 'Contents' entry, hence the .get() default.
all_files = [
    obj['Key']
    for page in s3.get_paginator('list_objects').paginate(Bucket=OPTS_DATA_BUCKET)
    for obj in page.get('Contents', [])
]
print('files in storage:', all_files[:10])  # only the first 10 keys are shown

### 3. Spark rulez

In [None]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.functions import udf, struct, count_distinct, from_unixtime

In [None]:
# Local Spark session configured to read from the S3-compatible storage.
conf = SparkConf()
conf.set('spark.master', 'local[5]')  # local mode with 5 worker threads
conf.set('spark.executor.memory', '4G')
conf.set('spark.driver.memory', '4G')

# getOrCreate() reuses an already running context, so re-executing this cell
# does not fail with "Cannot run multiple SparkContexts at once". Note that
# `conf` only takes effect when the context is actually created here.
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession(sc)

# Fetch the Hadoop configuration once instead of once per setting.
hadoop_conf = spark._jsc.hadoopConfiguration()
hadoop_conf.set('fs.s3a.access.key', creds['aws_access_key_id'])
hadoop_conf.set('fs.s3a.secret.key', creds['aws_secret_access_key'])
hadoop_conf.set('fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem')
hadoop_conf.set('fs.s3a.multipart.size', '104857600')  # 100 MiB
hadoop_conf.set('fs.s3a.block.size', '33554432')  # 32 MiB
hadoop_conf.set('fs.s3a.threads.max', '256')
# HTTPS, consistent with the endpoint used for the boto3 client, so requests
# and data are not sent over plain HTTP.
hadoop_conf.set('fs.s3a.endpoint', 'https://storage.yandexcloud.net')

In [None]:
# s3a:// URI of one parquet file in the bucket, loaded as a Spark DataFrame.
file_path = f's3a://{OPTS_DATA_BUCKET}/data/L3_options_20161101.parquet'
sdf = spark.read.parquet(file_path)

In [None]:
# Print a preview of the loaded Spark DataFrame.
sdf.show()

In [None]:
# Convert up to ten rows to a pandas DataFrame for display.
(
    sdf
    .limit(10)
    .toPandas()
)

### 5. Data processing

In [None]:
# Row count for each distinct value of the `type` column.
(
    sdf
    .groupBy('type')
    .count()
    .show()
)

In [None]:
# Number of rows whose base symbol is FTNT.
sdf.where(sdf['base_symbol'] == 'FTNT').count()

In [None]:
# Up to five FTNT rows, converted to a pandas DataFrame for display.
(
    sdf
    .where(sdf['base_symbol'] == 'FTNT')
    .limit(5)
    .toPandas()
)

In [None]:
# Keep only four columns and collapse rows that repeat the same combination.
sdf_short = (
    sdf
    .select('date', 'base_symbol', 'base_price', 'type')
    .dropDuplicates()
)

# Up to five rows of the reduced frame, as a pandas DataFrame for display.
sdf_short.limit(5).toPandas()