In [None]:
import os
import sys
import json
import numpy as np
import pandas as pd
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.functions import udf, struct, countDistinct
import multiprocessing

# Show every column when a pandas DataFrame is rendered (no truncation).
pd.options.display.max_columns = None

In [None]:
def access_data(file_path):
    """Load a JSON file (used here for credentials) and return its content.

    The function is best-effort: any failure (missing or unreadable file,
    malformed JSON, ...) is reported on stdout and an empty dict is returned
    instead of raising, so callers must check for the keys they need.

    Args:
        file_path: Path to the JSON file.

    Returns:
        The decoded JSON value (a dict for a JSON object), or ``{}`` when the
        file could not be read or parsed.
    """
    # Local name deliberately differs from the function name to avoid
    # shadowing the function inside its own body.
    credentials = {}
    try:
        # JSON is UTF-8 by specification; do not rely on the locale default.
        with open(file_path, encoding='utf-8') as file:
            credentials = json.load(file)
        print(f'credentials from file {file_path} - loaded')
    except Exception as e:
        print(f'credentials from file {file_path} - {e}')
    return credentials

# Load the S3 credentials that the Spark/S3A configuration reads by key.
access_s3_data = access_data('/home/jovyan/dataload/access_s3.json')
# Show only which keys were loaded: printing the dict itself would store the
# secret values in the notebook's saved output.
print(sorted(access_s3_data))

In [None]:
# Fail early with a clear message when the credentials dict lacks the keys
# needed below (access_data returns {} when the file could not be loaded).
missing_keys = {'aws_access_key_id', 'aws_secret_access_key'} - set(access_s3_data)
if missing_keys:
    raise KeyError(f'S3 credentials missing keys: {sorted(missing_keys)}')

# Local Spark with 8 worker threads; driver sizing for collecting results.
conf = SparkConf()
conf.set('spark.master', 'local[8]')
conf.set('spark.driver.memory', '16G')
conf.set('spark.driver.maxResultSize', '2G')

# getOrCreate keeps this cell re-runnable: constructing SparkContext directly
# raises when a context is already running in the kernel.
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.getOrCreate()

# S3A connector settings applied to the Hadoop configuration of the session.
s3a_settings = {
    'fs.s3a.access.key': access_s3_data['aws_access_key_id'],
    'fs.s3a.secret.key': access_s3_data['aws_secret_access_key'],
    'fs.s3a.impl': 'org.apache.hadoop.fs.s3a.S3AFileSystem',
    'fs.s3a.multipart.size': '104857600',
    'fs.s3a.block.size': '33554432',
    'fs.s3a.threads.max': '256',
    # HTTPS so that signed requests and log data are not sent in plaintext.
    'fs.s3a.endpoint': 'https://storage.yandexcloud.net',
    'fs.s3a.aws.credentials.provider':
        'org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider',
}
hadoop_conf = spark._jsc.hadoopConfiguration()
for key, value in s3a_settings.items():
    hadoop_conf.set(key, value)

spark

In [None]:
# Name of the S3 bucket the log files are read from (used in the s3a:// URL).
BUCKET = 'rawdata-test-logs-jhub'

In [None]:
%%time

mask_files = '{}/{}/{}/{}/{}/*/*'.format(
    'fluent-bit-logs/kube.var.log.containers.hub*',
    '2023',    # year
    '*',      # month
    '*',       # day
    '*'        # hour
)
sdf = spark.read.json(f's3a://{BUCKET}/{mask_files}')
sdf.count()

In [None]:
# Preview at most five rows, converted to pandas for notebook display.
sdf.limit(5).toPandas()

In [None]:
# Print the schema of the Spark DataFrame read from the JSON files.
sdf.printSchema()

In [None]:
# Collect the whole Spark DataFrame to the driver as a pandas DataFrame.
# NOTE(review): the collected result is limited by spark.driver.maxResultSize
# ('2G' in the Spark configuration cell) - confirm the full dataset fits.
df = sdf.toPandas()

In [None]:
# Summarise the pandas frame: columns, non-null counts, dtypes, memory usage.
df.info()

In [None]:
# Export the pandas frame to jhub_logs.csv (relative path), using ';' as the
# field separator and omitting the index column.
df.to_csv('jhub_logs.csv', sep=';', index=False)