In [None]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, IntegerType, FloatType, DateType
from pyspark.sql.functions import to_date, col


# Avoid hard-coded passwords: the MinIO keys are read (using os and open) from a creds.txt file under /home/<USER>/.
import os
import sys

# Load the MinIO (S3) access/secret key pair from a comma-separated file,
# "<accesskey>,<secretkey>", located at /home/<USER>/creds.txt.
creds_path = f"/home/{os.getenv('USER')}/creds.txt"
try:
    # `with` guarantees the file handle is closed even if parsing fails.
    with open(creds_path, "r") as creds_fh:
        # Strip each field so "key, secret" does not yield " secret".
        creds_file = [field.strip() for field in creds_fh.read().strip().split(",")]
    accesskey, secretkey = creds_file[0], creds_file[1]
except (OSError, IndexError, UnicodeError) as exc:
    # OSError: file missing or unreadable; IndexError: fewer than two fields;
    # UnicodeError: file is not valid text. The best-effort fallback (empty
    # keys) is kept so the cell still runs, but the real cause is reported.
    print("File not found, you can't access minio")
    print(f"  ({creds_path}: {exc!r})")
    accesskey, secretkey = "", ""


# Every Spark/Hadoop setting used by this notebook, collected in one table so
# the full configuration can be read at a glance.
spark_settings = {
    # Extra packages: the Hadoop AWS (S3A) connector and the MySQL JDBC driver.
    'spark.jars.packages': 'org.apache.hadoop:hadoop-aws:3.2.3,mysql:mysql-connector-java:8.0.33',
    # Authenticate against the object store with the static key pair loaded above.
    'spark.hadoop.fs.s3a.aws.credentials.provider': 'org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider',
    'spark.hadoop.fs.s3a.access.key': accesskey,
    'spark.hadoop.fs.s3a.secret.key': secretkey,
    # MinIO-specific S3A settings, see
    # https://medium.com/@dineshvarma.guduru/reading-and-writing-data-from-to-minio-using-spark-8371aefa96d2
    "spark.hadoop.fs.s3a.path.style.access": "true",
    "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    # S3A "magic" committer, see
    # https://github.com/minio/training/blob/main/spark/taxi-data-writes.py
    # https://spot.io/blog/improve-apache-spark-performance-with-the-s3-magic-committer/
    'spark.hadoop.fs.s3a.committer.magic.enabled': 'true',
    'spark.hadoop.fs.s3a.committer.name': 'magic',
    # Internal IP for S3 cluster proxy
    "spark.hadoop.fs.s3a.endpoint": "http://system54.rice.iit.edu",
    # Dynamic allocation is intentionally left disabled:
    # "spark.dynamicAllocation.enabled": "true",
    # "spark.dynamicAllocation.shuffleTracking.enabled": "true",
    # Driver/executor resource limits.
    "spark.driver.memory": "8g",
    "spark.executor.memory": "4g",
    "spark.cores.max": '10',
    'spark.executor.cores': '1',
}

conf = SparkConf()
for setting_name, setting_value in spark_settings.items():
    conf.set(setting_name, setting_value)

# Send jobs to the Spark Cluster
conf.setMaster("spark://sm.service.consul:7077")

# Create (or reuse) the session; the driver host is set on the builder
# before the SparkConf above is applied.
session_builder = SparkSession.builder.appName("miniotosql_50parquet")
session_builder = session_builder.config('spark.driver.host', 'spark-edge.service.consul')
spark = session_builder.config(conf=conf).getOrCreate()

# Runtime SQL options applied once the session exists.
for sql_option, sql_value in (
    ("spark.sql.legacy.timeParserPolicy", "LEGACY"),
    ("spark.sql.debug.maxToStringFields", "10000"),
):
    spark.conf.set(sql_option, sql_value)

In [None]:
# File paths
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, StringType, DateType
# Source parquet dataset in MinIO (read through the s3a:// connector).
minio_parquet_path = "s3a://svuppu/hubtest1_50.parquet"
# NOTE(review): db_creds_path is not read anywhere in this cell; the
# credentials actually come from mysql_creds_path. Kept unchanged in case
# another cell relies on it — confirm which file is the intended one.
db_creds_path = "/home/svuppu/database-creds.txt"
mysql_creds_path = "/home/svuppu/mysql_creds.txt"

# Load the MySQL credentials from a comma-separated file.
# Layout assumed here: "<user>,<password>[,<jdbc_url>]" — TODO confirm the
# third field with the file's owner. A JDBC URL looks like
#   jdbc:mysql://jrh-521-database-vm0.service.consul:3306/svuppu
db_url = ""
try:
    # `with` closes the handle even when parsing fails. maxsplit=2 keeps any
    # commas inside the URL (e.g. multi-host JDBC URLs) intact.
    with open(mysql_creds_path, "r") as creds_fh:
        creds_file = [field.strip() for field in creds_fh.read().strip().split(",", 2)]
    dbuser, dbpass = creds_file[0], creds_file[1]
    if len(creds_file) > 2:
        db_url = creds_file[2]
except (OSError, IndexError, UnicodeError) as exc:
    # OSError: file missing or unreadable; IndexError: fewer than two fields.
    print(f"Could not load MySQL credentials from {mysql_creds_path}: {exc!r}")
    dbuser, dbpass = "", ""

try:
    # Fail fast with a clear message instead of letting the JDBC writer fail
    # on an empty URL after the parquet data has already been read.
    if not db_url:
        raise ValueError(
            f"No JDBC URL configured: add it as the third comma-separated field of {mysql_creds_path}"
        )

    # Read the parquet dataset from MinIO. Parquet carries its own schema, so
    # no CSV-style "header" option is needed.
    df = spark.read.parquet(minio_parquet_path)

    # Write to MySQL over JDBC. With mode("overwrite"), truncate=true makes
    # Spark empty the existing table rather than dropping and recreating it.
    (
        df.write
        .format("jdbc")
        .option("url", db_url)
        .option("dbtable", "fifties")
        .option("user", dbuser)
        .option("password", dbpass)
        .option("driver", "com.mysql.cj.jdbc.Driver")
        .option("truncate", 'true')
        .mode('overwrite')
        .save()
    )

    print("Data successfully written to MySQL table!")
finally:
    # Always release the cluster resources held by this session, even when
    # the read or write above raised.
    spark.stop()