## Config connection to Apache Spark local

In [1]:
import findspark
findspark.init(spark_home = "/home/thanhphat/BigData/spark-3.5.0-bin-hadoop3")

In [2]:
from pyspark.sql import SparkSession

import traceback
import pyspark.sql.functions as f
import pyspark.sql.types as t

In [3]:
project_name = "Global_Electronics_Retailer"

In [4]:
# Config
    # Threads: 4 CPU
    # Dirver: 1
    # Number of executor: 3
    # 1 CPU for each executor
    # 1g memory for each executor


# Create SparkSession
spark = SparkSession.builder.master("local[4]") \
                            .appName("Source_to_Bronze") \
                            .config("spark.sql.parquet.vorder.enabled", "true") \
                            .config("spark.sql.shuffle.partitions", 100) \
                            .config("spark.driver.memory", "1g") \
                            .config("spark.executor.instances", "3") \
                            .config("spark.executor.cores", "1") \
                            .config("spark.executor.memory", "1g") \
                            .config("spark.jars", "../driver/mysql-connector-j-8.1.0.jar") \
                            .getOrCreate()

spark.sparkContext.getConf().getAll()

24/06/28 22:42:50 WARN Utils: Your hostname, thanhphat-inspiron-5406-2n1 resolves to a loopback address: 127.0.1.1; using 192.168.1.8 instead (on interface wlp0s20f3)
24/06/28 22:42:50 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
24/06/28 22:42:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


[('spark.driver.port', '41959'),
 ('spark.executor.memory', '2g'),
 ('spark.master', 'local[4]'),
 ('spark.driver.host', '192.168.1.8'),
 ('spark.app.startTime', '1719589371736'),
 ('spark.app.submitTime', '1719589371545'),
 ('spark.executor.id', 'driver'),
 ('spark.executor.cores', '2'),
 ('spark.jars', '../driver/mysql-connector-j-8.1.0.jar'),
 ('spark.sql.shuffle.partitions', '100'),
 ('spark.driver.extraJavaOptions',
  '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base

In [5]:
import sys
sys.path.append("/home/thanhphat/PersonalProject/Global_Electronics_Retailer/source")

from modules.Extraction import *
from modules.HDFSUtils import *
from modules.LogUtils import *
from modules.Metadata import *


# Instance for modules
extraction = Extraction()
hdfsUtils = HDFSUtils()
logUtils = LogUtils() 
metadata = Metadata()

# Define base_path
file_path = f"hdfs://localhost:9000/lakehouse/LH_{project_name}/Files/Bronze"
log_path = f"hdfs://localhost:9000/lakehouse/LH_{project_name}/Files/log"

In [6]:
executionDate = str(spark.sql("SELECT CURRENT_DATE()").collect()[0][0])

# Partition Execution Date
parse_execution = executionDate.split("-")
year = parse_execution[0]
month = parse_execution[1]
day = parse_execution[2]

                                                                                

In [7]:
# # Define parameter for connect to MySQL
# database = "Global_Electronics_Retailer"
# dbname = f"jdbc:mysql://localhost:3306/{database}"
# driver = "com.mysql.jdbc.Driver"
# username = "root"
# password = "password"

# df = extraction.read_table_mysql(spark, driver, dbname, "customers", username, password)

# display(df)

## Metadata Table Action

In [8]:
# Read metadata action
# metadata_action = metadata.read_metadata_action("admin", "admin", "metadata", "config_table", \
#                                                 "CusDB -> Bronze")

from airflow.models import Variable
metadata_action = Variable.get(key = "metadata_action", deserialize_json=True, default_var=None)

# Define for log job
batch_run = hdfsUtils.check_batch_run(project_name, executionDate)
start_time = ""
end_time = ""
error = ""
status = ""
source_row_read = 0
numInserted = 0
numUpdated = 0


# Define parameter for connect to MySQL
database = "Global_Electronics_Retailer"
dbname = f"jdbc:mysql://localhost:3306/{database}"
driver = "com.mysql.jdbc.Driver"
username = "root"
password = "password"


# tblNames
tblNames = ["customers", "sales", "products", "stores", "exchange_rates"]


# # Read all table
for metadata in metadata_action:

    task_id = metadata["task_id"]
    task_name = metadata["task_name"]
    source_connection = metadata["source_connection"]
    source_database = metadata["source_database"]
    source_table = metadata["source_table"].lower()
    phase = metadata["phase"]

    # Start time for check
    start_time = spark.sql(''' SELECT CURRENT_TIMESTAMP() as current_time ''') \
                        .collect()[0]["current_time"].strftime('%Y-%m-%d %H:%M:%S')
    try:
        # Read data
        df = extraction.read_table_mysql(spark, driver, dbname, source_table, username, password)
        

        # Validate data
        source_row_read = df.count()
        numInserted = df.count()

        # Create new column for partition
        df = extraction.create_year_month_day(df, executionDate, f)
        
        # Display df
        # df.show()

        # Write data to HDFS
        code = hdfsUtils.check_exist_data(executionDate, project_name, source_table)
        # Exist file
        if code == 0: # Yes => Append for version data
            df.write.mode("append").format("parquet") \
                    .save(f"{file_path}/{source_table}/year={year}/month={month}/day={day}/{source_table}_{year}_{month}_{day}-version_{batch_run}.parquet")
        else: # No => First run
            df.write.mode("overwrite").format("parquet") \
                    .save(f"{file_path}/{source_table}/year={year}/month={month}/day={day}/{source_table}_{year}_{month}_{day}-version_{batch_run}.parquet")
    
    except:
        error = traceback.format_exc()
        status = "Failed"

        print("Task ", task_id, " ", status)

    else:
        error = ""
        status = "Success"
        print("Task ", task_id, " ", status)
    
    # End time for check
    end_time = spark.sql(''' SELECT CURRENT_TIMESTAMP() as current_time ''') \
                        .collect()[0]["current_time"].strftime('%Y-%m-%d %H:%M:%S')

    # Check status
    # print("Tablename: ", tblName, "Error: ", error, "Status: ", status, 
    #       "Source rows: ", source_row_read, "Num of rows Inserted: ", numInserted)


    df_log = logUtils.log_data(batch_run, task_name, source_connection, source_database, source_table, "parquet",
                 start_time, end_time, source_row_read, numInserted, numUpdated, "", 
                 "", error, status, phase, t, spark)

    df_log.write.mode("append").format("parquet").save(f"{log_path}/{executionDate}/batch_{batch_run}/")

Connected successfully!!!


ls: Call From thanhphat-Inspiron-5406-2n1/127.0.1.1 to localhost:9000 failed on connection exception: java.net.ConnectException: Connection refused; For more details see:  http://wiki.apache.org/hadoop/ConnectionRefused
Loading class `com.mysql.jdbc.Driver'. This is deprecated. The new driver class is `com.mysql.cj.jdbc.Driver'. The driver is automatically registered via the SPI and manual loading of the driver class is generally unnecessary.


Task  1   Failed


Py4JJavaError: An error occurred while calling o163.save.
: java.net.ConnectException: Call From thanhphat-inspiron-5406-2n1/127.0.1.1 to localhost:9000 failed on connection exception: java.net.ConnectException: Connection refused; For more details see:  http://wiki.apache.org/hadoop/ConnectionRefused
	at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
	at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
	at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
	at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
	at org.apache.hadoop.net.NetUtils.wrapWithMessage(NetUtils.java:913)
	at org.apache.hadoop.net.NetUtils.wrapException(NetUtils.java:828)
	at org.apache.hadoop.ipc.Client.getRpcResponse(Client.java:1616)
	at org.apache.hadoop.ipc.Client.call(Client.java:1558)
	at org.apache.hadoop.ipc.Client.call(Client.java:1455)
	at org.apache.hadoop.ipc.ProtobufRpcEngine2$Invoker.invoke(ProtobufRpcEngine2.java:242)
	at org.apache.hadoop.ipc.ProtobufRpcEngine2$Invoker.invoke(ProtobufRpcEngine2.java:129)
	at com.sun.proxy.$Proxy38.mkdirs(Unknown Source)
	at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.mkdirs(ClientNamenodeProtocolTranslatorPB.java:674)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:422)
	at org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeMethod(RetryInvocationHandler.java:165)
	at org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invoke(RetryInvocationHandler.java:157)
	at org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeOnce(RetryInvocationHandler.java:95)
	at org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:359)
	at com.sun.proxy.$Proxy39.mkdirs(Unknown Source)
	at org.apache.hadoop.hdfs.DFSClient.primitiveMkdir(DFSClient.java:2507)
	at org.apache.hadoop.hdfs.DFSClient.mkdirs(DFSClient.java:2483)
	at org.apache.hadoop.hdfs.DistributedFileSystem$27.doCall(DistributedFileSystem.java:1485)
	at org.apache.hadoop.hdfs.DistributedFileSystem$27.doCall(DistributedFileSystem.java:1482)
	at org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81)
	at org.apache.hadoop.hdfs.DistributedFileSystem.mkdirsInternal(DistributedFileSystem.java:1499)
	at org.apache.hadoop.hdfs.DistributedFileSystem.mkdirs(DistributedFileSystem.java:1474)
	at org.apache.hadoop.fs.FileSystem.mkdirs(FileSystem.java:2388)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.setupJob(FileOutputCommitter.java:356)
	at org.apache.spark.internal.io.HadoopMapReduceCommitProtocol.setupJob(HadoopMapReduceCommitProtocol.scala:188)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.writeAndCommit(FileFormatWriter.scala:269)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeWrite(FileFormatWriter.scala:304)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:190)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:190)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:113)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:111)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.executeCollect(commands.scala:125)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:437)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:85)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:83)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:142)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:859)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:388)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:361)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:240)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:750)
Caused by: java.net.ConnectException: Connection refused
	at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method)
	at sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:716)
	at org.apache.hadoop.net.SocketIOWithTimeout.connect(SocketIOWithTimeout.java:205)
	at org.apache.hadoop.net.NetUtils.connect(NetUtils.java:586)
	at org.apache.hadoop.ipc.Client$Connection.setupConnection(Client.java:711)
	at org.apache.hadoop.ipc.Client$Connection.setupIOstreams(Client.java:833)
	at org.apache.hadoop.ipc.Client$Connection.access$3800(Client.java:414)
	at org.apache.hadoop.ipc.Client.getConnection(Client.java:1677)
	at org.apache.hadoop.ipc.Client.call(Client.java:1502)
	... 69 more
