In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import psycopg2
import pandas as pd
from sqlalchemy import create_engine
# getpass hides the password while it is typed; input() would echo it into
# the notebook output, where it gets saved together with the .ipynb file.
from getpass import getpass

# Spark session used by every cell below.
# The JDBC driver jar has to be registered under "spark.jars" (plural);
# "spark.jar" is not a Spark setting, so the jar would never be put on the
# classpath. NOTE: getOrCreate() returns an already-running session as-is,
# so restart the kernel for a changed jar path / driver memory to apply.
spark = (
    SparkSession.builder
    .config("spark.jars", "C:\\Users\\thehu\\OneDrive\\Mia_town\\IT\\Data\\DE\\study_de\\postgresql-42.7.5.jar")
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)

#DATABASE
# Connection settings shared by psycopg2 (DDL) and Spark JDBC (reads/writes).
host = 'localhost'
dbname = 'miatown'
user = 'postgres'
password = getpass('hay nhap pass')
port = '5432'
driver = "org.postgresql.Driver"

# Keyword arguments instead of an interpolated DSN string: a password that
# contains spaces, quotes or backslashes can no longer corrupt the DSN, and
# the port now matches the one used in the JDBC URL.
conn = psycopg2.connect(
    host=host,
    dbname=dbname,
    user=user,
    password=password,
    port=port,
)

#set commit automatically
conn.set_session(autocommit=True)
cur = conn.cursor()


#PATH
# Folder that holds the raw .json exports read by df_combined().
path = 'C:\\Users\\thehu\\OneDrive\\Mia_town\\IT\\Data\\MIATOWN\\raw\\'

### SOURCE CODE

In [2]:
# #CREATE TABLE
# conn = psycopg2.connect(f'host= {host} \
#                         dbname= {dbname} \
#                         user= {user} \
#                         password= {password} \
#                         ')

# #set commit automatically
# conn.set_session(autocommit=True)
# cur = conn.cursor()
def create_database():
    """Create the ``transaction`` table if it does not exist yet.

    Runs the DDL through the module-level psycopg2 cursor ``cur``. The
    statement uses ``IF NOT EXISTS``, so an already existing table is not an
    error; any ``psycopg2.Error`` that does occur therefore has another cause
    (permissions, lost connection, ...) and is printed as-is rather than
    being reported as "table exists".

    Returns:
        None. Errors are reported on stdout and not re-raised, so the caller
        keeps running as before.
    """
    ddl = """
        CREATE TABLE IF NOT EXISTS transaction(
            table_id VARCHAR,
            table_name VARCHAR,
            trans_id VARCHAR,
            amount_origin INT,
            voucher_amount_paid INT,
            total_amount INT,
            trans_date DATE,
            voucher_name VARCHAR,
            customer_name VARCHAR,
            customer_phone VARCHAR);
    """
    try:
        cur.execute(ddl)
    except psycopg2.Error as e:
        # Show the real database error so the failure can be diagnosed.
        print(f'error in creating table: {e}')
    print("Database work finished")
    return None


def df_combined(path):
    """Read every ``.json`` file directly inside ``path`` and union them.

    Each file is loaded with the module-level ``spark`` session, reduced to
    the ten columns of the ``transaction`` table and appended to the running
    result with ``DataFrame.union`` (positional; safe here because every
    frame is selected in the same column order).

    Args:
        path: Folder containing the raw JSON exports. A trailing path
            separator is optional.

    Returns:
        The combined Spark DataFrame, or ``None`` when the folder contains
        no ``.json`` file.
    """
    combined = None
    print('----------  combine file  ----------')
    # os.path.join produces a valid path whether or not `path` ends with a
    # separator (plain concatenation silently glued folder and file name
    # together); sorting makes the load order deterministic.
    json_files = sorted(
        os.path.join(path, name)
        for name in os.listdir(path)
        if name.endswith('.json')
    )
    for file in json_files:
        print(f'----------  Selecting from {file}  ----------')
        df = spark.read.json(file)
        df = (
            # Lift the nested customer fields to top-level columns.
            df.withColumn('customer_name', col('extra_data.customer_name'))
              .withColumn('customer_phone', col('extra_data.customer_phone'))
              # The transaction id is taken from the first sale_detail entry.
              .withColumn('trans_id', col('sale_detail').getItem(0).getItem('tran_id'))
              # created_at / 1000 is fed to from_unixtime (presumably epoch
              # milliseconds -> seconds; confirm against the export format),
              # then shifted back 7 hours as a manual time-zone adjustment.
              .withColumn("trans_date", (from_unixtime(col("created_at") / 1000)).cast('timestamp') - expr("INTERVAL 7 HOURS"))
        )
        df = df.select(
            'table_id',
            'table_name',
            'trans_id',
            'amount_origin',
            'voucher_amount_paid',
            'total_amount',
            'trans_date',
            'voucher_name',
            'customer_name',
            'customer_phone',
        )
        print(f'----------  Union {file}  ----------')
        combined = df if combined is None else combined.union(df)
    return combined


def load_data_to_database(new_data_trans):
    """Append the rows of ``new_data_trans`` that are not yet in ``transaction``.

    Uses the module-level ``spark`` session and the connection settings
    ``host``, ``port``, ``dbname``, ``user``, ``password`` and ``driver``.

    * Empty table: every incoming row is appended.
    * Non-empty table: only rows whose ``trans_id`` is absent from the table
      are appended. The complete rows are written; selecting just the ids
      that survive an ``EXCEPT`` and writing that frame inserts rows in which
      every column except ``trans_id`` is NULL.

    Side effects: registers the temp views ``new_data_trans`` and
    ``old_data_trans`` on the Spark session (non-empty branch only).
    Incoming rows are not de-duplicated among themselves.

    Args:
        new_data_trans: Spark DataFrame with the ``transaction`` columns.

    Returns:
        None.
    """
    print('Connect to database')
    url = f"jdbc:postgresql://{host}:{port}/{dbname}"
    properties = {
        "user": f"{user}",
        "password": f"{password}",
        "driver": f"{driver}"
    }
    print('Completed')

    print("Checking database")
    count_query = "(SELECT COUNT(*) AS row_count FROM transaction) as temp"
    row_count = spark.read.jdbc(url, count_query, properties=properties).collect()[0]["row_count"]

    if row_count == 0:
        print("Table 'transaction' is empty. Appending new data...")
        new_data_trans.write.jdbc(url, "transaction", mode='append', properties=properties)
        print("Data successfully appended to 'transaction' table.")
        return None

    print(f"Table 'transaction' already contains {row_count} rows. Checking for new data.")
    # The derived table carries an alias: PostgreSQL releases before 16
    # reject a FROM-subquery without one.
    old_data_trans = spark.read.jdbc(
        url, "(SELECT trans_id FROM transaction) AS old_ids", properties=properties
    )
    new_data_trans.createOrReplaceTempView("new_data_trans")
    old_data_trans.createOrReplaceTempView("old_data_trans")
    # Anti join keeps the FULL incoming rows whose id is not stored yet.
    # `<=>` is null-safe equality, so a NULL trans_id matches a stored NULL
    # (the same way EXCEPT compares NULLs).
    rows_to_append = spark.sql("""
        SELECT n.*
        FROM new_data_trans n
        LEFT ANTI JOIN old_data_trans o
            ON n.trans_id <=> o.trans_id
    """)
    # count() runs the whole Spark job; evaluate it once and reuse the value.
    new_row_count = rows_to_append.count()
    if new_row_count == 0:
        print("No new data, finished process")
    else:
        print(f"There's {new_row_count} new data in transaction, appending....")
        rows_to_append.write.jdbc(url, "transaction", mode='append', properties=properties)
        print('Completed')
    return None
    
    


def main_task(path):
    """Run the whole pipeline for one folder of raw JSON exports.

    Steps: make sure the ``transaction`` table exists, combine all ``.json``
    files under ``path`` into one DataFrame, then load the new rows into the
    database.

    Args:
        path: Folder containing the raw ``.json`` files.

    Returns:
        Whatever ``load_data_to_database`` returns, or ``None`` when the
        folder holds no ``.json`` file (the load step is skipped then).
    """
    create_database()
    df = df_combined(path)
    if df is None:
        # df_combined returns None for a folder without .json files; passing
        # that on would fail with an AttributeError inside the load step.
        print(f"No .json files found in {path}, nothing to load")
        print("Finished")
        return None
    result = load_data_to_database(df)
    print("Finished")
    return result




In [3]:
# Folder holding the raw .json exports to ingest (same value as the `path` assigned in the first cell).
path = 'C:\\Users\\thehu\\OneDrive\\Mia_town\\IT\\Data\\MIATOWN\\raw\\'
# Run the pipeline: create the table if needed, combine the JSON files, load the rows.
main_task(path)


Database work finished
----------  combine file  ----------
----------  Selecting from C:\Users\thehu\OneDrive\Mia_town\IT\Data\MIATOWN\raw\bida_t04_2025.json  ----------
----------  Union C:\Users\thehu\OneDrive\Mia_town\IT\Data\MIATOWN\raw\bida_t04_2025.json  ----------
----------  Selecting from C:\Users\thehu\OneDrive\Mia_town\IT\Data\MIATOWN\raw\gamingps_t03_2025.json  ----------
----------  Union C:\Users\thehu\OneDrive\Mia_town\IT\Data\MIATOWN\raw\gamingps_t03_2025.json  ----------
----------  Selecting from C:\Users\thehu\OneDrive\Mia_town\IT\Data\MIATOWN\raw\gamingps_t04_2025.json  ----------
----------  Union C:\Users\thehu\OneDrive\Mia_town\IT\Data\MIATOWN\raw\gamingps_t04_2025.json  ----------
----------  Selecting from C:\Users\thehu\OneDrive\Mia_town\IT\Data\MIATOWN\raw\kid_t04_2025.json  ----------
----------  Union C:\Users\thehu\OneDrive\Mia_town\IT\Data\MIATOWN\raw\kid_t04_2025.json  ----------
Connect to database
Completed
Checking database
Table 'transaction' alread

In [4]:
# url = f"jdbc:postgresql://{host}:{port}/{dbname}"
# properties = {
#         "user": f"{user}",
#         "password": f"{password}",
#         "driver": f"{driver}"
#     }

# new_data_trans = df_combined(path)


# query = "(SELECT COUNT(*) AS row_count FROM transaction) as temp"
# count_old_data_trans = spark.read.jdbc(url, query, properties=properties)
# row_count = count_old_data_trans.collect()[0]["row_count"]
# old_data_trans = spark.read.jdbc(url ,"(SELECT trans_id FROM transaction)", properties = properties)

# if row_count == 0:
#     print("Table 'transaction' is empty. Appending new data...")
#        # Append data
#     new_data_trans.write.jdbc(url, "transaction", mode = 'append', properties = properties)
#     print("Data successfully appended to 'transaction' table.")

# else:
#     print(f"Table 'transaction' already contains {row_count} rows. Skipping append.")
#     new_data_trans.createOrReplaceTempView("new_data_trans")
#     old_data_trans.createOrReplaceTempView("old_data_trans")
#     checking = spark.sql( """
#             SELECT trans_id from new_data_trans
#             EXCEPT
#             SELECT trans_id from old_data_trans
# """)
#     if checking.count() == 0:
#         print("No new data, finished process")
#     else:
#         print("There's new data in transaction, appending...." )
#         checking.write.jdbc(url, "transaction", mode = 'append', properties = properties )
#         print('Completed')
          
