In [2]:
import os
from getpass import getpass
from urllib.parse import quote

import pandas as pd
import psycopg2
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from sqlalchemy import create_engine

spark = SparkSession.builder.getOrCreate()

#DATABASE
# Connection settings shared by the psycopg2 connection and the SQLAlchemy engine.
host = 'localhost'
dbname = 'miatown'
user = 'postgres'
# getpass() prompts without echoing, so the password never ends up on screen
# or in the saved notebook output (input() would display it in clear text).
password = getpass('hay nhap pass')
port = '5432'


#PATH
# Folder holding the raw JSON exports; the trailing separator is kept as before.
path = 'C:\\Users\\thehu\\OneDrive\\Mia_town\\IT\\Data\\MIATOWN\\raw\\'

### Create/ Update Database

In [3]:
#CREATE TABLE
# Pass the settings as keyword arguments instead of a hand-built DSN string:
# a password containing spaces or quotes would corrupt the string form, and
# the configured port was previously never sent to the server.
conn = psycopg2.connect(
    host=host,
    dbname=dbname,
    user=user,
    password=password,
    port=port,
)

# Commit every statement automatically (no explicit conn.commit() needed)
conn.set_session(autocommit=True)
cur = conn.cursor()

# NOTE(review): the Spark preview further down prints amounts such as 383211.0
# and trans_date values with a time part — confirm INT / DATE are the intended
# column types.
try:
    cur.execute("""
                CREATE TABLE IF NOT EXISTS transaction(
                table_id VARCHAR,
                table_name VARCHAR,
                trans_id VARCHAR,
                amount_origin INT,
                voucher_amount_paid INT,
                total_amount INT,
                trans_date DATE,
                voucher_name VARCHAR,
                customer_name VARCHAR,
                customer_phone VARCHAR);
                """)
except psycopg2.Error as e:
    # Keep the best-effort behaviour (no re-raise) but show the driver's
    # message so the failure can actually be diagnosed.
    print('error in creating table')
    print(e)



def load_data_to_database(transaction):
    """Append the rows of ``transaction`` whose ``trans_id`` is not yet stored.

    Reads the ``trans_id`` values already present in the ``transaction`` table,
    drops every incoming row whose ``trans_id`` is among them, and appends the
    remaining rows to the table.

    Uses the module-level ``user``, ``password``, ``host``, ``port`` and
    ``dbname`` settings to build the SQLAlchemy engine.

    Parameters
    ----------
    transaction : pyspark.sql.DataFrame or pandas.DataFrame
        Rows to load. Must contain a ``trans_id`` column; the remaining columns
        are written as-is, so they need to match the table's columns.

    Returns
    -------
    None
    """
    ## IMPORT DATA TO DATABASE
    # Percent-encode the password: characters such as '@', '/' or ':' would
    # otherwise be parsed as part of the URL structure.
    engine = create_engine(
        f"postgresql://{user}:{quote(password, safe='')}@{host}:{port}/{dbname}"
    )

    # DataFrame.to_sql and boolean-mask filtering are pandas features, so a
    # Spark DataFrame is collected to pandas first (this pulls all rows to the
    # driver); a pandas DataFrame is used unchanged.
    incoming = transaction.toPandas() if hasattr(transaction, 'toPandas') else transaction

    # Only the key column is needed to detect rows that were already loaded;
    # an empty table simply yields an empty Series.
    existing_ids = pd.read_sql_table('transaction', engine, columns=['trans_id'])['trans_id']

    #import new invoice to sql
    new_transaction = incoming[~incoming['trans_id'].isin(existing_ids)]
    new_transaction.to_sql('transaction', engine, if_exists='append', index=False)







### Transform Data

In [4]:
def df_combined(path):
    """Read every ``.json`` file in ``path`` with Spark and union them into one DataFrame.

    For each file the result keeps ten columns: ``table_id``, ``table_name``,
    ``trans_id`` (``tran_id`` of the first ``sale_detail`` element),
    ``amount_origin``, ``voucher_amount_paid``, ``total_amount``,
    ``trans_date`` (``created_at / 1000`` passed to ``from_unixtime`` —
    assumes ``created_at`` is epoch milliseconds, TODO confirm),
    ``voucher_name``, and ``customer_name`` / ``customer_phone`` taken from
    ``extra_data``.

    Parameters
    ----------
    path : str
        Directory to scan (not recursive). Works with or without a trailing
        path separator.

    Returns
    -------
    pyspark.sql.DataFrame or None
        The union of all files, or ``None`` when the directory contains no
        ``.json`` file.
    """
    combined = None
    print('----------  combine file  ----------')
    # os.path.join does not require `path` to end with a separator, and
    # sorted() makes the processing order independent of the OS listing order.
    json_files = sorted(
        os.path.join(path, name) for name in os.listdir(path) if name.endswith('.json')
    )
    for file in json_files:
        print(f'----------  Selecting from {file}  ----------')
        # One select builds the derived columns and fixes the column order,
        # which union() relies on because it matches columns by position.
        df = spark.read.json(file).select(
            'table_id',
            'table_name',
            col('sale_detail').getItem(0).getItem('tran_id').alias('trans_id'),
            'amount_origin',
            'voucher_amount_paid',
            'total_amount',
            from_unixtime(col("created_at") / 1000).alias('trans_date'),
            'voucher_name',
            col('extra_data.customer_name').alias('customer_name'),
            col('extra_data.customer_phone').alias('customer_phone'),
        )
        print(f'----------  Union {file}  ----------')
        combined = df if combined is None else combined.union(df)
    return combined

In [5]:
# Build the combined Spark DataFrame from every JSON file found under `path`
transaction = df_combined(path)

----------  combine file  ----------
----------  Selecting from C:\Users\thehu\OneDrive\Mia_town\IT\Data\MIATOWN\raw\bida_t04_2025.json  ----------
----------  Union C:\Users\thehu\OneDrive\Mia_town\IT\Data\MIATOWN\raw\bida_t04_2025.json  ----------
----------  Selecting from C:\Users\thehu\OneDrive\Mia_town\IT\Data\MIATOWN\raw\gamingps_t04_2025.json  ----------
----------  Union C:\Users\thehu\OneDrive\Mia_town\IT\Data\MIATOWN\raw\gamingps_t04_2025.json  ----------
----------  Selecting from C:\Users\thehu\OneDrive\Mia_town\IT\Data\MIATOWN\raw\kid_t04_2025.json  ----------
----------  Union C:\Users\thehu\OneDrive\Mia_town\IT\Data\MIATOWN\raw\kid_t04_2025.json  ----------


In [6]:
# Preview the combined rows.
# NOTE(review): the printed rows include customer names and phone numbers —
# clear this cell's output before sharing the notebook.
transaction.show()

+----------+-------------------+--------------------+-------------+-------------------+------------+-------------------+------------------+------------------+--------------+
|  table_id|         table_name|            trans_id|amount_origin|voucher_amount_paid|total_amount|         trans_date|      voucher_name|     customer_name|customer_phone|
+----------+-------------------+--------------------+-------------+-------------------+------------+-------------------+------------------+------------------+--------------+
|TABLE-FS07| Bàn 24 - BIDA BĂNG|P63JDN7MGD2P1ZLF2...|     383211.0|                0.0|    383211.0|2025-05-01 06:46:44|                  |     Phạm Hoàng Ân|   84985678036|
|TABLE-FS07| Bàn 24 - BIDA BĂNG|P63JDN7MGD2P75QPB...|     168674.0|                0.0|    168674.0|2025-05-01 06:46:41|                  |     Phạm Hoàng Ân|   84985678036|
|TABLE-5PKG|Bàn 20 - BIDA LIBRE|P63JDN7MGD2P1ZLF8...|     101792.0|                0.0|    101792.0|2025-05-01 06:42:05|          

In [None]:
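# TODO: read the table through Spark's JDBC source. options() accepts keyword
# arguments only, so a psycopg2 connection cannot be passed positionally
# (that is the TypeError shown below):
# spark.read.format('jdbc').options(url=..., dbtable=..., user=..., password=...).load()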


TypeError: DataFrameReader.options() takes 1 positional argument but 2 were given

In [8]:
# Register the DataFrame as a temporary view so it can be queried through spark.sql
transaction.createOrReplaceTempView("transaction_table")


In [10]:
# Query the temporary view with Spark SQL and print the trans_id column
spark.sql('select trans_id from transaction_table').show()

+--------------------+
|            trans_id|
+--------------------+
|P63JDN7MGD2P1ZLF2...|
|P63JDN7MGD2P75QPB...|
|P63JDN7MGD2P1ZLF8...|
|4PW46ZJ6VPRKJVY8U...|
|P63JDN7MGD2P1ZLF2...|
|4PW46ZJ6VPRK5IVI2...|
|4PW46ZJ6VPRK5IVI9...|
|P63JDN7MGD2P1ZLF5...|
|4PW46ZJ6VPRK5IVI3...|
|4PW46ZJ6VPRK5IVIA...|
|4PW46ZJ6VPRK5IVI0...|
|4PW46ZJ6VPRKJVY79...|
|4PW46ZJ6VPRK5IVI1...|
|4PW46ZJ6VPRK5IVHU...|
|4PW46ZJ6VPRKJVY80...|
|4PW46ZJ6VPRK5IVHU...|
|P63JDN7MGD2P1ZLF2...|
|4PW46ZJ6VPRK5IVHV...|
|4PW46ZJ6VPRK1ZLF3...|
|4PW46ZJ6VPRK5IVHL...|
+--------------------+
only showing top 20 rows

