# Imported Library

In [58]:
import psycopg2
from psycopg2 import sql
import psycopg2.extras as extras
import pandas as pd
import json
from datetime import datetime,timezone
from dateutil import tz

from configupdater import ConfigUpdater
# pip install ConfigUpdater

from dotenv import dotenv_values

from google.cloud import bigquery
from google.cloud.exceptions import NotFound
from google.api_core.exceptions import BadRequest
from google.oauth2 import service_account

# Imported date

In [59]:
dt_imported=datetime.now(timezone.utc) # utc
dt_imported=datetime.strptime(dt_imported.strftime("%Y-%m-%d %H:%M:%S"),"%Y-%m-%d %H:%M:%S")
print(f"UTC: {dt_imported}")



UTC: 2023-12-29 06:08:40


# Set view

In [60]:
log = "models_logging_change"

view_name = "pmr_pm_plan"

if view_name == "pmr_pm_plan":
    content_id = 36
    view_name_id = "pm_id"

elif view_name == "pmr_pm_item":
    content_id = 37
    view_name_id = "pm_item_id"

elif view_name == "pmr_project":
    content_id = 7
    view_name_id = "project_id"

elif view_name == "pmr_inventory":
    content_id = 14
    view_name_id = "inventory_id"

else:
    raise Exception("No specified content type id")

# Set data and cofig path

In [61]:
projectId='kku-intern-dataai'  # smart-data-ml  or kku-intern-dataai
credential_file=r"C:\Windows\kku-intern-dataai-a5449aee8483.json"  # C:\Windows\smart-data-ml-91b6f6204773.json
credentials = service_account.Credentials.from_service_account_file(credential_file)

dataset_id='PMReport_Temp'  # 'SMartData_Temp'
table_name=view_name.replace("pmr_","temp_") #can change in ("name") to temp table
table_id = f"{projectId}.{dataset_id}.{table_name}"
print(table_id)

main_dataset_id='PMReport_Main'  # ='SMartDataAnalytics'
main_table_name=view_name.replace("pmr_","")
main_table_id = f"{projectId}.{main_dataset_id}.{main_table_name}"
print(main_table_id)

# https://cloud.google.com/bigquery/docs/reference/rest/v2/Job
to_bq_mode="WRITE_EMPTY"


client = bigquery.Client(credentials= credentials,project=projectId)


kku-intern-dataai.PMReport_Temp.temp_pm_plan
kku-intern-dataai.PMReport_Main.pm_plan


Read Configuration File and Initialize BQ Object

In [62]:
updater = ConfigUpdater()
updater.read(".cfg")

env_path='.env'
config = dotenv_values(dotenv_path=env_path)

In [63]:
last_imported=datetime.strptime(updater["metadata"][view_name].value,"%Y-%m-%d %H:%M:%S")
print(f"UTC:{last_imported}")

# local_zone = tz.tzlocal()
# last_imported = last_imported.astimezone(local_zone)
# print(f"Local Asia/Bangkok:{last_imported}")

UTC:2023-12-29 06:01:02


# Postgres &BigQuery

In [64]:
def get_postgres_conn():
 try:
  conn = psycopg2.connect(
        database=config['DATABASES_NAME'], user=config['DATABASES_USER'],
      password=config['DATABASES_PASSWORD'], host=config['DATABASES_HOST']
     )
  return conn

 except Exception as error:
  print(error)      
  raise error
def list_data(sql,params,connection):
 df=None   
 with connection.cursor() as cursor:
    
    if params is None:
       cursor.execute(sql)
    else:
       cursor.execute(sql,params)
    
    columns = [col[0] for col in cursor.description]
    dataList = [dict(zip(columns, row)) for row in cursor.fetchall()]
    df = pd.DataFrame(data=dataList) 
 return df 

In [65]:
def get_bq_table():
 try:
    table=client.get_table(table_id)  # Make an API request.
    print("Table {} already exists.".format(table_id))
    print(table.schema)
    return True
 except NotFound:
    raise Exception("Table {} is not found.".format(table_id))
    
def collectBQError(x_job):
 if x_job.errors is not None:
    for error in x_job.errors:  
      msg=f"{error['reason']} - {error['message']}"
      listError.append([datetime.now().strftime("%Y-%m-%d %H:%M:%S"),dtStr_imported,source_name,msg])
    if   len(listError)>0:
     logErrorMessage(listError,False)  

    
def insertDataFrameToBQ(df_trasns):
    try:
        job_config = bigquery.LoadJobConfig(write_disposition=to_bq_mode,)
        job = client.load_table_from_dataframe(df_trasns, table_id, job_config=job_config)
        try:
         job.result()  # Wait for the job to complete.
        except ClientError as e:
         print(job.errors)

        print("Total ", len(df_trasns), f"Imported data to {table_id} on bigquery successfully")

    except BadRequest as e:
        print("Bigquery Error\n")
        print(e) 

# Load log table

In [66]:
sql_log = f"""
SELECT object_id, action,TO_CHAR(date_created,'YYYY-MM-DD HH24:MI:SS') as date_created FROM {log}
WHERE date_created  AT time zone 'utc' >= '{last_imported}' AND content_type_id = {content_id} ORDER BY object_id, date_created
"""
print(sql_log)


# Asia/Bangkok 



lf = list_data(sql_log, None, get_postgres_conn())
print(f"Retrieve all rows after {last_imported}")
print(lf.info())

if lf.empty==True:
    print("No row to be imported.")
    #exit()
lf


SELECT object_id, action,TO_CHAR(date_created,'YYYY-MM-DD HH24:MI:SS') as date_created FROM models_logging_change
WHERE date_created  AT time zone 'utc' >= '2023-12-29 06:01:02' AND content_type_id = 36 ORDER BY object_id, date_created

Retrieve all rows after 2023-12-29 06:01:02
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   object_id     1 non-null      object
 1   action        1 non-null      object
 2   date_created  1 non-null      object
dtypes: object(3)
memory usage: 152.0+ bytes
None


Unnamed: 0,object_id,action,date_created
0,44,added,2023-12-29 13:07:55


# Get all actions from log table by selecting unique object_id and setting by doing something as logic 

In [67]:
listIDs=lf["object_id"].unique().tolist()
listUpdateData=[]
for id in listIDs:
    lfTemp=lf.query("object_id==@id")
    # print(lfTemp)
    # print("----------------------------------------------------------------")
    
    
    first_row = lfTemp.iloc[0]
    last_row = lfTemp.iloc[-1]
    # print(first_row)
    # print(last_row)

    if len(lfTemp)==1:
        listUpdateData.append([id,first_row["action"]])
    else:
        if first_row["action"] == "added" and last_row["action"] == "deleted":
            continue
        elif first_row["action"] == "added" and last_row["action"] != "deleted":
            listUpdateData.append([id,"added"])
        else : listUpdateData.append([id,last_row["action"]])
# print(listUpdateData)



# Create  id and action dataframe form filtered rows from log table

In [68]:
print("Convert listUpdate to dataframe")
dfUpdateData = pd.DataFrame(listUpdateData, columns= ['id', 'action'])
dfUpdateData['id'] = dfUpdateData['id'].astype('int64')
print(dfUpdateData.info())


dfUpdateData

Convert listUpdate to dataframe
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      1 non-null      int64 
 1   action  1 non-null      object
dtypes: int64(1), object(1)
memory usage: 144.0+ bytes
None


Unnamed: 0,id,action
0,44,added


In [69]:
print("iI the main table is empty , so the action of each row  must be 'added' on temp table")

rows_iter   = client.list_rows(main_table_id, max_results=1) 
if(len(list(rows_iter))==0):
   print(f"No data in {main_table_id}, so all rows in {table_id} action will be 'added' except 'deleted'")
   dfUpdateData=dfUpdateData.query("action!='deleted'")
   dfUpdateData['action']='added'


dfUpdateData=dfUpdateData.sort_values(by="id")
dfUpdateData=dfUpdateData.reset_index(drop=True)

print(dfUpdateData.info())
# Display the DataFrame

dfUpdateData

iI the main table is empty , so the action of each row  must be 'added' on temp table
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      1 non-null      int64 
 1   action  1 non-null      object
dtypes: int64(1), object(1)
memory usage: 144.0+ bytes
None


Unnamed: 0,id,action
0,44,added


# Load view and transform

In [70]:
listUpdateIDs = dfUpdateData['id'].tolist()
print(listUpdateIDs)
if len(listUpdateIDs)>1:
 sql_view=f"select *  from {view_name}  where {view_name_id} in {tuple(listUpdateIDs)}"
else:
 sql_view=f"select *  from {view_name}  where {view_name_id} ={listUpdateIDs[0]}"
print(sql_view)
df=list_data(sql_view,None,get_postgres_conn())
print(df.info())
df

[44]
select *  from pmr_pm_plan  where pm_id =44
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   pm_id          1 non-null      int64 
 1   project_id     1 non-null      int64 
 2   planned_date   1 non-null      object
 3   ended_pm_date  1 non-null      object
 4   pm_period      1 non-null      object
 5   team_lead      1 non-null      object
dtypes: int64(2), object(4)
memory usage: 176.0+ bytes
None


Unnamed: 0,pm_id,project_id,planned_date,ended_pm_date,pm_period,team_lead
0,44,26,2023-12-01,2023-12-31,1/2,Jarin Utong


# Merge LogDF and ViewDF

In [71]:
merged_df = pd.merge(df, dfUpdateData, left_on=view_name_id, right_on='id', how='inner')
merged_df = merged_df.drop(columns=['id'])
merged_df

Unnamed: 0,pm_id,project_id,planned_date,ended_pm_date,pm_period,team_lead,action
0,44,26,2023-12-01,2023-12-31,1/2,Jarin Utong,added


# Get Deleted Items  to Create deleted dataframe by using listDeleted
## if there is one deletd row then  we will merge it to master dataframe

In [72]:
listSelected = df[view_name_id].tolist()
print(listSelected)

set1 = set(listUpdateIDs)
set2 = set(listSelected)
listDeleted = list(set1.symmetric_difference(set2))

print(listDeleted)

if len(listDeleted)>0:
    print("There are some deleted rows")
    dfDeleted=pd.DataFrame(data=listDeleted,columns=[view_name_id])
    dfDeleted['action']='deleted'
    print(dfDeleted)
    
    merged_df=pd.concat([merged_df,dfDeleted],axis=0)

    
else:
    print("No row deleted")
    


[44]
[]
No row deleted


# Check duplicate ID

In [73]:
hasDplicateIDs = merged_df[view_name_id].duplicated().any()
if  hasDplicateIDs:
 raise Exception("There are some duplicate id on dfUpdateData")
else:
 print(f"There is no duplicate {view_name_id} ID")   

# merged_df['imported_at']=dt_imported
merged_df=merged_df.reset_index(drop=True  )
merged_df

There is no duplicate pm_id ID


Unnamed: 0,pm_id,project_id,planned_date,ended_pm_date,pm_period,team_lead,action
0,44,26,2023-12-01,2023-12-31,1/2,Jarin Utong,added


# Insert data to BQ data frame

In [74]:
if get_bq_table():
    try:
        insertDataFrameToBQ(merged_df)
    except Exception as ex:
        raise ex

Table kku-intern-dataai.PMReport_Temp.temp_pm_plan already exists.
[SchemaField('pm_id', 'INTEGER', 'NULLABLE', None, None, (), None), SchemaField('project_id', 'INTEGER', 'NULLABLE', None, None, (), None), SchemaField('planned_date', 'DATE', 'NULLABLE', None, None, (), None), SchemaField('ended_pm_date', 'DATE', 'NULLABLE', None, None, (), None), SchemaField('pm_period', 'STRING', 'NULLABLE', None, None, (), None), SchemaField('team_lead', 'STRING', 'NULLABLE', None, None, (), None), SchemaField('action', 'STRING', 'NULLABLE', None, None, (), None)]
Total  1 Imported data to kku-intern-dataai.PMReport_Temp.temp_pm_plan on bigquery successfully


In [75]:
updater["metadata"][view_name].value=dt_imported.strftime("%Y-%m-%d %H:%M:%S")
updater.update_file() 

<ConfigUpdater [
    <Section: 'metadata' [
        <Option: pmr_pm_plan = '2023-12-29 06:08:40'>
        <Option: pmr_pm_item = '2023-12-01 00:00:05'>
        <Option: pmr_project = '2023-12-01 00:00:05'>
        <Option: pmr_inventory = '2023-12-01 00:00:05'>
    ]>
]>

In [76]:
print(datetime.now(timezone.utc) )

2023-12-29 06:08:48.049419+00:00
