# Imported Library

In [353]:
import psycopg2
from psycopg2 import sql
import psycopg2.extras as extras
import pandas as pd
import json
from datetime import datetime,timezone

from configupdater import ConfigUpdater
# pip install ConfigUpdater

from dotenv import dotenv_values

from google.cloud import bigquery
from google.cloud.exceptions import NotFound
from google.api_core.exceptions import BadRequest
from google.oauth2 import service_account

# Imported date

In [354]:
#dt_imported=datetime.now(timezone.utc) # utc
dt_imported=datetime.now()
dt_imported=datetime.strptime(dt_imported.strftime("%Y-%m-%d %H:%M:%S"),"%Y-%m-%d %H:%M:%S")
print(dt_imported)

2023-12-27 00:49:53


# Set view

In [355]:
log = "models_logging_change"

view_name = "pmr_pm_plan"

if view_name == "pmr_pm_plan":
    content_id = 36
    view_name_id = "pm_id"

elif view_name == "pmr_pm_item":
    content_id = 37
    view_name_id = "pm_item_id"

elif view_name == "pmr_project":
    content_id = 7
    view_name_id = "project_id"

elif view_name == "pmr_inventory":
    content_id = 14
    view_name_id = "inventory_id"

else:
    raise Exception("No specified content type id")

# Set data and cofig path

In [356]:
projectId='smart-data-ml'  # smart-data-ml  or kku-intern-dataai
dataset_id='PMReport_Temp'
table_name=view_name.replace("pmr_","temp_") #can change in ("name") to temp table
table_id = f"{projectId}.{dataset_id}.{table_name}"
print(table_id)

main_dataset_id='PMReport_Main'
main_table_name=view_name.replace("pmr_","")
main_table_id = f"{projectId}.{main_dataset_id}.{main_table_name}"
print(main_table_id)

# https://cloud.google.com/bigquery/docs/reference/rest/v2/Job
to_bq_mode="WRITE_EMPTY"

credentials = service_account.Credentials.from_service_account_file(r'C:\Windows\smart-data-ml-91b6f6204773.json')
client = bigquery.Client(credentials= credentials,project=projectId)
print(client)

smart-data-ml.PMReport_Temp.temp_pm_plan
smart-data-ml.PMReport_Main.pm_plan
<google.cloud.bigquery.client.Client object at 0x0000021671A18B20>


Read Configuration File and Initialize BQ Object

In [357]:
updater = ConfigUpdater()
updater.read(".cfg")

env_path='.env'
config = dotenv_values(dotenv_path=env_path)

In [358]:
last_imported=datetime.strptime(updater["metadata"][view_name].value,"%Y-%m-%d %H:%M:%S")
print(last_imported)

2023-12-01 00:05:00


# Postgres &BigQuery

In [359]:
def get_postgres_conn():
 try:
  conn = psycopg2.connect(
        database=config['DATABASES_NAME'], user=config['DATABASES_USER'],
      password=config['DATABASES_PASSWORD'], host=config['DATABASES_HOST']
     )
  return conn

 except Exception as error:
  print(error)      
  raise error
def list_data(sql,params,connection):
 df=None   
 with connection.cursor() as cursor:
    
    if params is None:
       cursor.execute(sql)
    else:
       cursor.execute(sql,params)
    
    columns = [col[0] for col in cursor.description]
    dataList = [dict(zip(columns, row)) for row in cursor.fetchall()]
    df = pd.DataFrame(data=dataList) 
 return df 

In [360]:
def get_bq_table():
 try:
    table=client.get_table(table_id)  # Make an API request.
    print("Table {} already exists.".format(table_id))
    print(table.schema)
    return True
 except NotFound:
    raise Exception("Table {} is not found.".format(table_id))
    
def collectBQError(x_job):
 if x_job.errors is not None:
    for error in x_job.errors:  
      msg=f"{error['reason']} - {error['message']}"
      listError.append([datetime.now().strftime("%Y-%m-%d %H:%M:%S"),dtStr_imported,source_name,msg])
    if   len(listError)>0:
     logErrorMessage(listError,False)  

    
def insertDataFrameToBQ(df_trasns):
    try:
        job_config = bigquery.LoadJobConfig(write_disposition=to_bq_mode,)
        job = client.load_table_from_dataframe(df_trasns, table_id, job_config=job_config)
        try:
         job.result()  # Wait for the job to complete.
        except ClientError as e:
         print(job.errors)

        print("Total ", len(df_trasns), f"Imported data to {table_id} on bigquery successfully")

    except BadRequest as e:
        print("Bigquery Error\n")
        print(e) 

# Load log table

In [361]:
sql_log = f"""
SELECT object_id, action,TO_CHAR(date_created,'YYYY-MM-DD HH24:MI:SS') as date_created FROM {log}
WHERE date_created >= '{last_imported}' AND content_type_id = {content_id} ORDER BY object_id, date_created
"""
print(sql_log)

lf = list_data(sql_log, None, get_postgres_conn())
print(f"Retrieve all rows after {last_imported}")
print(lf.info())

if lf.empty==True:
    print("No row to be imported.")
    #exit()
lf


SELECT object_id, action,TO_CHAR(date_created,'YYYY-MM-DD HH24:MI:SS') as date_created FROM models_logging_change
WHERE date_created >= '2023-12-01 00:05:00' AND content_type_id = 36 ORDER BY object_id, date_created

Retrieve all rows after 2023-12-01 00:05:00
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56 entries, 0 to 55
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   object_id     56 non-null     object
 1   action        56 non-null     object
 2   date_created  56 non-null     object
dtypes: object(3)
memory usage: 1.4+ KB
None


Unnamed: 0,object_id,action,date_created
0,10,changed,2023-12-13 14:04:42
1,12,changed,2023-12-26 18:29:04
2,14,changed,2023-12-12 15:03:12
3,14,changed,2023-12-12 15:04:04
4,15,deleted,2023-12-12 15:03:28
5,16,deleted,2023-12-26 18:26:50
6,18,changed,2023-12-13 13:59:08
7,19,changed,2023-12-26 18:36:13
8,19,changed,2023-12-26 18:36:23
9,20,added,2023-12-12 14:49:17


# Get all actions from log table by selecting unique object_id and setting by doing something as logic 

In [362]:
listIDs=lf["object_id"].unique().tolist()
listUpdateData=[]
for id in listIDs:
    lfTemp=lf.query("object_id==@id")
    # print(lfTemp)
    # print("----------------------------------------------------------------")
    
    
    first_row = lfTemp.iloc[0]
    last_row = lfTemp.iloc[-1]
    # print(first_row)
    # print(last_row)

    if len(lfTemp)==1:
        listUpdateData.append([id,first_row["action"]])
    else:
        if first_row["action"] == "added" and last_row["action"] == "deleted":
            continue
        elif first_row["action"] == "added" and last_row["action"] != "deleted":
            listUpdateData.append([id,"added"])
        else : listUpdateData.append([id,last_row["action"]])
# print(listUpdateData)



# Create  id and action dataframe form filtered rows from log table

In [363]:
print("Convert listUpdate to dataframe")
dfUpdateData = pd.DataFrame(listUpdateData, columns= ['id', 'action'])
dfUpdateData['id'] = dfUpdateData['id'].astype('int64')
print(dfUpdateData.info())
dfUpdateData

Convert listUpdate to dataframe
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      21 non-null     int64 
 1   action  21 non-null     object
dtypes: int64(1), object(1)
memory usage: 464.0+ bytes
None


Unnamed: 0,id,action
0,10,changed
1,12,changed
2,14,changed
3,15,deleted
4,16,deleted
5,18,changed
6,19,changed
7,21,added
8,24,added
9,25,added


In [364]:
print("if the main table is empty , so the action of each row  must be 'added' on temp table")

rows_iter   = client.list_rows(main_table_id, max_results=1) 
if(len(list(rows_iter))==0):
   print(f"No data in {main_table_id}, so all rows in {table_id} action will be 'added' except delete")
   dfUpdateData=dfUpdateData.query("action!='deleted'")
   dfUpdateData['action']='added'
    



if the main table is empty , so the action of each row  must be 'added' on temp table
No data in smart-data-ml.PMReport_Main.pm_plan, so all rows in smart-data-ml.PMReport_Temp.temp_pm_plan action will be 'added' except delete


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfUpdateData['action']='added'


In [365]:
dfUpdateData=dfUpdateData.sort_values(by="id")
dfUpdateData=dfUpdateData.reset_index(drop=True)

print(dfUpdateData.info())
# Display the DataFrame

dfUpdateData

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      19 non-null     int64 
 1   action  19 non-null     object
dtypes: int64(1), object(1)
memory usage: 432.0+ bytes
None


Unnamed: 0,id,action
0,6,added
1,7,added
2,10,added
3,12,added
4,14,added
5,18,added
6,19,added
7,21,added
8,24,added
9,25,added


# Load view and transform

In [366]:
listUpdateIDs = dfUpdateData['id'].tolist()
print(listUpdateIDs)
if len(listUpdateIDs)>1:
 sql_view=f"select *  from {view_name}  where {view_name_id} in {tuple(listUpdateIDs)}"
else:
 sql_view=f"select *  from {view_name}  where {view_name_id} ={listUpdateIDs[0]}"
print(sql_view)
df=list_data(sql_view,None,get_postgres_conn())
print(df.info())
df

[6, 7, 10, 12, 14, 18, 19, 21, 24, 25, 30, 32, 33, 34, 35, 36, 37, 38, 39]
select *  from pmr_pm_plan  where pm_id in (6, 7, 10, 12, 14, 18, 19, 21, 24, 25, 30, 32, 33, 34, 35, 36, 37, 38, 39)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   pm_id          19 non-null     int64 
 1   project_id     19 non-null     int64 
 2   planned_date   19 non-null     object
 3   ended_pm_date  19 non-null     object
 4   team_lead      19 non-null     object
dtypes: int64(2), object(3)
memory usage: 888.0+ bytes
None


Unnamed: 0,pm_id,project_id,planned_date,ended_pm_date,team_lead
0,6,14,2023-12-01,2023-12-30,Pongthorn Sangkaphet
1,7,14,2023-12-16,2023-12-31,Pongthorn Sangkaphet
2,14,20,2024-01-01,2024-01-15,Boonlert Kawta
3,25,21,2023-12-13,2023-12-31,Chaipus Aupatamwipanon
4,18,21,2024-01-01,2024-04-29,Chatchai Glaewtanong
5,10,17,2023-12-09,2024-12-06,Titiphun Paisanphong
6,21,20,2023-12-01,2023-12-15,Jitchanok Nakeiam
7,24,18,2023-12-01,2024-01-31,Pairoj Janomrung
8,30,20,2023-12-15,2023-12-16,Suwat Sirivutcharungchit
9,32,23,2024-01-01,2024-01-31,Pongthorn Sangkaphet


# Merge LogDF and ViewDF

In [367]:
merged_df = pd.merge(df, dfUpdateData, left_on=view_name_id, right_on='id', how='inner')
merged_df = merged_df.drop(columns=['id'])
merged_df

Unnamed: 0,pm_id,project_id,planned_date,ended_pm_date,team_lead,action
0,6,14,2023-12-01,2023-12-30,Pongthorn Sangkaphet,added
1,7,14,2023-12-16,2023-12-31,Pongthorn Sangkaphet,added
2,14,20,2024-01-01,2024-01-15,Boonlert Kawta,added
3,25,21,2023-12-13,2023-12-31,Chaipus Aupatamwipanon,added
4,18,21,2024-01-01,2024-04-29,Chatchai Glaewtanong,added
5,10,17,2023-12-09,2024-12-06,Titiphun Paisanphong,added
6,21,20,2023-12-01,2023-12-15,Jitchanok Nakeiam,added
7,24,18,2023-12-01,2024-01-31,Pairoj Janomrung,added
8,30,20,2023-12-15,2023-12-16,Suwat Sirivutcharungchit,added
9,32,23,2024-01-01,2024-01-31,Pongthorn Sangkaphet,added


# Get Deleted Items  to Create deleted dataframe by using listDeleted
## if there is one deletd row then  we will merge it to master dataframe

In [368]:
listSelected = df[view_name_id].tolist()
print(listSelected)

set1 = set(listUpdateIDs)
set2 = set(listSelected)
listDeleted = list(set1.symmetric_difference(set2))

print(listDeleted)

if len(listDeleted)>0:
    print("There are some deleted rows")
    dfDeleted=pd.DataFrame(data=listDeleted,columns=[view_name_id])
    dfDeleted['action']='deleted'
    print(dfDeleted)
    
    merged_df=pd.concat([merged_df,dfDeleted],axis=0)
    
else:
    print("No row deleted")
    


[6, 7, 14, 25, 18, 10, 21, 24, 30, 32, 34, 35, 36, 33, 37, 38, 12, 39, 19]
[]
No row deleted


# Final Transformation

In [369]:
merged_df['imported_at']=dt_imported
merged_df=merged_df.reset_index(drop=True  )
merged_df

Unnamed: 0,pm_id,project_id,planned_date,ended_pm_date,team_lead,action,imported_at
0,6,14,2023-12-01,2023-12-30,Pongthorn Sangkaphet,added,2023-12-27 00:49:53
1,7,14,2023-12-16,2023-12-31,Pongthorn Sangkaphet,added,2023-12-27 00:49:53
2,14,20,2024-01-01,2024-01-15,Boonlert Kawta,added,2023-12-27 00:49:53
3,25,21,2023-12-13,2023-12-31,Chaipus Aupatamwipanon,added,2023-12-27 00:49:53
4,18,21,2024-01-01,2024-04-29,Chatchai Glaewtanong,added,2023-12-27 00:49:53
5,10,17,2023-12-09,2024-12-06,Titiphun Paisanphong,added,2023-12-27 00:49:53
6,21,20,2023-12-01,2023-12-15,Jitchanok Nakeiam,added,2023-12-27 00:49:53
7,24,18,2023-12-01,2024-01-31,Pairoj Janomrung,added,2023-12-27 00:49:53
8,30,20,2023-12-15,2023-12-16,Suwat Sirivutcharungchit,added,2023-12-27 00:49:53
9,32,23,2024-01-01,2024-01-31,Pongthorn Sangkaphet,added,2023-12-27 00:49:53


# Insert data to BQ data frame

In [370]:
if get_bq_table():
    try:
        insertDataFrameToBQ(merged_df)
    except Exception as ex:
        raise ex

Table smart-data-ml.PMReport_Temp.temp_pm_plan already exists.
[SchemaField('pm_id', 'INTEGER', 'NULLABLE', None, None, (), None), SchemaField('project_id', 'INTEGER', 'NULLABLE', None, None, (), None), SchemaField('planned_date', 'DATE', 'NULLABLE', None, None, (), None), SchemaField('ended_pm_date', 'DATE', 'NULLABLE', None, None, (), None), SchemaField('team_lead', 'STRING', 'NULLABLE', None, None, (), None), SchemaField('imported_at', 'DATETIME', 'REQUIRED', None, None, (), None), SchemaField('action', 'STRING', 'NULLABLE', None, None, (), None)]
Total  19 Imported data to smart-data-ml.PMReport_Temp.temp_pm_plan on bigquery successfully


In [371]:
updater["metadata"][view_name].value=dt_imported.strftime("%Y-%m-%d %H:%M:%S")
updater.update_file() 

<ConfigUpdater [
    <Section: 'metadata' [
        <Option: pmr_project = '2023-12-01 00:05:00'>
        <Option: pmr_pm_plan = '2023-12-27 00:49:53'>
        <Option: pmr_pm_item = '2023-12-01 00:05:00'>
        <Option: pmr_inventory = '2023-12-01 00:05:00'>
    ]>
]>