In [1]:
from snowflake.snowpark.session import Session
from snowflake.snowpark.types import *
from snowflake.snowpark.functions import col, lit
import getpass
import pandas as pd
import json
from get_secret import get_secret
from snowflake.snowpark.session import Session
import snowflake.connector

In [2]:
# Session-level settings layered on top of the stored secret. Defined once so the
# two parameter dicts below cannot drift apart.
SNOWFLAKE_OVERRIDES = {
    'warehouse': 'COMPUTE_WH',
    'database': 'ANALYTICS_PROD',
    'schema': 'IOATAWARE',
    "loglevel": 'DEBUG',
}

# Fetch and parse the secret a single time (get_secret is the project helper
# imported above; its payload must decode to a JSON object since it is updated
# like a dict).
connection_parameters = json.loads(get_secret())
connection_parameters.update(SNOWFLAKE_OVERRIDES)

# Set up the Snowflake connection parameters.
# A shallow copy keeps config_json independent of connection_parameters, as it
# was when each name was built from its own get_secret() call.
config_json = dict(connection_parameters)

# Create a connection object
conn = snowflake.connector.connect(**config_json)

In [3]:
# print(f"Current Database and schema: {session.get_fully_qualified_current_schema()}")
# print(f"Current Warehouse: {session.get_current_warehouse()}")

# Import and clean snowflake views

In [4]:
# Open a cursor on the existing connection for running SQL
cur = conn.cursor()

# Select the AIF report view rows for the flight date being compared
query = (
    'select * from ANALYTICS_PROD.IOATAWARE.VW_AIF_REPORT '
    'where "FlightDate" =\'06/03/2023\' '
)
cur.execute(query)

# Pull the whole result set into a pandas DataFrame
results_df = cur.fetch_pandas_all()

In [5]:
# Bind a second name to the fetched frame; this is the same object, not a copy.
snowflake_pd_df = results_df

In [6]:
# Inspect the column dtypes of the frame fetched from Snowflake
snowflake_pd_df.dtypes

Flight_ID                 int32
FlightDate               object
FlightNum                object
Departure                object
Arrival                  object
Tail_Identifier          object
Total_PAX_OnBoard         int64
Total_PAX_Enplaned        int64
Total_PAX_NonRev          int64
Total_PAX_FlowThru         int8
Total_PAX_Connecting       int8
Total_PAX_Deplaned        int64
Total_PAX_Deplaned2       int64
CAPACITY                float64
dtype: object

In [7]:
# Display the fetched rows for a visual sanity check
snowflake_pd_df

Unnamed: 0,Flight_ID,FlightDate,FlightNum,Departure,Arrival,Tail_Identifier,Total_PAX_OnBoard,Total_PAX_Enplaned,Total_PAX_NonRev,Total_PAX_FlowThru,Total_PAX_Connecting,Total_PAX_Deplaned,Total_PAX_Deplaned2,CAPACITY
0,122516,06/03/2023,1607,SFB,YYZ,918,148,148,0,0,0,148,0,189.0
1,104615,06/03/2023,303,YHZ,YOW,912,131,131,0,0,0,131,0,189.0
2,148131,06/03/2023,758,YLW,YEG,913,144,141,3,0,0,144,0,189.0
3,148173,06/03/2023,759,YEG,YLW,913,146,146,0,0,0,146,0,189.0
4,113534,06/03/2023,644,YWG,YYZ,A320CJL,148,148,0,0,0,148,0,174.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74,123418,06/03/2023,1777,LAS,YEG,909,136,136,0,0,0,136,0,189.0
75,106785,06/03/2023,503,YYC,YXX,913,121,121,0,0,0,121,0,189.0
76,148828,06/03/2023,1710,YEG,LAX,803,80,79,1,0,0,80,0,189.0
77,112666,06/03/2023,638,YEG,YYZ,803,157,152,5,0,0,157,0,189.0


In [8]:
# Build the cleaned frame from the untouched query result (results_df) so this
# cell can be re-run safely. Previously snowflake_pd_df was reassigned from
# itself, so a second run failed in drop() because the columns were already gone.
columns_to_remove = ['Total_PAX_Deplaned2','CAPACITY']
snowflake_pd_df = results_df.drop(columns=columns_to_remove)

# The query filters on the literal '06/03/2023', so FlightDate is NN/NN/YYYY text.
# The explicit month-first format yields the same values pandas' default parsing
# produced, without relying on format inference.
snowflake_pd_df['FlightDate'] = pd.to_datetime(snowflake_pd_df['FlightDate'], format='%m/%d/%Y')

# FlightNum arrives as text (object dtype); cast it to integers so it can be
# compared with the report's integer FlightNum column.
snowflake_pd_df['FlightNum'] = snowflake_pd_df['FlightNum'].astype(int)

In [9]:
# Re-check dtypes after cleaning (FlightDate and FlightNum were converted above)
snowflake_pd_df.dtypes

Flight_ID                        int32
FlightDate              datetime64[ns]
FlightNum                        int32
Departure                       object
Arrival                         object
Tail_Identifier                 object
Total_PAX_OnBoard                int64
Total_PAX_Enplaned               int64
Total_PAX_NonRev                 int64
Total_PAX_FlowThru                int8
Total_PAX_Connecting              int8
Total_PAX_Deplaned               int64
dtype: object

# Import and clean report data

In [10]:
# Load the exported report; skiprows=3 skips the first three lines of the file,
# so the next line is read as the header row.
report_df = pd.read_csv('AIF_REPORT/AIF_Report_0603.csv', skiprows=3)   

In [11]:
# Inspect the column dtypes pandas inferred for the report file
report_df.dtypes

Flight_ID                int64
FlightDate              object
FlightNum                int64
Departure               object
Arrival                 object
Tail_Identifier         object
Total_PAX_OnBoard        int64
Total_PAX_Enplaned       int64
Total_PAX_NonRev         int64
Total_PAX_FlowThru       int64
Total_PAX_Connecting     int64
Total_PAX_Deplaned       int64
Total_PAX_Deplaned2      int64
Capacity                 int64
dtype: object

In [12]:
# Drop the two columns excluded from the comparison, then parse FlightDate
# into datetimes.
columns_to_remove = ['Total_PAX_Deplaned2','Capacity']
report_df = (
    report_df
    .drop(columns=columns_to_remove)
    .assign(FlightDate=lambda frame: pd.to_datetime(frame['FlightDate']))
)

# Compare two dataframes

In [13]:
# Relabel the Snowflake frame's columns with the report's column labels, by position.
# NOTE(review): this relies on both frames having the same columns in the same
# order after the cleaning cells — confirm, since a reordering would mislabel
# columns silently rather than fail.
snowflake_pd_df.columns = report_df.columns

In [14]:
# Outer-join on every shared column; the `_merge` indicator records whether a
# row came from the left frame (Snowflake), the right frame (report) or both.
merged = snowflake_pd_df.merge(report_df, how='outer', indicator=True)
origin = merged['_merge']
snowflake_only = merged.loc[origin.eq("left_only")]
common = merged.loc[origin.eq("both")]
report_only = merged.loc[origin.eq("right_only")]

In [15]:
def calculate_df2_accuracy(df1, df2):
    """Return the percentage of rows in ``df2`` that also appear in ``df1``.

    Rows are matched with an outer merge on all columns the two frames share,
    so a row counts as present only when every shared column is equal.

    Args:
        df1: Reference DataFrame that ``df2`` is checked against.
        df2: DataFrame whose rows are looked up in ``df1``.

    Returns:
        A float in ``[0, 100]``: the share of ``df2`` rows found in ``df1``.
        Returns NaN when ``df2`` has no rows, because the ratio is undefined
        (previously this raised ``ZeroDivisionError``).
    """
    total_elements = len(df2)
    if total_elements == 0:
        # Nothing to score; report "undefined" rather than dividing by zero.
        return float("nan")

    merged = df1.merge(df2, how='outer', indicator=True)

    # "right_only" marks rows that exist in df2 but have no match in df1.
    df2_only_count = int((merged['_merge'] == "right_only").sum())

    # Calculate accuracy as a percentage
    return ((total_elements - df2_only_count) / total_elements) * 100


In [16]:
# Share of Snowflake rows that also exist in the report, plus the count of
# rows found only in Snowflake.
accuracy_percentage = calculate_df2_accuracy(report_df, snowflake_pd_df)
print(
    f"Accuracy: {accuracy_percentage:.2f}% snowflake records in report\n"
    "The number of records in snowflake but not in report is",
    len(snowflake_only),
)

Accuracy: 98.73% snowflake records in report
The number of records in snowflake but not in report is 1


In [17]:
# Share of report rows that also exist in Snowflake, plus the count of rows
# found only in the report.
accuracy_percentage = calculate_df2_accuracy(snowflake_pd_df, report_df)
print(
    f"Accuracy: {accuracy_percentage:.2f}% report records in snowflake\n"
    "The number of records in report but not in snowflake is",
    len(report_only),
)

Accuracy: 98.73% report records in snowflake
The number of records in report but not in snowflake is 1


# Output the difference files

In [18]:
# Write each set of unmatched rows to its own CSV.
# DataFrame.to_csv does not raise for an empty frame, so the previous bare
# `except:` never detected "no records"; it only hid real failures (such as an
# unwritable path) behind that message. Emptiness is now checked explicitly and
# write errors are reported as what they are. The file is still written when
# the frame is empty, so it always reflects the current run.
# NOTE(review): the report was read from 'AIF_REPORT/' but these files go to
# 'AIF_Report/'; on a case-sensitive filesystem those are different
# directories — confirm which one is intended.
for label, frame, path in (
    ("snowflake", snowflake_only, "AIF_Report/snowflake_only_record.csv"),
    ("report", report_only, "AIF_Report/report_only_record.csv"),
):
    if frame.empty:
        print(f"No {label} only records")
    try:
        frame.to_csv(path, index=False)
    except OSError as exc:
        print(f"Could not write {label} only records to {path}: {exc}")