In [2]:
from snowflake.snowpark.session import Session
from snowflake.snowpark.types import *
from snowflake.snowpark.functions import col, lit
import getpass
import pandas as pd
import json
from get_secret import get_secret

In [3]:
# Parse the JSON text returned by get_secret() into the connection settings,
# then pin the warehouse / database / schema / log level used by this notebook.
connection_parameters = json.loads(get_secret())
connection_parameters.update(
    warehouse='COMPUTE_WH',
    database='ANALYTICS_PROD',
    schema='IOATAWARE',
    loglevel='DEBUG',
)
session = Session.builder.configs(connection_parameters).create()

In [5]:
# Echo the session context so a wrong database, schema or warehouse is spotted early.
current_schema = session.get_fully_qualified_current_schema()
print(f"Current Database and schema: {current_schema}")
current_warehouse = session.get_current_warehouse()
print(f"Current Warehouse: {current_warehouse}")

Current Database and schema: "ANALYTICS_PROD"."IOATAWARE"
Current Warehouse: "COMPUTE_WH"


# Import and clean Snowflake views

In [8]:
# Name of the Snowflake view that is loaded with session.table().
view_name = "VW_AIF_REPORT_DETAIL"
# Value compared against the view's 'Flight Date' column when filtering.
flight_date = "2023/06/03"
# Value compared against the view's "Flight ID" column when filtering.
flight_id = 111554

In [401]:
# Filter inside Snowflake first, then pull only the matching rows into pandas.
snowpark_df = session.table(view_name)
on_flight_date = snowpark_df['Flight Date'] == flight_date
is_target_flight = snowpark_df["Flight ID"] == flight_id
snowflake_pd_df = snowpark_df.filter(on_flight_date & is_target_flight).to_pandas()

In [402]:
# Join the date and time strings into one timestamp, label it as UTC, convert it
# to US/Mountain, and replace 'Flight Date' with the Mountain-time calendar date.
date_and_time = snowflake_pd_df['Flight Date'] + ' ' + snowflake_pd_df['Flight time']
snowflake_pd_df['combined_timestamp'] = pd.to_datetime(date_and_time)
mountain_time = snowflake_pd_df['combined_timestamp'].dt.tz_localize('UTC').dt.tz_convert('US/Mountain')
snowflake_pd_df['combined_timestamp_mt'] = mountain_time
snowflake_pd_df['Flight Date'] = mountain_time.dt.date

In [None]:
# This cell repeats the conversion done in the previous cell. Once that cell has
# run, 'Flight Date' holds datetime.date objects, so concatenating it with a
# string again would raise TypeError and break "Restart & Run All". Only convert
# when the Mountain-time column has not been created yet.
if 'combined_timestamp_mt' not in snowflake_pd_df.columns:
    snowflake_pd_df['combined_timestamp'] = pd.to_datetime(snowflake_pd_df['Flight Date'] + ' ' + snowflake_pd_df['Flight time'])
    snowflake_pd_df['combined_timestamp_mt'] = snowflake_pd_df['combined_timestamp'].dt.tz_localize('UTC').dt.tz_convert('US/Mountain')
    snowflake_pd_df['Flight Date'] = snowflake_pd_df['combined_timestamp_mt'].dt.date

In [404]:
# Inspect the column dtypes of the Snowflake extract, including the new timestamp columns.
snowflake_pd_df.dtypes

Flight Date                                   object
Flight time                                   object
Flight ID                                      int32
Departure                                     object
Arrival                                       object
Reservation Number                             int32
Leg Number                                      int8
PNR                                           object
Check In Boarded                              object
Status                                        object
First Name                                    object
Last Name                                     object
Charge Type                                   object
Net Charge                                    object
Leg Start                                     object
Leg End                                       object
Charge Date                                   object
combined_timestamp                    datetime64[ns]
combined_timestamp_mt    datetime64[ns, US/Mou

In [405]:
# Remove the columns we are not testing for now

In [406]:
# Columns excluded from the comparison, plus the helper timestamp columns.
columns_to_remove = [
    'First Name',
    'Flight ID',
    'Last Name',
    'Charge Date',
    'combined_timestamp',
    'combined_timestamp_mt',
    'Flight time',
]
# Drop them and cast the numeric columns in one pass.
snowflake_pd_df = (
    snowflake_pd_df
    .drop(columns=columns_to_remove)
    .astype({'Reservation Number': int, 'Leg Number': int, 'Net Charge': float})
)
# 'Flight Date' holds date objects at this point; turn it into datetime64.
snowflake_pd_df['Flight Date'] = pd.to_datetime(snowflake_pd_df['Flight Date'])

In [407]:
# Preview the Snowflake frame after dropping and casting columns.
snowflake_pd_df

Unnamed: 0,Flight Date,Departure,Arrival,Reservation Number,Leg Number,PNR,Check In Boarded,Status,Charge Type,Net Charge,Leg Start,Leg End
0,2023-06-03,YYC,YYZ,5024160,1,T7PXE2,B,C,Airport Improvement Fee,35.0,YYC,YYZ
1,2023-06-03,YYC,YYZ,7291116,2,ZJZQ3N,B,C,Airport Improvement Fee,35.0,YYC,YYZ
2,2023-06-03,YYC,YYZ,7346621,2,CATTCF,B,C,Airport Improvement Fee,35.0,YYC,YYZ
3,2023-06-03,YYC,YYZ,7659990,1,QW3FAF,B,C,Airport Improvement Fee,35.0,YYC,YYZ
4,2023-06-03,YYC,YYZ,4673871,2,CFD22S,B,C,Airport Improvement Fee,35.0,YYC,YYZ
...,...,...,...,...,...,...,...,...,...,...,...,...
150,2023-06-03,YYC,YYZ,7493237,1,ZUKAQ4,B,C,Airport Improvement Fee,35.0,YYC,YYZ
151,2023-06-03,YYC,YYZ,7291116,2,ZJZQ3N,B,C,Airport Improvement Fee,35.0,YYC,YYZ
152,2023-06-03,YYC,YYZ,7284065,1,FXP4K5,B,C,Airport Improvement Fee,35.0,YYC,YYZ
153,2023-06-03,YYC,YYZ,5477840,1,4NZUKN,B,C,Airport Improvement Fee,35.0,YYC,YYZ


In [408]:
# Confirm the dtypes of the remaining columns after the casts.
snowflake_pd_df.dtypes

Flight Date           datetime64[ns]
Departure                     object
Arrival                       object
Reservation Number             int32
Leg Number                     int32
PNR                           object
Check In Boarded              object
Status                        object
Charge Type                   object
Net Charge                   float64
Leg Start                     object
Leg End                       object
dtype: object

# Import and clean report data

In [409]:
# Load the exported report; the first 3 lines of the file are skipped before the header row.
# NOTE(review): the file name embeds 111554, the same value as flight_id — presumably
# the two must be kept in sync by hand; confirm the naming scheme.
report_df = pd.read_csv('AIF_Report_Detail_111554.csv', skiprows=3)   

In [410]:
# List the report's column names.
report_df.columns 

Index(['FlightDate1', 'Departure1', 'Arrival1', 'lng_Reservation_Nmbr',
       'lng_Leg_Nmbr', 'Reference', 'str_Res_CheckIn_Boarded',
       'str_Res_Status', 'str_First_Name', 'str_Last_Name',
       'str_GL_Charge_Type_Desc', 'NetCharge', 'LegDeparture', 'LegArrival',
       'ChargeDate'],
      dtype='object')

In [411]:
# Check the dtypes pandas inferred for the report CSV.
report_df.dtypes

FlightDate1                object
Departure1                 object
Arrival1                   object
lng_Reservation_Nmbr        int64
lng_Leg_Nmbr                int64
Reference                  object
str_Res_CheckIn_Boarded    object
str_Res_Status             object
str_First_Name             object
str_Last_Name              object
str_GL_Charge_Type_Desc    object
NetCharge                  object
LegDeparture               object
LegArrival                 object
ChargeDate                 object
dtype: object

In [412]:
# Preview the raw report rows.
# NOTE(review): the rendered output includes passenger first/last names — clear it before sharing.
report_df

Unnamed: 0,FlightDate1,Departure1,Arrival1,lng_Reservation_Nmbr,lng_Leg_Nmbr,Reference,str_Res_CheckIn_Boarded,str_Res_Status,str_First_Name,str_Last_Name,str_GL_Charge_Type_Desc,NetCharge,LegDeparture,LegArrival,ChargeDate
0,2023/06/03,YYC,YYZ,5166995,2,XVXP78,B,C,MATT GREGORY,VISITACION,Airport Improvement Fee,$35.00,YYC,YYZ,11/29/2022
1,2023/06/03,YYC,YYZ,5401723,1,2SDFYX,B,C,CHUNGYOUNG,LIM,Airport Improvement Fee,$35.00,YYC,YYZ,12/29/2022
2,2023/06/03,YYC,YYZ,5397156,2,3TKS25,B,C,UTKARSH,SHARMA,Airport Improvement Fee,$35.00,YYC,YYZ,12/28/2022
3,2023/06/03,YYC,YYZ,5401723,1,2SDFYX,B,C,SEUNGYEOB,LIM,Airport Improvement Fee,$35.00,YYC,YYZ,12/29/2022
4,2023/06/03,YYC,YYZ,5401723,1,2SDFYX,B,C,HEESOOK,KIM,Airport Improvement Fee,$35.00,YYC,YYZ,12/29/2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,2023/06/03,YYC,YYZ,7550127,2,TNPX9H,B,C,YAN KWONG IAN,CHAN,Airport Improvement Fee,$35.00,YYC,YYZ,5/16/2023
151,2023/06/03,YYC,YYZ,7550127,2,TNPX9H,B,C,SIU HONG,SUEN,Airport Improvement Fee,$35.00,YYC,YYZ,5/16/2023
152,2023/06/03,YYC,YYZ,7563605,2,GC6G88,B,C,MANOJ,CHATTERJEE,Airport Improvement Fee,$35.00,YYC,YYZ,5/17/2023
153,2023/06/03,YYC,YYZ,7563605,2,GC6G88,B,C,SHYAMALI,CHATTERJEE,Airport Improvement Fee,$35.00,YYC,YYZ,5/17/2023


In [413]:
# Drop the columns that are excluded from the comparison.
columns_to_remove = ['str_First_Name', 'str_Last_Name','ChargeDate']
report_df = report_df.drop(columns=columns_to_remove)

report_df['FlightDate1'] = pd.to_datetime(report_df['FlightDate1'])
report_df['lng_Reservation_Nmbr'] = report_df['lng_Reservation_Nmbr'].astype(int)
report_df['lng_Leg_Nmbr'] = report_df['lng_Leg_Nmbr'].astype(int)

# Turn currency text such as "$35.00", "$1,035.00" or "($35.00)" into a float.
# regex=False is passed explicitly: '$', '(' and ')' are regex metacharacters and
# the default of `regex` differs between pandas versions, so literal matching
# must not depend on the installed version.
net_charge = report_df['NetCharge']
for symbol, replacement in (
    ('$', ''),   # currency sign
    (',', ''),   # thousands separator
    (')', ''),   # accounting-style negative: "(35.00)" -> "-35.00"
    ('(', '-'),
):
    net_charge = net_charge.str.replace(symbol, replacement, regex=False)
report_df['NetCharge'] = net_charge.astype(float)

# Compare two dataframes

In [414]:
# Collapse exact repeats of Snowflake rows, keeping the first occurrence.
snowflake_pd_df = snowflake_pd_df.drop_duplicates(keep='first')

In [415]:
# Collapse exact repeats of report rows, keeping the first occurrence.
report_df = report_df.drop_duplicates(keep='first')

In [416]:
# Give the Snowflake frame the report's column names so the merge can match on every column.
# The assignment is positional: it relies on both frames having the same number of
# columns in the same order.
snowflake_pd_df.columns = report_df.columns

In [417]:
# Full outer join on all shared columns; the `_merge` indicator records which
# side(s) each row came from, and the result is split on that indicator.
merged = snowflake_pd_df.merge(report_df, how='outer', indicator=True)
row_origin = merged['_merge']
snowflake_only = merged.loc[row_origin == "left_only"]
common = merged.loc[row_origin == "both"]
report_only = merged.loc[row_origin == "right_only"]

In [418]:
# Preview the rows found in both the Snowflake view and the report.
common

Unnamed: 0,FlightDate1,Departure1,Arrival1,lng_Reservation_Nmbr,lng_Leg_Nmbr,Reference,str_Res_CheckIn_Boarded,str_Res_Status,str_GL_Charge_Type_Desc,NetCharge,LegDeparture,LegArrival,_merge
0,2023-06-03,YYC,YYZ,5024160,1,T7PXE2,B,C,Airport Improvement Fee,35.0,YYC,YYZ,both
1,2023-06-03,YYC,YYZ,7291116,2,ZJZQ3N,B,C,Airport Improvement Fee,35.0,YYC,YYZ,both
2,2023-06-03,YYC,YYZ,7346621,2,CATTCF,B,C,Airport Improvement Fee,35.0,YYC,YYZ,both
3,2023-06-03,YYC,YYZ,7659990,1,QW3FAF,B,C,Airport Improvement Fee,35.0,YYC,YYZ,both
4,2023-06-03,YYC,YYZ,4673871,2,CFD22S,B,C,Airport Improvement Fee,35.0,YYC,YYZ,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,2023-06-03,YYC,YYZ,7166918,2,ASBVKN,B,C,Airport Improvement Fee,35.0,YYC,YYZ,both
88,2023-06-03,YYC,YYZ,7294824,2,T4DMNZ,B,C,Airport Improvement Fee,35.0,YYC,YYZ,both
89,2023-06-03,YYC,YYZ,5477840,1,4NZUKN,B,C,Airport Improvement Fee,35.0,YYC,YYZ,both
90,2023-06-03,YYC,YYZ,7155627,1,XFNGQS,B,C,Airport Improvement Fee,35.0,YYC,YYZ,both


In [419]:
def calculate_accuracy(df1, df2):
    """Return the percentage of rows in ``df2`` that also appear in ``df1``.

    Rows are matched on every column the two frames have in common.

    Args:
        df1: Frame being checked (here, the Snowflake extract).
        df2: Reference frame (here, the report); its row count is the denominator.

    Returns:
        A float between 0 and 100, or NaN when ``df2`` has no rows, since the
        ratio is undefined in that case.
    """
    total_elements = len(df2)
    if total_elements == 0:
        # Nothing to compare against; avoid a ZeroDivisionError.
        return float("nan")

    # Keep one copy of each distinct key combination from df1. Without this, a
    # repeated df1 row would fan out in the join and be counted several times,
    # which could push the result above 100%.
    shared_columns = [name for name in df2.columns if name in df1.columns]
    lookup = df1[shared_columns].drop_duplicates()

    # Left join from df2 so that each df2 row is counted exactly once.
    merged = df2.merge(lookup, how='left', indicator=True)
    matched = merged[merged['_merge'] == "both"]

    # Calculate accuracy as a percentage
    return (len(matched) / total_elements) * 100


In [420]:
# Share of report rows that were also found in the Snowflake extract.
accuracy_percentage = calculate_accuracy(df1=snowflake_pd_df, df2=report_df)
print(f"Accuracy: {accuracy_percentage:.2f}%")

Accuracy: 100.00%


# Output the difference files

In [422]:
# Write each side's unmatched rows to its own CSV file.
# The "No ... records" message is printed only when the frame really is empty.
# I/O errors are no longer swallowed by a bare `except`, which used to report
# "no records" even when the write itself had failed.
for unmatched, csv_path, empty_message in (
    (snowflake_only, "snowflake_only_record.csv", "No snowflake only records"),
    (report_only, "report_only_record.csv", "No report only records"),
):
    if unmatched.empty:
        print(empty_message)
    # Always write, even when empty, so a stale file from an earlier run is replaced.
    unmatched.to_csv(csv_path, index=False)

No snowflake only records
No report only records
