In [729]:
from snowflake.snowpark.session import Session
from snowflake.snowpark.types import *
from snowflake.snowpark.functions import col, lit
import getpass
import pandas as pd
import json
from get_secret import get_secret
from datetime import datetime
import snowflake.snowpark.functions as F

In [730]:
connection_parameters = json.loads(get_secret())
connection_parameters.update({'warehouse':'COMPUTE_WH', 'database':'ANALYTICS_PROD','schema': 'IOATAWARE',"loglevel":'DEBUG'})
session = Session.builder.configs(connection_parameters).create()

In [731]:
print(f"Current Database and schema: {session.get_fully_qualified_current_schema()}")
print(f"Current Warehouse: {session.get_current_warehouse()}")

Current Database and schema: "ANALYTICS_PROD"."IOATAWARE"
Current Warehouse: "COMPUTE_WH"


# Import and clean snowflake views

In [732]:
view_name = "VW_REVENUE_RAW_DATA"
flight_date1 = "07/01/2023"
flight_date2 = "07/02/2023"

In [733]:
snowpark_df = session.table(view_name)

In [744]:
snowflake_pd_df1 = snowpark_df.filter(F.col('Flight Date') == flight_date1 ).to_pandas()
snowflake_pd_df2 = snowpark_df.filter(F.col('Flight Date') == flight_date2 ).to_pandas() 

In [791]:
snowflake_pd_df = pd.concat([snowflake_pd_df1,snowflake_pd_df2]).reset_index()

In [792]:
snowflake_pd_df['combined_timestamp'] = pd.to_datetime(snowflake_pd_df['Flight Date'] + ' ' + snowflake_pd_df['Flight time'])
snowflake_pd_df['combined_timestamp_mt'] = snowflake_pd_df['combined_timestamp'].dt.tz_localize('UTC').dt.tz_convert('US/Mountain')
snowflake_pd_df['Flight Date'] = snowflake_pd_df['combined_timestamp_mt'].dt.date

#snowflake_pd_df['Charge Date'] = pd.to_datetime(snowflake_pd_df['Charge Date']).dt.tz_localize('UTC').dt.tz_convert('US/Mountain')

In [793]:
snowflake_pd_df.columns

Index(['index', 'Flight Date', 'Flight time', 'Reservation Nmbr',
       'Sked Detail Id Nmbr', 'Charge Date', 'Departure', 'Arrival',
       'Legs Id Nmbr', 'Charge Type', 'Net Charge', 'Taxes', 'Total Charge',
       'Charge Type Desc', 'Flight Nmbr', '\tPercent of Full Leg',
       'Charges Desc', 'User Id Nmbr', 'Leg Nmbr', 'Segments Id Nmbr',
       'Agency Id Nmbr', 'Reference', 'Currency Ident', 'Agency Name',
       'Sales Username', 'Transborder', 'Leg Status', 'Category',
       'Ancillary Category', 'Purchase Cnt', 'Classification', 'Channel',
       'combined_timestamp', 'combined_timestamp_mt'],
      dtype='object')

In [794]:
#snowflake_pd_df.dtypes

In [795]:
#Remove the column we are not testing for now

In [796]:
columns_to_remove = ['index','Charge Date','\tPercent of Full Leg','Flight time','combined_timestamp','combined_timestamp_mt','Leg Status']
snowflake_pd_df = snowflake_pd_df.drop(columns=columns_to_remove)

snowflake_pd_df['Flight Date'] = pd.to_datetime(snowflake_pd_df['Flight Date'])
snowflake_pd_df['Net Charge'] = snowflake_pd_df['Net Charge'].astype(float)
snowflake_pd_df['Taxes'] = snowflake_pd_df['Taxes'].astype(float)
snowflake_pd_df['Total Charge'] = snowflake_pd_df['Total Charge'].astype(float)
snowflake_pd_df['Flight Nmbr'] = snowflake_pd_df['Flight Nmbr'].astype(int)

In [811]:
snowflake_pd_df = snowflake_pd_df[snowflake_pd_df['Flight Date'] == flight_date1]

# Import and clean report data

In [797]:
report_df = pd.read_csv('Revenue/Revenue_Raw_Data_0701.csv', skiprows=0)   

In [798]:
report_df.columns 

Index(['Source', 'FlightDate1', 'ActualFlightDate', 'lng_Reservation_Nmbr',
       'lng_Sked_Detail_Id_Nmbr', 'ChargeDate', 'Departure', 'Arrival',
       'lng_Res_Legs_Id_Nmbr', 'ChargeType', 'NetCharge', 'Taxes',
       'mny_Tax_1_Percentage', 'TotalCharge', 'str_GL_Charge_Type_Desc',
       'str_Flight_Nmbr', 'Percent', 'str_GL_Charges_Desc',
       'lng_Creation_User_Id_Nmbr', 'lng_Leg_Nmbr', 'lng_Res_Segments_Id_Nmbr',
       'lng_Agency_Id_Nmbr', 'Reference', 'str_Currency_Ident', 'Base_Charge',
       'Base_Taxes', 'mny_Exchange_Rate', 'TotalCharge1', 'str_Agency_Name',
       'LastMod_SalesUser', 'Transborder', 'str_Leg_Status', 'Category',
       'AncillaryCategory', 'PurchaseCnt', 'PositiveChargePAX',
       'Classification', 'Channel'],
      dtype='object')

In [799]:
columns_to_remove = ['str_Leg_Status','Source','ActualFlightDate','mny_Tax_1_Percentage','Percent', 'ChargeDate', 'Base_Charge','Base_Taxes','TotalCharge1','mny_Exchange_Rate','PositiveChargePAX']
report_df = report_df.drop(columns=columns_to_remove)

report_df['NetCharge'] = report_df['NetCharge'].str.replace('$', '')
report_df['NetCharge'] = report_df['NetCharge'].str.replace(')', '')
report_df['NetCharge'] = report_df['NetCharge'].str.replace('(', '-')
report_df['NetCharge'] = report_df['NetCharge'].astype(float)

report_df['Taxes'] = report_df['Taxes'].str.replace('$', '')
report_df['Taxes'] = report_df['Taxes'].str.replace(')', '')
report_df['Taxes'] = report_df['Taxes'].str.replace('(', '-')
report_df['Taxes'] = report_df['Taxes'].astype(float)

report_df['TotalCharge'] = report_df['TotalCharge'].str.replace('$', '')
report_df['TotalCharge'] = report_df['TotalCharge'].str.replace(')', '')
report_df['TotalCharge'] = report_df['TotalCharge'].str.replace('(', '-')
report_df['TotalCharge'] = report_df['TotalCharge'].astype(float)

#report_df['Percent'] = report_df['Percent'].str.replace('%', '')
#report_df['mny_Exchange_Rate'] = report_df['mny_Exchange_Rate'].astype(float)
report_df['FlightDate1'] = pd.to_datetime(report_df['FlightDate1'])


In [800]:
report_df

Unnamed: 0,FlightDate1,lng_Reservation_Nmbr,lng_Sked_Detail_Id_Nmbr,Departure,Arrival,lng_Res_Legs_Id_Nmbr,ChargeType,NetCharge,Taxes,TotalCharge,...,Reference,str_Currency_Ident,str_Agency_Name,LastMod_SalesUser,Transborder,Category,AncillaryCategory,PurchaseCnt,Classification,Channel
0,2023-07-01,5452981,152583,YEG,YVR,16441187,1,35.00,1.75,36.75,...,DFZS4K,CAD,Air Black Box,BaseCC-PH - Jacon Olinares,Domestic,FlowThru,Airport Improvement Fee,0,Mid Stage,Direct
1,2023-07-01,5452981,152583,YEG,YVR,16441187,1,-35.00,-1.75,-36.75,...,DFZS4K,CAD,Air Black Box,BaseCC-PH - Jacon Olinares,Domestic,FlowThru,Airport Improvement Fee,0,Mid Stage,Direct
2,2023-07-01,5452981,152583,YEG,YVR,16441187,1,35.00,1.75,36.75,...,DFZS4K,CAD,Air Black Box,BaseCC-PH - Jacon Olinares,Domestic,FlowThru,Airport Improvement Fee,0,Mid Stage,Direct
3,2023-07-01,5452981,152583,YEG,YVR,16441187,4,52.16,2.61,54.77,...,DFZS4K,CAD,Air Black Box,BaseCC-PH - Jacon Olinares,Domestic,Base,Fare,0,Mid Stage,Direct
4,2023-07-01,5452981,152583,YEG,YVR,16441187,4,-52.16,-2.61,-54.77,...,DFZS4K,CAD,Air Black Box,BaseCC-PH - Jacon Olinares,Domestic,Base,Fare,0,Mid Stage,Direct
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85783,2023-07-01,8075512,118171,YEG,YOW,16558543,4,337.89,16.89,354.78,...,H6WPDD,CAD,Air Black Box,ABBAPI,Domestic,Base,Fare,0,Long Haul,Direct
85784,2023-07-01,8075512,118171,YEG,YOW,16558543,5,7.12,0.36,7.48,...,H6WPDD,CAD,Air Black Box,ABBAPI,Domestic,FlowThru,Air Traveller Security Charge,0,Long Haul,Direct
85785,2023-07-01,8075512,118171,YEG,YOW,16558543,1001,30.00,1.50,31.50,...,H6WPDD,CAD,Air Black Box,ABBAPI,Domestic,Ancillary,Seat Assignment,1,Long Haul,Direct
85786,2023-07-01,8075512,118171,YEG,YOW,16558543,1002,15.00,0.75,15.75,...,H6WPDD,CAD,Air Black Box,ABBAPI,Domestic,Ancillary,ACF - Prepaid,1,Long Haul,Direct


# Compare two dataframes

In [812]:
set_snowflake = set(map(tuple, snowflake_pd_df.to_numpy()))
set_report = set(map(tuple, report_df.to_numpy()))
common_rows = set_snowflake.intersection(set_report)

In [813]:
in_both_df = snowflake_pd_df[snowflake_pd_df.apply(tuple, axis=1).isin(common_rows)]
only_in_snowflake_df = snowflake_pd_df[~snowflake_pd_df.apply(tuple, axis=1).isin(common_rows)]
only_in_report_df = report_df[~report_df.apply(tuple, axis=1).isin(common_rows)]


In [814]:
in_both_df

Unnamed: 0,Flight Date,Reservation Nmbr,Sked Detail Id Nmbr,Departure,Arrival,Legs Id Nmbr,Charge Type,Net Charge,Taxes,Total Charge,...,Reference,Currency Ident,Agency Name,Sales Username,Transborder,Category,Ancillary Category,Purchase Cnt,Classification,Channel
0,2023-07-01,7807970,113128,YWG,YYZ,16036896,1002,79.00,3.95,82.95,...,BNAJ2S,CAD,Air Black Box,ABBAPI,Domestic,Ancillary,Basic Bundle,1,Mid Stage,Direct
1,2023-07-01,7807970,113128,YWG,YYZ,16036896,1,-38.00,-1.90,-39.90,...,BNAJ2S,CAD,Air Black Box,ABBAPI,Domestic,FlowThru,Airport Improvement Fee,0,Mid Stage,Direct
2,2023-07-01,8044666,152583,YEG,YVR,16509587,4,152.16,7.61,159.77,...,MD3JAH,CAD,Air Black Box,ABBAPI,Domestic,Base,Fare,0,Mid Stage,Direct
3,2023-07-01,7910855,115566,YSJ,YYZ,16250474,5,7.12,0.93,8.05,...,V2EM7M,CAD,Air Black Box,ABBAPI,Domestic,FlowThru,Air Traveller Security Charge,0,Mid Stage,Direct
6,2023-07-01,7465553,147440,YHZ,YUL,15368484,5,7.12,1.07,8.19,...,UGY66H,CAD,API - VoyageALaCarte,FLIGHTHUBAPI,Domestic,FlowThru,Air Traveller Security Charge,0,Mid Stage,InDirect
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
334210,2023-07-01,7883502,147499,YHZ,YYZ,16193728,5,7.12,0.93,8.05,...,P54UN4,CAD,API - Expedia,EXPEDIA-F8,Domestic,FlowThru,Air Traveller Security Charge,0,Mid Stage,InDirect
334219,2023-07-01,7569853,151905,YYC,YVR,15579561,1,35.00,1.75,36.75,...,WQRCP8,CAD,Air Black Box,ABBAPI,Domestic,FlowThru,Airport Improvement Fee,0,Short Haul,Direct
334222,2023-07-01,7036877,111787,YYZ,YYC,14494063,5,7.12,0.93,8.05,...,FC5ZHG,CAD,Air Black Box,ABBAPI,Domestic,FlowThru,Air Traveller Security Charge,0,Long Haul,Direct
334223,2023-07-01,5641905,124620,PVR,YVR,11488929,4,193.61,7.74,201.35,...,Z585Q9,CAD,Air Black Box,ABBAPI,International,Base,Fare,0,Sun,Direct


# Accuracy and output the difference file

In [820]:
print("accuracy is: ",(len(in_both_df)/len(report_df))* 100)

accuracy is:  85.75791485988717


In [823]:
try:
    only_in_snowflake_df.to_csv("snowflake_only_record.csv", index=False) 
except:
    print("No snowflake only records")
try:
    only_in_report_df.to_csv("report_only_record.csv", index=False) 
except:
    print("No report only records")

In [None]:
def calculate_accuracy(df1, df2):
    merged = df1.merge(df2, how = 'outer', indicator = True)
    matched = merged[merged['_merge'] == "both"]

    # Calculate the total number of elements
    total_elements = len(df2)

    # Calculate accuracy as a percentage
    accuracy = (len(matched) / total_elements) * 100

    return accuracy


In [821]:
#report_df.columns = snowflake_pd_df.columns 

In [822]:
#report_df = report_df.astype(snowflake_pd_df.dtypes)

In [665]:
merged = snowflake_pd_df.merge(report_df, how = 'outer', indicator = True)
snowflake_only = merged[merged['_merge'] == "left_only"]
common = merged[merged['_merge'] == "both"]
report_only = merged[merged['_merge'] == "right_only"]

In [666]:
common

Unnamed: 0,Flight Date,Reservation Nmbr,Sked Detail Id Nmbr,Departure,Arrival,Legs Id Nmbr,Charge Type,Net Charge,Taxes,Total Charge,...,Currency Ident,Agency Name,Sales Username,Transborder,Category,Ancillary Category,Purchase Cnt,Classification,Channel,_merge


In [542]:
#str(test1["Percent of Full Leg"]) == str(test2["Percent of Full Leg"])

In [643]:
def calculate_accuracy(df1, df2):
    merged = df1.merge(df2, how = 'outer', indicator = True)
    matched = merged[merged['_merge'] == "both"]

    # Calculate the total number of elements
    total_elements = len(df2)

    # Calculate accuracy as a percentage
    accuracy = (len(matched) / total_elements) * 100

    return accuracy


In [644]:
accuracy_percentage = calculate_accuracy(snowflake_pd_df, report_df)
print(f"Accuracy: {accuracy_percentage:.2f}%")

Accuracy: 67.77%


# Output the difference file

In [545]:
try:
    snowflake_only.to_csv("snowflake_only_record.csv", index=False) 
except:
    print("No snowflake only records")
try:
    report_only.to_csv("report_only_record.csv", index=False) 
except:
    print("No report only records")