In [448]:
from snowflake.snowpark.session import Session
from snowflake.snowpark.types import *
from snowflake.snowpark.functions import col, lit
import getpass
import pandas as pd
import json
from get_secret import get_secret

In [449]:
connection_parameters = json.loads(get_secret())
connection_parameters.update({'warehouse':'COMPUTE_WH', 'database':'ANALYTICS_PROD','schema': 'IOATAWARE',"loglevel":'DEBUG'})
session = Session.builder.configs(connection_parameters).create()

In [450]:
print(f"Current Database and schema: {session.get_fully_qualified_current_schema()}")
print(f"Current Warehouse: {session.get_current_warehouse()}")

Current Database and schema: "ANALYTICS_PROD"."IOATAWARE"
Current Warehouse: "COMPUTE_WH"


# Import and clean snowflake views

In [457]:
view_name = "VW_REVENUE_RAW_DATA"
flight_date = "07/01/2023"
#flight_id = 111554

In [458]:
snowpark_df = session.table(view_name)

In [459]:
#snowpark_df = session.table(view_name)
import snowflake.snowpark.functions as F
snowflake_pd_df = snowpark_df.filter(F.col('Flight Date')  == flight_date ).to_pandas()

In [460]:
#snowflake_pd_df

In [461]:
snowflake_pd_df['combined_timestamp'] = pd.to_datetime(snowflake_pd_df['Flight Date'] + ' ' + snowflake_pd_df['Flight time'])
snowflake_pd_df['combined_timestamp_mt'] = snowflake_pd_df['combined_timestamp'].dt.tz_localize('UTC').dt.tz_convert('US/Mountain')
snowflake_pd_df['Flight Date'] = snowflake_pd_df['combined_timestamp_mt'].dt.date

In [471]:
snowflake_pd_df.columns

Index(['Flight Date', 'Reservation Nmbr', 'Sked Detail Id Nmbr', 'Departure',
       'Arrival', 'Legs Id Nmbr', 'Charge Type', 'Net Charge', 'Taxes',
       'Total Charge', 'Charge Type Desc', 'Flight Nmbr', 'Charges Desc',
       'User Id Nmbr', 'Leg Nmbr', 'Segments Id Nmbr', 'Agency Id Nmbr',
       'Reference', 'Currency Ident', 'Agency Name', 'Transborder',
       'Leg Status', 'Category', 'Sales Username', 'Ancillary Category',
       'Purchase Cnt', 'Classification', 'Channel'],
      dtype='object')

In [432]:
#snowflake_pd_df.dtypes

In [433]:
#Remove the column we are not testing for now

In [463]:
columns_to_remove = ['Charge Date','\tPercent of Full Leg','Flight time','combined_timestamp','combined_timestamp_mt']
snowflake_pd_df = snowflake_pd_df.drop(columns=columns_to_remove)

snowflake_pd_df['Flight Date'] = pd.to_datetime(snowflake_pd_df['Flight Date'])
snowflake_pd_df['Net Charge'] = snowflake_pd_df['Net Charge'].astype(float)
snowflake_pd_df['Taxes'] = snowflake_pd_df['Taxes'].astype(float)
snowflake_pd_df['Total Charge'] = snowflake_pd_df['Total Charge'].astype(float)
snowflake_pd_df['Flight Nmbr'] = snowflake_pd_df['Flight Nmbr'].astype(int)

# Import and clean report data

In [464]:
report_df = pd.read_csv('Revenue/Revenue_Raw_Data_0701.csv', skiprows=0)   

In [465]:
report_df.columns 

Index(['Source', 'FlightDate1', 'ActualFlightDate', 'lng_Reservation_Nmbr',
       'lng_Sked_Detail_Id_Nmbr', 'ChargeDate', 'Departure', 'Arrival',
       'lng_Res_Legs_Id_Nmbr', 'ChargeType', 'NetCharge', 'Taxes',
       'mny_Tax_1_Percentage', 'TotalCharge', 'str_GL_Charge_Type_Desc',
       'str_Flight_Nmbr', 'Percent', 'str_GL_Charges_Desc',
       'lng_Creation_User_Id_Nmbr', 'lng_Leg_Nmbr', 'lng_Res_Segments_Id_Nmbr',
       'lng_Agency_Id_Nmbr', 'Reference', 'str_Currency_Ident', 'Base_Charge',
       'Base_Taxes', 'mny_Exchange_Rate', 'TotalCharge1', 'str_Agency_Name',
       'LastMod_SalesUser', 'Transborder', 'str_Leg_Status', 'Category',
       'AncillaryCategory', 'PurchaseCnt', 'PositiveChargePAX',
       'Classification', 'Channel'],
      dtype='object')

In [466]:
report_df.head()

Unnamed: 0,Source,FlightDate1,ActualFlightDate,lng_Reservation_Nmbr,lng_Sked_Detail_Id_Nmbr,ChargeDate,Departure,Arrival,lng_Res_Legs_Id_Nmbr,ChargeType,...,str_Agency_Name,LastMod_SalesUser,Transborder,str_Leg_Status,Category,AncillaryCategory,PurchaseCnt,PositiveChargePAX,Classification,Channel
0,Amelia,7/1/2023,7/1/2023,5452981,152583,6/25/2023 1:31 PM,YEG,YVR,16441187,1,...,Air Black Box,BaseCC-PH - Jacon Olinares,Domestic,C,FlowThru,Airport Improvement Fee,0,18140874,Mid Stage,Direct
1,Amelia,7/1/2023,7/1/2023,5452981,152583,6/25/2023 2:11 PM,YEG,YVR,16441187,1,...,Air Black Box,BaseCC-PH - Jacon Olinares,Domestic,C,FlowThru,Airport Improvement Fee,0,0,Mid Stage,Direct
2,Amelia,7/1/2023,7/1/2023,5452981,152583,6/25/2023 2:11 PM,YEG,YVR,16441187,1,...,Air Black Box,BaseCC-PH - Jacon Olinares,Domestic,C,FlowThru,Airport Improvement Fee,0,18140874,Mid Stage,Direct
3,Amelia,7/1/2023,7/1/2023,5452981,152583,6/25/2023 1:31 PM,YEG,YVR,16441187,4,...,Air Black Box,BaseCC-PH - Jacon Olinares,Domestic,C,Base,Fare,0,18140874,Mid Stage,Direct
4,Amelia,7/1/2023,7/1/2023,5452981,152583,6/25/2023 2:11 PM,YEG,YVR,16441187,4,...,Air Black Box,BaseCC-PH - Jacon Olinares,Domestic,C,Base,Fare,0,0,Mid Stage,Direct


In [467]:
report_df.dtypes

Source                       object
FlightDate1                  object
ActualFlightDate             object
lng_Reservation_Nmbr          int64
lng_Sked_Detail_Id_Nmbr       int64
ChargeDate                   object
Departure                    object
Arrival                      object
lng_Res_Legs_Id_Nmbr          int64
ChargeType                    int64
NetCharge                    object
Taxes                        object
mny_Tax_1_Percentage         object
TotalCharge                  object
str_GL_Charge_Type_Desc      object
str_Flight_Nmbr               int64
Percent                      object
str_GL_Charges_Desc          object
lng_Creation_User_Id_Nmbr     int64
lng_Leg_Nmbr                  int64
lng_Res_Segments_Id_Nmbr      int64
lng_Agency_Id_Nmbr            int64
Reference                    object
str_Currency_Ident           object
Base_Charge                  object
Base_Taxes                   object
mny_Exchange_Rate             int64
TotalCharge1                

In [468]:
columns_to_remove = ['Source','ActualFlightDate','mny_Tax_1_Percentage','Percent', 'ChargeDate', 'Base_Charge','Base_Taxes','TotalCharge1','mny_Exchange_Rate','PositiveChargePAX']
report_df = report_df.drop(columns=columns_to_remove)

report_df['NetCharge'] = report_df['NetCharge'].str.replace('$', '')
report_df['NetCharge'] = report_df['NetCharge'].str.replace(')', '')
report_df['NetCharge'] = report_df['NetCharge'].str.replace('(', '-')
report_df['NetCharge'] = report_df['NetCharge'].astype(float)

report_df['Taxes'] = report_df['Taxes'].str.replace('$', '')
report_df['Taxes'] = report_df['Taxes'].str.replace(')', '')
report_df['Taxes'] = report_df['Taxes'].str.replace('(', '-')
report_df['Taxes'] = report_df['Taxes'].astype(float)

report_df['TotalCharge'] = report_df['TotalCharge'].str.replace('$', '')
report_df['TotalCharge'] = report_df['TotalCharge'].str.replace(')', '')
report_df['TotalCharge'] = report_df['TotalCharge'].str.replace('(', '-')
report_df['TotalCharge'] = report_df['TotalCharge'].astype(float)

#report_df['Percent'] = report_df['Percent'].str.replace('%', '')
report_df['mny_Exchange_Rate'] = report_df['mny_Exchange_Rate'].astype(float)
report_df['FlightDate1'] = pd.to_datetime(report_df['FlightDate1'])


# Compare two dataframes

In [472]:
snowflake_pd_df =snowflake_pd_df.drop_duplicates()
snowflake_pd_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 173206 entries, 0 to 174305
Data columns (total 28 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   Flight Date          173206 non-null  datetime64[ns]
 1   Reservation Nmbr     173206 non-null  int32         
 2   Sked Detail Id Nmbr  173206 non-null  int32         
 3   Departure            173206 non-null  object        
 4   Arrival              173206 non-null  object        
 5   Legs Id Nmbr         173206 non-null  int32         
 6   Charge Type          173206 non-null  int16         
 7   Net Charge           173206 non-null  float64       
 8   Taxes                173206 non-null  float64       
 9   Total Charge         173206 non-null  float64       
 10  Charge Type Desc     173206 non-null  object        
 11  Flight Nmbr          173206 non-null  int32         
 12  Charges Desc         173206 non-null  object        
 13  User Id Nmbr       

In [470]:
report_df = report_df.drop_duplicates()
report_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 83955 entries, 0 to 85787
Data columns (total 28 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   FlightDate1                83955 non-null  datetime64[ns]
 1   lng_Reservation_Nmbr       83955 non-null  int64         
 2   lng_Sked_Detail_Id_Nmbr    83955 non-null  int64         
 3   Departure                  83955 non-null  object        
 4   Arrival                    83955 non-null  object        
 5   lng_Res_Legs_Id_Nmbr       83955 non-null  int64         
 6   ChargeType                 83955 non-null  int64         
 7   NetCharge                  83955 non-null  float64       
 8   Taxes                      83955 non-null  float64       
 9   TotalCharge                83955 non-null  float64       
 10  str_GL_Charge_Type_Desc    83955 non-null  object        
 11  str_Flight_Nmbr            83955 non-null  int64         
 12  str_GL_Ch

In [444]:
report_df.columns = snowflake_pd_df.columns 

ValueError: Length mismatch: Expected axis has 30 elements, new values have 28 elements

In [445]:
report_df = report_df.astype(snowflake_pd_df.dtypes)

KeyError: "Only a column name can be used for the key in a dtype mappings argument. 'Flight Date' not found in columns."

In [415]:
merged = snowflake_pd_df.merge(report_df, how = 'outer', indicator = True)
snowflake_only = merged[merged['_merge'] == "left_only"]
common = merged[merged['_merge'] == "both"]
report_only = merged[merged['_merge'] == "right_only"]

In [416]:
common

Unnamed: 0,Source,Flight Date,Reservation Nmbr,Sked Detail Id Nmbr,Departure,Arrival,Legs Id Nmbr,Charge Type,Net Charge,Taxes,...,Agency Name,Sale_Username,Transborder,Leg Status,Category,AncillaryCategory,PurchaseCnt,Classification,Channel,_merge
0,Amelia,2023-07-01,7871670,116595,YEG,YXX,16168922,5,7.12,0.36,...,Air Black Box,ABBAPI,Domestic,C,FlowThru,Air Traveller Security Charge,0,Short Haul,Direct,both
2,Amelia,2023-07-01,7898074,119901,YEG,YVR,16224434,4,71.21,3.56,...,Air Black Box,ABBAPI,Domestic,C,Base,Fare,0,Mid Stage,Direct,both
7,Amelia,2023-07-01,7806899,148928,YYZ,YYG,16034645,1,35.00,4.55,...,Air Black Box,ABBAPI,Domestic,C,FlowThru,Airport Improvement Fee,0,Mid Stage,Direct,both
8,Amelia,2023-07-01,7805447,112911,YYZ,YEG,16031514,1,35.00,4.55,...,Air Black Box,ABBAPI,Domestic,C,FlowThru,Airport Improvement Fee,0,Long Haul,Direct,both
10,Amelia,2023-07-01,7119505,127170,YKF,CUN,14665587,1,15.00,1.95,...,Air Black Box,ABBAPI,International,C,FlowThru,Airport Improvement Fee,0,Sun,Direct,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183992,Amelia,2023-07-01,7142252,109136,YVR,YYZ,14713882,1,25.00,1.25,...,API - Travelfusion,TRAVELFUSION-F8,Domestic,C,FlowThru,Airport Improvement Fee,0,Long Haul,InDirect,both
184001,Amelia,2023-07-01,7974080,118171,YEG,YOW,16382097,4,147.40,7.37,...,Air Black Box,ABBAPI,Domestic,C,Base,Fare,0,Long Haul,Direct,both
184005,Amelia,2023-07-01,6060939,129176,YLW,YWG,12432671,5,7.12,0.36,...,Flair Air,BaseCC-PH - Anne Juliette Bell,Domestic,X,FlowThru,Air Traveller Security Charge,0,Mid Stage,Direct,both
184006,Amelia,2023-07-01,5408575,115474,YYZ,YSJ,10940759,5,7.12,0.93,...,Air Black Box,ABBAPI,Domestic,C,FlowThru,Air Traveller Security Charge,0,Mid Stage,Direct,both


In [417]:
snowflake_only

Unnamed: 0,Source,Flight Date,Reservation Nmbr,Sked Detail Id Nmbr,Departure,Arrival,Legs Id Nmbr,Charge Type,Net Charge,Taxes,...,Agency Name,Sale_Username,Transborder,Leg Status,Category,AncillaryCategory,PurchaseCnt,Classification,Channel,_merge
1,Amelia,2023-07-01,7898050,109353,YVR,YYZ,16224362,1,-25.00,-1.25,...,Air Black Box,Administration,Domestic,X,FlowThru,Airport Improvement Fee,0,Long Haul,Direct,left_only
3,Amelia,2023-07-01,7899691,124552,YVR,PVR,16227737,5,-25.91,0.00,...,Air Black Box,Administration,International,X,FlowThru,Air Traveller Security Charge,0,Sun,Direct,left_only
4,Amelia,2023-07-01,7872054,144182,YOW,YHZ,16169781,1002,69.00,8.97,...,Air Black Box,ABBAPI,Domestic,X,Ancillary,Basic Bundle,1,Mid Stage,Direct,left_only
5,Amelia,2023-07-01,7760232,151305,YYC,YXX,15936924,5,-7.12,-0.36,...,Air Black Box,Administration,Domestic,X,FlowThru,Air Traveller Security Charge,0,Short Haul,Direct,left_only
6,Amelia,2023-07-01,7801649,102675,YVR,YUL,16023285,4,-254.78,-12.74,...,Air Black Box,Administration,Domestic,X,Base,Fare,0,Long Haul,Direct,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184000,Amelia,2023-07-01,7973763,105136,YOW,YYJ,16381426,1002,83.00,4.15,...,Air Black Box,ABBAPI,Domestic,X,Ancillary,Basic Bundle,1,Long Haul,Direct,left_only
184002,Amelia,2023-07-01,6050101,124444,YYZ,CUN,12411133,1000,0.00,0.00,...,Flair Air,BaseCC-PH - Ruel Velasquez,International,X,Ancillary,Call Center Fee,1,Sun,Direct,left_only
184003,Amelia,2023-07-01,7380537,108863,YOW,YYC,15193216,5,7.12,0.93,...,Air Black Box,ABBAPI,Domestic,X,FlowThru,Air Traveller Security Charge,0,Long Haul,Direct,left_only
184004,Amelia,2023-07-01,7380643,105270,YKF,YHZ,15193456,5,7.12,1.07,...,Air Black Box,ABBAPI,Domestic,X,FlowThru,Air Traveller Security Charge,0,Mid Stage,Direct,left_only


In [418]:
test2 = report_only[(report_only['Legs Id Nmbr'] ==14656731) & (report_only['Charge Type'] == 1)]

In [419]:
#str(test1["Percent of Full Leg"]) == str(test2["Percent of Full Leg"])

In [420]:
def calculate_accuracy(df1, df2):
    merged = df1.merge(df2, how = 'outer', indicator = True)
    matched = merged[merged['_merge'] == "both"]

    # Calculate the total number of elements
    total_elements = len(df2)

    # Calculate accuracy as a percentage
    accuracy = (len(matched) / total_elements) * 100

    return accuracy


In [421]:
accuracy_percentage = calculate_accuracy(snowflake_pd_df, report_df)
print(f"Accuracy: {accuracy_percentage:.2f}%")

Accuracy: 80.83%


# Output the difference file

In [422]:
try:
    snowflake_only.to_csv("snowflake_only_record.csv", index=False) 
except:
    print("No snowflake only records")
try:
    report_only.to_csv("report_only_record.csv", index=False) 
except:
    print("No report only records")