## Columns not tested yet: 'Leg Status', 'Sales Username', 'Charge Date'

In [1]:
from snowflake.snowpark.session import Session
from snowflake.snowpark.types import *
from snowflake.snowpark.functions import col, lit
import getpass
import pandas as pd
import json
from get_secret import get_secret
from datetime import datetime
import snowflake.snowpark.functions as F

In [2]:
# Start from the settings returned by get_secret() (parsed from JSON), then
# layer the warehouse/database/schema/log-level for this notebook on top.
session_context = {
    'warehouse': 'COMPUTE_WH',
    'database': 'ANALYTICS_PROD',
    'schema': 'IOATAWARE',
    "loglevel": 'DEBUG',
}
connection_parameters = {**json.loads(get_secret()), **session_context}
session = Session.builder.configs(connection_parameters).create()

In [3]:
# Echo the schema and warehouse the session reports, to confirm the connection
# landed in the intended context before any data is pulled.
print(f"Current Database and schema: {session.get_fully_qualified_current_schema()}")
print(f"Current Warehouse: {session.get_current_warehouse()}")

Current Database and schema: "ANALYTICS_PROD"."IOATAWARE"
Current Warehouse: "COMPUTE_WH"


In [4]:
import warnings

# Ignore all warnings
warnings.filterwarnings('ignore', category=Warning)

# Import and clean snowflake views

In [5]:
# View under test, plus the two flight dates (month/day/year strings) that are
# pulled from it further down.
view_name = "VW_REVENUE_RAW_DATA"
flight_date1, flight_date2 = "07/30/2023", "07/31/2023"

In [6]:
# Snowpark DataFrame over the view named in view_name; filtered per date below.
snowpark_df = session.table(view_name)

In [7]:
# Pull each flight date into its own pandas frame, in the same order as before
# (flight_date1 first, then flight_date2).
snowflake_pd_df1, snowflake_pd_df2 = (
    snowpark_df.filter(F.col('Flight Date') == requested_date).to_pandas()
    for requested_date in (flight_date1, flight_date2)
)

In [8]:
# Stack both days into one frame. reset_index() without drop=True keeps the old
# row labels as an 'index' column; that column is dropped later together with
# the untested columns.
snowflake_pd_df = pd.concat([snowflake_pd_df1,snowflake_pd_df2]).reset_index()

In [9]:
# snowflake_pd_df['combined_timestamp'] = pd.to_datetime(snowflake_pd_df['Flight Date'] + ' ' + snowflake_pd_df['Flight time'])
# snowflake_pd_df['combined_timestamp_mt'] = snowflake_pd_df['combined_timestamp'].dt.tz_localize('UTC').dt.tz_convert('US/Mountain')
# snowflake_pd_df['Flight Date'] = snowflake_pd_df['combined_timestamp_mt'].dt.date
#snowflake_pd_df['Charge Date'] = pd.to_datetime(snowflake_pd_df['Charge Date']).dt.tz_localize('UTC').dt.tz_convert('US/Mountain')

In [10]:
# Keep only the rows for the date under test. Use the flight_date1 setting
# rather than repeating the literal, so changing the date in the config cell
# cannot leave this filter pointing at a stale day.
# NOTE(review): presumably the second day is loaded so that the (currently
# commented-out) UTC -> US/Mountain shift can move rows across dates — confirm.
snowflake_pd_df = snowflake_pd_df[snowflake_pd_df['Flight Date'] == flight_date1]

In [11]:
# Display the column labels of the Snowflake frame (including the 'index'
# column added by reset_index) before choosing what to drop.
snowflake_pd_df.columns

Index(['index', 'Flight Date', 'Reservation Nmbr', 'Sked Detail Id Nmbr',
       'Charge Date', 'Departure', 'Arrival', 'Legs Id Nmbr', 'Charge Type',
       'Net Charge', 'Taxes', 'Total Charge', 'Charge Type Desc',
       'Flight Nmbr', '\tPercent of Full Leg', 'Charges Desc', 'User Id Nmbr',
       'Leg Nmbr', 'Segments Id Nmbr', 'Agency Id Nmbr', 'Reference',
       'Currency Ident', 'Agency Name', 'Sales Username', 'Transborder',
       'Leg Status', 'Category', 'Ancillary Category', 'Purchase Cnt',
       'Classification', 'Channel'],
      dtype='object')

In [12]:
# Remove the columns we are not testing for now

In [13]:
# Drop the columns excluded from the comparison, then coerce the date, money
# and flight-number columns to concrete dtypes in a single chain.
columns_to_remove = ['index','Charge Date','\tPercent of Full Leg','Leg Status','Sales Username']
snowflake_pd_df = (
    snowflake_pd_df
    .drop(columns=columns_to_remove)
    .assign(**{'Flight Date': lambda frame: pd.to_datetime(frame['Flight Date'])})
    .astype({
        'Net Charge': float,
        'Taxes': float,
        'Total Charge': float,
        'Flight Nmbr': int,
    })
)

In [14]:
# Re-apply the flight_date1 filter now that 'Flight Date' has been converted
# with pd.to_datetime; the string setting is compared against that column.
on_target_date = snowflake_pd_df['Flight Date'] == flight_date1
snowflake_pd_df = snowflake_pd_df.loc[on_target_date]

# Import and clean report data

In [15]:
# Load the report extract that the Snowflake view is compared against.
# NOTE(review): the '0730' in the file name presumably has to correspond to
# flight_date1 — confirm whenever the date under test changes.
report_df = pd.read_csv('Revenue/Revenue_Raw_Data_0730_Beta.csv', skiprows=0)   

In [16]:
# Display the report's column labels to line them up against the Snowflake columns.
report_df.columns

Index(['Source', 'FlightDate1', 'ActualFlightDate', 'lng_Reservation_Nmbr',
       'lng_Sked_Detail_Id_Nmbr', 'ChargeDate', 'Departure', 'Arrival',
       'lng_Res_Legs_Id_Nmbr', 'ChargeType', 'NetCharge', 'Taxes',
       'mny_Tax_1_Percentage', 'TotalCharge', 'str_GL_Charge_Type_Desc',
       'str_Flight_Nmbr', 'Percent', 'str_GL_Charges_Desc',
       'lng_Creation_User_Id_Nmbr', 'lng_Leg_Nmbr', 'lng_Res_Segments_Id_Nmbr',
       'lng_Agency_Id_Nmbr', 'Reference', 'str_Currency_Ident', 'Base_Charge',
       'Base_Taxes', 'mny_Exchange_Rate', 'TotalCharge1', 'str_Agency_Name',
       'LastMod_SalesUser', 'Transborder', 'str_Leg_Status', 'Category',
       'AncillaryCategory', 'PurchaseCnt', 'PositiveChargePAX',
       'Classification', 'Channel', 'CancellationDate', 'PaxStatus',
       'ChargeStatus', 'SeatSold'],
      dtype='object')

In [17]:
# Report columns that have no counterpart in the comparison (either untested
# for now or absent from the Snowflake frame after its own drops).
columns_to_remove = [
    'LastMod_SalesUser', 'str_Leg_Status', 'Source', 'ActualFlightDate',
    'mny_Tax_1_Percentage', 'Percent', 'ChargeDate', 'Base_Charge', 'Base_Taxes',
    'TotalCharge1', 'mny_Exchange_Rate', 'PositiveChargePAX', 'CancellationDate',
    'PaxStatus', 'ChargeStatus', 'SeatSold',
]
report_df = report_df.drop(columns=columns_to_remove)


def _parse_accounting_amount(amounts):
    """Convert accounting-style money text to floats.

    Strips ``$`` and thousands separators and turns a parenthesised value such
    as ``($12.50)`` into ``-12.5``.

    Args:
        amounts: pandas Series of money strings, or an already-numeric Series.

    Returns:
        A float Series with the same index as ``amounts``.

    Raises:
        ValueError: if a value is still not parseable as a float after cleaning.
    """
    # read_csv yields a numeric column when no value carries a symbol; the
    # .str accessor would raise on it, so just normalise the dtype.
    if pd.api.types.is_numeric_dtype(amounts):
        return amounts.astype(float)

    cleaned = amounts
    # regex=False is explicit on purpose: '$', '(' and ')' are regex
    # metacharacters, and the default of `regex` differs between pandas versions.
    for token, replacement in (('$', ''), (',', ''), (')', ''), ('(', '-')):
        cleaned = cleaned.str.replace(token, replacement, regex=False)
    return cleaned.astype(float)


for money_column in ('NetCharge', 'Taxes', 'TotalCharge'):
    report_df[money_column] = _parse_accounting_amount(report_df[money_column])

# 'Percent' and 'mny_Exchange_Rate' are dropped above, so they need no parsing here.

# Truncate to midnight so the values compare equal to the date-only Snowflake
# column. Replaces the former to_datetime -> strftime -> to_datetime round trip.
# NOTE(review): assumes FlightDate1 parses to timezone-naive timestamps — confirm.
report_df['FlightDate1'] = pd.to_datetime(report_df['FlightDate1']).dt.normalize()

In [18]:
# Align the report's column names with the Snowflake frame by NAME instead of
# by position: a positional `report_df.columns = ...` silently mislabels data
# as soon as either side's column order changes.
REPORT_TO_SNOWFLAKE_COLUMNS = {
    'FlightDate1': 'Flight Date',
    'lng_Reservation_Nmbr': 'Reservation Nmbr',
    'lng_Sked_Detail_Id_Nmbr': 'Sked Detail Id Nmbr',
    'lng_Res_Legs_Id_Nmbr': 'Legs Id Nmbr',
    'ChargeType': 'Charge Type',
    'NetCharge': 'Net Charge',
    'TotalCharge': 'Total Charge',
    'str_GL_Charge_Type_Desc': 'Charge Type Desc',
    'str_Flight_Nmbr': 'Flight Nmbr',
    'str_GL_Charges_Desc': 'Charges Desc',
    'lng_Creation_User_Id_Nmbr': 'User Id Nmbr',
    'lng_Leg_Nmbr': 'Leg Nmbr',
    'lng_Res_Segments_Id_Nmbr': 'Segments Id Nmbr',
    'lng_Agency_Id_Nmbr': 'Agency Id Nmbr',
    'str_Currency_Ident': 'Currency Ident',
    'str_Agency_Name': 'Agency Name',
    'AncillaryCategory': 'Ancillary Category',
    'PurchaseCnt': 'Purchase Cnt',
    # Counterparts of columns that are currently dropped on both sides; they
    # take effect only if those columns are kept in a later revision.
    # NOTE(review): pairing inferred from the matching column positions — confirm.
    'ChargeDate': 'Charge Date',
    'Percent': '\tPercent of Full Leg',
    'LastMod_SalesUser': 'Sales Username',
    'str_Leg_Status': 'Leg Status',
}

renamed_report_df = report_df.rename(columns=REPORT_TO_SNOWFLAKE_COLUMNS)
mismatched_columns = set(renamed_report_df.columns) ^ set(snowflake_pd_df.columns)
if mismatched_columns:
    # Fail loudly: merging on a partial set of shared columns would make the
    # accuracy figures meaningless.
    raise ValueError(
        f"Report and Snowflake columns do not line up: {sorted(mismatched_columns)}"
    )
# Same column order as the Snowflake frame, matching the previous end state.
report_df = renamed_report_df[list(snowflake_pd_df.columns)]

# Compare two dataframes

In [19]:
# Outer-join the two frames on their shared columns; the '_merge' indicator
# records which side each row came from, and is used to split out the rows
# present on one side only.
merged = pd.merge(snowflake_pd_df, report_df, how='outer', indicator=True)
row_origin = merged['_merge']
snowflake_only = merged.loc[row_origin.eq("left_only")]
report_only = merged.loc[row_origin.eq("right_only")]

In [20]:
def calculate_df2_accuracy(df1, df2):
    """Return the percentage of rows in ``df2`` that also appear in ``df1``.

    The frames are outer-merged with ``indicator=True``; every merged row
    flagged ``right_only`` is a ``df2`` row with no match in ``df1``.

    Args:
        df1: reference DataFrame.
        df2: DataFrame whose rows are being checked against ``df1``.

    Returns:
        Accuracy as a float in the range 0-100, or ``nan`` when ``df2`` has no
        rows (the ratio is undefined; previously this raised ZeroDivisionError).
    """
    total_elements = len(df2)
    if total_elements == 0:
        return float("nan")

    merged = df1.merge(df2, how='outer', indicator=True)
    unmatched_in_df2 = int((merged['_merge'] == "right_only").sum())

    return ((total_elements - unmatched_in_df2) / total_elements) * 100


In [21]:
# df1=report, df2=Snowflake: share of Snowflake rows that have a match in the report.
accuracy_percentage = calculate_df2_accuracy(report_df,snowflake_pd_df)
print(f"Accuracy: {accuracy_percentage:.2f}%")

Accuracy: 99.13%


In [22]:
# df1=Snowflake, df2=report: share of report rows that have a match in Snowflake.
accuracy_percentage = calculate_df2_accuracy(snowflake_pd_df, report_df)
print(f"Accuracy: {accuracy_percentage:.2f}%")

Accuracy: 98.99%


# Output the difference file

In [23]:
# Write both difference files on every run so they always reflect the current
# comparison. DataFrame.to_csv does not raise for an empty frame (it writes a
# header-only file), so emptiness is checked explicitly instead of relying on
# an exception to detect it.
difference_outputs = (
    ("snowflake", snowflake_only, "snowflake_only_record.csv"),
    ("report", report_only, "report_only_record.csv"),
)
for side, unmatched_rows, csv_path in difference_outputs:
    if unmatched_rows.empty:
        print(f"No {side} only records")
    try:
        unmatched_rows.to_csv(csv_path, index=False)
    except OSError as exc:
        # Report the real failure rather than mislabelling it as "no records".
        print(f"Could not write {csv_path}: {exc}")