In [1]:
from snowflake.snowpark.session import Session
from snowflake.snowpark.types import *
from snowflake.snowpark.functions import col, lit
import getpass
import pandas as pd
import json
from get_secret import get_secret
from datetime import datetime
import snowflake.snowpark.functions as F

In [2]:
# Load the connection settings returned by get_secret() (a JSON document) and
# pin the warehouse/database/schema/log level used by this notebook.
connection_parameters = json.loads(get_secret())
connection_parameters.update(
    warehouse='COMPUTE_WH',
    database='ANALYTICS_PROD',
    schema='IOATAWARE',
    loglevel='DEBUG',
)
session = Session.builder.configs(connection_parameters).create()

In [3]:
# Confirm which database/schema and warehouse the session is actually using.
current_schema = session.get_fully_qualified_current_schema()
current_warehouse = session.get_current_warehouse()
print(f"Current Database and schema: {current_schema}")
print(f"Current Warehouse: {current_warehouse}")

Current Database and schema: "ANALYTICS_PROD"."IOATAWARE"
Current Warehouse: "COMPUTE_WH"


# Import and clean Snowflake views

In [33]:
# Name of the Snowflake view to validate against the exported report.
view_name = "VW_MONTHLY_REVENUE"

In [34]:
# Reference the view through the Snowpark session.
snowpark_df = session.table(view_name)

In [35]:
# Convert the Snowpark DataFrame to pandas so it can be compared with the CSV report locally.
snowflake_pd_df = snowpark_df.to_pandas()

In [41]:
# Normalise dtypes so the Snowflake rows can be compared with the report rows:
# the date column becomes datetime64 and every revenue column becomes float.
snowflake_pd_df['Flight Date'] = pd.to_datetime(snowflake_pd_df['Flight Date'])
for revenue_col in ('Daily Base Revenue', 'Daily Ancillary Revenue', 'Total'):
    snowflake_pd_df[revenue_col] = snowflake_pd_df[revenue_col].astype(float)

In [20]:
# snowflake_pd_df['combined_timestamp'] = pd.to_datetime(snowflake_pd_df['Flight Date'] + ' ' + snowflake_pd_df['Flight time'])
# snowflake_pd_df['combined_timestamp_mt'] = snowflake_pd_df['combined_timestamp'].dt.tz_localize('UTC').dt.tz_convert('US/Mountain')
# snowflake_pd_df['Flight Date'] = snowflake_pd_df['combined_timestamp_mt'].dt.date

#snowflake_pd_df['Charge Date'] = pd.to_datetime(snowflake_pd_df['Charge Date']).dt.tz_localize('UTC').dt.tz_convert('US/Mountain')

In [38]:
# Restrict the Snowflake rows to the window (start_date, end_date]:
# the lower bound is exclusive, the upper bound inclusive.
# NOTE(review): the report output shown later in this notebook starts on
# 2022-07-31, which this exclusive lower bound drops — confirm that is intended.
start_date = '2022-07-31'
end_date = '2023-07-31'
flight_dates = snowflake_pd_df['Flight Date']
after_start = flight_dates > start_date
on_or_before_end = flight_dates <= end_date
mask = after_start & on_or_before_end
snowflake_pd_df1 = snowflake_pd_df.loc[mask]

In [54]:
# Spot-check a single flight date in the filtered Snowflake data.
# The date literal previously carried a stray trailing tab character, which only
# matched because the string-to-datetime coercion happened to tolerate it.
snowflake_pd_df1[snowflake_pd_df1["Flight Date"] == "2023-06-19"]

Unnamed: 0,Flight Date,Daily Base Revenue,Daily Ancillary Revenue,Total
0,2023-06-19,874179.47,544661.35,1418840.82


In [795]:
#Remove the column we are not testing for now

# Import and clean report data

In [42]:
# Load the exported report; the first line of the file is the header row.
report_df = pd.read_csv('MonthlyRevenue/Monthly_Revenue_Summary.csv')

In [43]:
# List the column names of the loaded report.
report_df.columns 

Index(['FlightDate', 'DailyBaseRevenue', 'DailyAncillaryRevenue', 'Total'], dtype='object')

In [48]:
# Strip currency formatting ("$" and thousands separators) from the revenue
# columns so they can be cast to float in the next cell.
# regex=False is passed explicitly: "$" is a regular-expression end-of-string
# anchor, and the default of `regex` differs between pandas versions, so relying
# on the default can leave the dollar sign in place (or emit a FutureWarning).
for money_col in ('DailyBaseRevenue', 'DailyAncillaryRevenue', 'Total'):
    report_df[money_col] = (
        report_df[money_col]
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
    )

In [49]:
# Give the report the same dtypes as the Snowflake frame:
# datetime64 for the date column, float for every revenue column.
report_df['FlightDate'] = pd.to_datetime(report_df['FlightDate'])
for revenue_col in ('DailyBaseRevenue', 'DailyAncillaryRevenue', 'Total'):
    report_df[revenue_col] = report_df[revenue_col].astype(float)

In [60]:
# Spot-check a single flight date in the report data.
report_df[report_df["FlightDate"] == "2023-06-19"]

Unnamed: 0,FlightDate,DailyBaseRevenue,DailyAncillaryRevenue,Total
323,2023-06-19,882676.34,706390.84,1589067.18


In [50]:
# Display the cleaned report frame.
report_df

Unnamed: 0,FlightDate,DailyBaseRevenue,DailyAncillaryRevenue,Total
0,2022-07-31,1563392.70,732167.89,2295560.59
1,2022-08-01,1875110.19,735743.09,2610853.28
2,2022-08-02,1401295.00,655608.04,2056903.04
3,2022-08-03,1433307.82,657460.27,2090768.09
4,2022-08-04,1213723.53,633084.74,1846808.26
...,...,...,...,...
361,2023-07-27,1287112.40,849252.17,2136364.57
362,2023-07-28,1509426.89,634531.07,2143957.96
363,2023-07-29,1357027.59,566700.95,1923728.54
364,2023-07-30,1545718.81,541945.63,2087664.44


# Compare two dataframes

In [61]:
# Turn every row of each frame into a tuple so rows can be compared as set
# members; rows match only when all values are equal position by position.
# NOTE(review): this uses the unfiltered snowflake_pd_df rather than the
# date-filtered snowflake_pd_df1 — confirm which one is intended.
set_snowflake = {tuple(row) for row in snowflake_pd_df.to_numpy()}
set_report = {tuple(row) for row in report_df.to_numpy()}
common_rows = set_snowflake & set_report

In [62]:
# Flag each row by whether its tuple of values is one of the common rows, then
# split the frames into "in both", "Snowflake only" and "report only".
snowflake_is_common = snowflake_pd_df.apply(tuple, axis=1).isin(common_rows)
report_is_common = report_df.apply(tuple, axis=1).isin(common_rows)

in_both_df = snowflake_pd_df[snowflake_is_common]
only_in_snowflake_df = snowflake_pd_df[~snowflake_is_common]
only_in_report_df = report_df[~report_is_common]


In [63]:
# Snowflake rows whose full tuple of values also appears in the report.
in_both_df

Unnamed: 0,Flight Date,Daily Base Revenue,Daily Ancillary Revenue,Total


# Compute accuracy and output the difference files

In [820]:
# Matching Snowflake rows as a percentage of the number of report rows.
row_match_pct = (len(in_both_df) / len(report_df)) * 100
print("accuracy is: ", row_match_pct)

accuracy is:  85.75791485988717


In [823]:
# Write the rows unique to each side to CSV for offline inspection.
# `to_csv` does not raise on an empty frame, so emptiness is checked explicitly
# instead of relying on an exception. Only file-system errors are caught, and
# they are reported as such, so unrelated failures (e.g. an undefined variable)
# are no longer hidden behind a "No ... records" message.
for diff_df, csv_path, empty_msg in (
    (only_in_snowflake_df, "snowflake_only_record.csv", "No snowflake only records"),
    (only_in_report_df, "report_only_record.csv", "No report only records"),
):
    if diff_df.empty:
        print(empty_msg)
    try:
        # Still written when empty (header only) so a stale file from an
        # earlier run is never mistaken for the current result.
        diff_df.to_csv(csv_path, index=False)
    except OSError as exc:
        print(f"Could not write {csv_path}: {exc}")

In [665]:
# Outer-merge both frames and use the `_merge` indicator column to split the
# result into Snowflake-only, common and report-only rows.
# NOTE(review): no `on=` is given, so pandas joins on the column names the two
# frames share. With the column names used in the cleaning cells above, that is
# only 'Total' — confirm whether the columns should be aligned first.
merged = snowflake_pd_df.merge(report_df, how='outer', indicator=True)
merge_origin = merged['_merge']
snowflake_only = merged[merge_origin == "left_only"]
common = merged[merge_origin == "both"]
report_only = merged[merge_origin == "right_only"]

In [666]:
# Rows the merge flagged as present on both sides.
common

Unnamed: 0,Flight Date,Reservation Nmbr,Sked Detail Id Nmbr,Departure,Arrival,Legs Id Nmbr,Charge Type,Net Charge,Taxes,Total Charge,...,Currency Ident,Agency Name,Sales Username,Transborder,Category,Ancillary Category,Purchase Cnt,Classification,Channel,_merge


In [542]:
#str(test1["Percent of Full Leg"]) == str(test2["Percent of Full Leg"])

In [643]:
def calculate_accuracy(df1, df2):
    """Return the percentage of rows in ``df2`` that also occur in ``df1``.

    Rows are compared on the columns whose names appear in both frames, which
    is the same implicit key ``DataFrame.merge`` uses when ``on`` is omitted.

    Each row of ``df2`` is counted at most once. Duplicate rows in ``df1`` are
    collapsed before matching; otherwise one ``df2`` row would be counted once
    per duplicate and the result could exceed 100%.

    Args:
        df1: Frame to search for matches (the data under test).
        df2: Reference frame; its row count is the denominator.

    Returns:
        A float between 0 and 100. Returns 0.0 when ``df2`` has no rows,
        instead of raising ``ZeroDivisionError``.

    Raises:
        pandas.errors.MergeError: If ``df2`` has rows but the frames share no
            column names.
    """
    total_elements = len(df2)
    if total_elements == 0:
        # Nothing to match against; avoid dividing by zero.
        return 0.0

    # Keep only the shared key columns of df1 and de-duplicate them, so the
    # left merge below yields exactly one output row per df2 row.
    shared_columns = [column for column in df2.columns if column in df1.columns]
    unique_df1_keys = df1[shared_columns].drop_duplicates()

    merged = df2.merge(unique_df1_keys, how='left', indicator=True)
    matched = int((merged['_merge'] == "both").sum())

    # Express the match count as a percentage of the reference rows.
    return (matched / total_elements) * 100


In [644]:
# Merge-based match rate, expressed relative to the number of rows in report_df.
accuracy_percentage = calculate_accuracy(snowflake_pd_df, report_df)
print(f"Accuracy: {accuracy_percentage:.2f}%")

Accuracy: 67.77%


# Output the difference file

In [545]:
# Write the merge-based difference sets to CSV for offline inspection.
# `to_csv` does not raise on an empty frame, so emptiness is checked explicitly
# instead of relying on an exception. Only file-system errors are caught, and
# they are reported as such, so unrelated failures (e.g. an undefined variable)
# are no longer hidden behind a "No ... records" message.
for diff_df, csv_path, empty_msg in (
    (snowflake_only, "snowflake_only_record.csv", "No snowflake only records"),
    (report_only, "report_only_record.csv", "No report only records"),
):
    if diff_df.empty:
        print(empty_msg)
    try:
        # Still written when empty (header only) so a stale file from an
        # earlier run is never mistaken for the current result.
        diff_df.to_csv(csv_path, index=False)
    except OSError as exc:
        print(f"Could not write {csv_path}: {exc}")