## Accuracy: 98.33% 

Things to note:


In [26]:
import snowflake.connector
import getpass
import pandas as pd
import json
from get_secret import get_secret
from datetime import datetime

# Import and clean snowflake views

In [27]:
# Connection settings come from the JSON document returned by get_secret();
# the warehouse / database / schema / log level are then pinned for this notebook.
config_json = json.loads(get_secret())
config_json.update({'warehouse':'COMPUTE_WH', 'database':'ANALYTICS_PROD','schema': 'IOATAWARE',"loglevel":'DEBUG'})
conn = snowflake.connector.connect(**config_json)

# Pull the unflown-revenue view rows selected by the filter below.
query = "SELECT * FROM ANALYTICS_PROD.IOATAWARE.VW_UNFLOWN_REVENUE WHERE \"Flight Year\" >= '2024' and \"Flight Month\" = '4'"
try:
    cur = conn.cursor()
    try:
        cur.execute(query)
        # The whole result set is materialised as a DataFrame here, so nothing
        # below needs the cursor or the session any more.
        results_df = cur.fetch_pandas_all()
    finally:
        cur.close()
finally:
    # Release the Snowflake session even when the query fails.
    conn.close()

In [28]:
# Work on a copy so the dtype clean-up applied to snowflake_pd_df does not also
# rewrite the raw query result held in results_df.
snowflake_pd_df = results_df.copy()

In [29]:
#snowflake_pd_df.dtypes

In [30]:
# Coerce the Snowflake columns to concrete dtypes. Each column is converted and
# assigned back in place, in the order listed.
column_converters = {
    'Charge Date': pd.to_datetime,
    'Flight Date': pd.to_datetime,
    'Flight Year': lambda values: values.astype(int),
    'Flight Month': lambda values: values.astype(int),
    'Net Charge Total': lambda values: values.astype(float),
    'Taxes Total': lambda values: values.astype(float),
    'Charge Total': lambda values: values.astype(float),
}
for column_name, convert in column_converters.items():
    snowflake_pd_df[column_name] = convert(snowflake_pd_df[column_name])

# Import and clean report data

In [31]:
# Load the exported report CSV that the Snowflake view is compared against.
report_csv_path = 'Unflown_Revenue/Unflown_Revenue_2404.csv'
report_df = pd.read_csv(report_csv_path, skiprows=0)

In [32]:
#report_df.dtypes 

In [33]:
def process_fee_column(value):
    """Convert a currency-formatted report value to a float.

    For strings, ``$`` signs and thousands-separator commas are removed and
    accounting-style parentheses are read as a negative amount, so
    ``"($1,234.50)"`` becomes ``-1234.5``. Values that are not strings
    (cells that are already numeric, or NaN) are passed straight to ``float``.

    Args:
        value: A single cell from a fee column.

    Returns:
        The numeric amount as a float.

    Raises:
        ValueError: If the cleaned text is not a valid number.
    """
    if not isinstance(value, str):
        # Already numeric (or NaN): there is no currency formatting to strip.
        return float(value)
    cleaned = (
        value.replace('$', '')   # drop the currency symbol
             .replace(',', '')   # drop thousands separators
             .replace(')', '')   # closing parenthesis carries no information
             .replace('(', '-')  # opening parenthesis marks a negative amount
    )
    return float(cleaned)

In [34]:
#'LastMod_SalesUser''PurchaseCnt','AncillaryCategory','Base_Taxes'

# Fee columns are run through process_fee_column one value at a time.
columns_to_process = ['NetChargeTotal', 'TaxesTotal','ChargeTotal']
for fee_column in columns_to_process:
    report_df[fee_column] = report_df[fee_column].map(process_fee_column)

# Date columns are parsed into pandas datetimes.
for date_column in ('ChargeDateSimple', 'FlightDate'):
    report_df[date_column] = pd.to_datetime(report_df[date_column])

# Compare two dataframes

In [None]:
# Relabel the report's columns with the Snowflake frame's column names so the two
# frames share column labels for the merge-based comparison. The assignment is
# purely positional and raises if the column counts differ.
# NOTE(review): assumes the report CSV lists its columns in the same order as the
# Snowflake view — confirm, since a different order would mislabel columns silently.
report_df.columns = snowflake_pd_df.columns 

In [42]:
def calculate_df2_accuracy(df1, df2):
    """Measure how many rows of each frame also appear in the other.

    The frames are outer-merged on all of their shared columns with a merge
    indicator; rows flagged ``left_only`` exist only in ``df1`` and rows flagged
    ``right_only`` exist only in ``df2``.

    Args:
        df1: First DataFrame.
        df2: Second DataFrame; must share at least one column label with ``df1``.

    Returns:
        A tuple ``(accuracy_df1, accuracy_df2, df1_only, df2_only)`` where each
        accuracy is the percentage of that frame's rows that found a match in the
        other frame, and each ``*_only`` frame holds the unmatched rows (including
        the ``_merge`` indicator column). An accuracy is NaN when its frame has no
        rows, because the percentage is undefined.
    """
    merged = df1.merge(df2, how='outer', indicator=True)
    df1_only = merged[merged['_merge'] == "left_only"]
    df2_only = merged[merged['_merge'] == "right_only"]

    def _matched_percentage(total_rows, unmatched_rows):
        # Guard the empty-frame case instead of raising ZeroDivisionError.
        if total_rows == 0:
            return float('nan')
        return ((total_rows - unmatched_rows) / total_rows) * 100

    accuracy_df1 = _matched_percentage(len(df1), len(df1_only))
    accuracy_df2 = _matched_percentage(len(df2), len(df2_only))

    return accuracy_df1, accuracy_df2, df1_only, df2_only


In [43]:
# Compare the report (first argument) against the Snowflake data (second argument);
# the results come back in that same order.
(
    report_accuracy,
    snowflake_accuracy,
    report_only,
    snowflake_only,
) = calculate_df2_accuracy(report_df, snowflake_pd_df)

In [44]:
# Summarise the match rate in each direction together with the unmatched row counts.
snowflake_only_count = len(snowflake_only)
report_only_count = len(report_only)
print(
    f"Accuracy: {snowflake_accuracy:.2f}% snowflake records in report\n"
    f"The number of records in snowflake but not in report is {snowflake_only_count}"
)
print(
    f"Accuracy: {report_accuracy:.2f}% report records in snowflake\n"
    f"The number of records in report but not in snowflake is {report_only_count}"
)

Accuracy: 16.40% snowflake records in report
The number of records in snowflake but not in report is 9017
Accuracy: 98.06% report records in snowflake
The number of records in report but not in snowflake is 35


# Output the difference file

In [45]:
# Write the unmatched rows from each side to CSV. An empty frame does not make
# to_csv fail (it writes a header-only file), so emptiness is reported explicitly
# and only genuine file-system errors are caught, with their real cause shown.
for side_label, only_records, csv_path in (
    ("snowflake", snowflake_only, "Unflown_Revenue/"+"snowflake_only_record.csv"),
    ("report", report_only, "Unflown_Revenue/"+"report_only_record.csv"),
):
    if only_records.empty:
        print(f"No {side_label} only records")
    try:
        # Still written when empty so a file from an earlier run is not left stale.
        only_records.to_csv(csv_path, index=False)
    except OSError as write_error:
        print(f"Could not write {side_label} only records to {csv_path}: {write_error}")

In [46]:
# Distinct reservation numbers seen on each side, in order of first appearance.
snow_reservations = snowflake_pd_df['Reservation #']
report_reservations = report_df['Reservation #']
snow_res = [*snow_reservations.unique()]
report_res = [*report_reservations.unique()]

In [47]:
# Reservation numbers that appear on one side only.
elements_in_snow_not_in_report = list(set(snow_res) - set(report_res))
elements_in_report_not_in_snow = list(set(report_res)-set(snow_res))

snow_file_path = "Unflown_Revenue/snowonly_reservation.txt"
report_file_path = "Unflown_Revenue/reportonly_reservation.txt"

# Write each list as comma-terminated values to its own file.
with open(snow_file_path, "w") as file:
    for item in elements_in_snow_not_in_report:
        file.write("%s," % item)
with open(report_file_path, "w") as file:
    # Fixed: this file previously received the snowflake-only list again
    # instead of the report-only reservations.
    for item in elements_in_report_not_in_snow:
        file.write("%s," % item)