## Accuracy: 98.31%

Things to note:
1. 

In [1]:
from snowflake.snowpark.session import Session
from snowflake.snowpark.types import *
from snowflake.snowpark.functions import col, lit
import getpass
import pandas as pd
import json
from get_secret import get_secret
from datetime import datetime
import snowflake.snowpark.functions as F

In [2]:
def process_fee_column(value):
    """Convert an accounting-style currency value to a float.

    Strips the ``$`` sign and thousands separators, and treats a value
    wrapped in parentheses as negative (``"($3.25)"`` -> ``-3.25``).

    Args:
        value: Currency text such as ``"$1,234.56"`` or ``"($3.25)"``.
            Non-string values (already numeric cells, NaN) are passed
            straight to ``float``.

    Returns:
        The numeric amount as a float.

    Raises:
        ValueError: If the cleaned text is not a valid number (e.g. ``""``).
    """
    if not isinstance(value, str):
        # Cells that pandas already parsed as numbers have nothing to strip.
        return float(value)

    cleaned = value.strip()
    for symbol in ('$', ',', ')'):
        cleaned = cleaned.replace(symbol, '')
    # An opening parenthesis marks a negative amount in accounting notation.
    cleaned = cleaned.replace('(', '-')
    return float(cleaned)

In [3]:
def convert_to_hms(time_string):
    """Convert a 12-hour clock string to 24-hour ``HH:MM:SS`` text.

    Accepts ``"%I:%M %p"`` (e.g. ``"1:05 PM"``) and ``"%I:%M:%S %p"``
    (e.g. ``"1:05:30 PM"``). Surrounding whitespace is ignored.

    Args:
        time_string: The time text to convert. Non-string values such as
            ``None`` or NaN (missing cells) are treated as unparseable.

    Returns:
        The time formatted as ``"%H:%M:%S"``, or ``None`` when the input is
        not a string or matches neither accepted format.
    """
    if not isinstance(time_string, str):
        # strptime raises TypeError on non-strings; missing cells map to None.
        return None

    candidate = time_string.strip()
    for input_format in ("%I:%M %p", "%I:%M:%S %p"):
        try:
            parsed = datetime.strptime(candidate, input_format)
        except ValueError:
            continue
        return parsed.strftime("%H:%M:%S")

    return None

In [4]:
# Build the Snowpark connection settings: credentials come from the secret
# (a JSON document), then the warehouse/database/schema/log level for this
# notebook are layered on top, overriding any same-named keys in the secret.
connection_parameters = {
    **json.loads(get_secret()),
    'warehouse': 'COMPUTE_WH',
    'database': 'ANALYTICS_PROD',
    'schema': 'IOATAWARE',
    "loglevel": 'DEBUG',
}
session = Session.builder.configs(connection_parameters).create()

In [5]:
# Sanity check: show which database/schema and warehouse the session is using.
print("Current Database and schema:", session.get_fully_qualified_current_schema())
print("Current Warehouse:", session.get_current_warehouse())

# Import and clean the Snowflake view

In [6]:
# Name of the Snowflake view to load for the comparison.
view_name = "VW_REFUNDS_RAW_DATA"
# Value matched against the view's 'Refund Date' column when filtering.
# NOTE(review): presumably MM/DD/YYYY and meant to match the day of the report
# CSV loaded later — confirm both are updated together.
date = "08/01/2023"

In [7]:
# Snowpark DataFrame handle for the view named by `view_name`.
snowpark_df = session.table(view_name)

In [8]:
# Keep only the rows whose 'Refund Date' equals `date`, convert them to a
# pandas DataFrame, and display how many rows came back.
snowflake_pd_df = snowpark_df.filter(F.col('Refund Date') == date ).to_pandas()
len(snowflake_pd_df)
# Disabled second pull (references `flight_date2`, which is not defined in this notebook):
#snowflake_pd_df2 = snowpark_df.filter(F.col('Flight Date') == flight_date2 ).to_pandas()

In [9]:
# Display the column dtypes of the Snowflake extract before cleaning.
snowflake_pd_df.dtypes

In [28]:
# TODO (from the original author): revisit 'Purchase Cnt', 'Ancillary Category'
# and 'Resident Base Taxes' in the future.
# Dropping 'Refund Time' is currently disabled:
# columns_to_remove = ['Refund Time']
# snowflake_pd_df = snowflake_pd_df.drop(columns=columns_to_remove)

# Normalise the Snowflake extract: parse the refund date, express the refund
# time as 24-hour HH:MM:SS text, and round the exchange rate to 6 decimals.
snowflake_pd_df['Refund Date'] = pd.to_datetime(snowflake_pd_df['Refund Date'])
snowflake_pd_df['Refund Time'] = snowflake_pd_df['Refund Time'].apply(convert_to_hms)
snowflake_pd_df['Exchange Rate'] = snowflake_pd_df['Exchange Rate'].round(6)

# Cast the money columns to float and the receipt number to int in one pass.
snowflake_pd_df = snowflake_pd_df.astype({
    'Refund Amount (CAN)': float,
    'Base Currency Amount': float,
    'lng GL Payments Receipt Nmbr': int,
})

# Import and clean report data

In [29]:
# Load the exported report CSV into pandas (no leading rows are skipped).
# NOTE(review): the file name looks tied to the `date` filter used for the
# Snowflake pull — confirm both are changed together for other days.
report_df = pd.read_csv('Refunds/Refunds_Raw_Data_0801.csv', skiprows=0)

In [30]:
# NOTE(review): column names left here by the original author, presumably
# candidates for later handling — confirm:
# 'LastMod_SalesUser', 'PurchaseCnt', 'AncillaryCategory', 'Base_Taxes'

# Drop the card-number column from the report.
columns_to_remove = ['CC_Num']
report_df = report_df.drop(columns=columns_to_remove)

columns_to_process = ['RefundTotal', 'mny_GL_Currency_Payments_Amount']

# Convert the currency-formatted columns to floats. The loop variable is
# deliberately not called `col`, so it does not overwrite the `col` function
# imported from snowflake.snowpark.functions.
for fee_column in columns_to_process:
    report_df[fee_column] = report_df[fee_column].apply(process_fee_column)

# Parse the refund date, format it as MM/DD/YYYY text (which discards any
# time-of-day part), then parse it again so the column holds datetimes.
report_df['dtm_Refund_Date'] = pd.to_datetime(report_df['dtm_Refund_Date'])
report_df['dtm_Refund_Date'] = report_df['dtm_Refund_Date'].dt.strftime('%m/%d/%Y')
report_df['dtm_Refund_Date'] = pd.to_datetime(report_df['dtm_Refund_Date'])

# Express the refund time as 24-hour HH:MM:SS text.
report_df['dtm_Refund_Date1'] = report_df['dtm_Refund_Date1'].apply(convert_to_hms)

In [31]:
# Display the number of rows in the cleaned report.
len(report_df)

In [32]:
# Display the report's column dtypes after cleaning.
report_df.dtypes

In [15]:
# Replace every missing value in the report with an empty string.
# NOTE(review): this applies to all columns, including numeric/date ones, and
# may change their dtype so they no longer match the Snowflake side in the
# merge — confirm this is intended.
report_df = report_df.fillna('')

In [16]:
#report_df.dtypes = snowflake_pd_df.dtypes
# Relabel the report columns with the Snowflake column names so the two frames
# share column labels for the merge. The assignment is positional.
# NOTE(review): assumes both frames have the same columns in the same order — confirm.
report_df.columns = snowflake_pd_df.columns

# Compare two dataframes

In [17]:
# Full outer join of the two frames; with `indicator=True` pandas adds a
# `_merge` column telling which side(s) each row came from.
merged = pd.merge(snowflake_pd_df, report_df, how='outer', indicator=True)

# Rows present on only one side are the discrepancies to investigate.
snowflake_only = merged.loc[merged['_merge'].eq("left_only")]
report_only = merged.loc[merged['_merge'].eq("right_only")]

In [18]:
def calculate_df2_accuracy(df1, df2):
    """Return the percentage of rows in ``df2`` that also appear in ``df1``.

    The frames are outer-merged (pandas joins on the columns they have in
    common when no key is given); every merged row flagged ``right_only``
    counts as a ``df2`` row with no match in ``df1``.

    Args:
        df1: Reference DataFrame.
        df2: DataFrame whose rows are checked against ``df1``.

    Returns:
        The matched share of ``df2`` as a float percentage, or NaN when
        ``df2`` has no rows (the ratio is undefined).
    """
    total_elements = len(df2)
    if total_elements == 0:
        # Nothing to score; avoid dividing by zero.
        return float('nan')

    merged = df1.merge(df2, how='outer', indicator=True)
    df2_only_count = int((merged['_merge'] == "right_only").sum())

    return ((total_elements - df2_only_count) / total_elements) * 100


In [26]:
# Share of Snowflake rows that also exist in the report, plus the count of
# Snowflake rows with no match in the report.
accuracy_percentage = calculate_df2_accuracy(report_df, snowflake_pd_df)
print(
    f"Accuracy: {accuracy_percentage:.2f}% snowflake records in report\n"
    f"The number of records in snowflake but not in report is {len(snowflake_only)}"
)

In [20]:
# Share of report rows that also exist in Snowflake, plus the count of report
# rows with no match in Snowflake.
accuracy_percentage = calculate_df2_accuracy(snowflake_pd_df, report_df)
print(
    f"Accuracy: {accuracy_percentage:.2f}% report records in snowflake\n"
    f"The number of records in report but not in snowflake is {len(report_only)}"
)

# Output the difference files

In [25]:
# Write the unmatched rows from each side to CSV.
# `to_csv` does not raise for an empty frame, so emptiness is checked
# explicitly. The file is still written (header only) so that a result from an
# earlier run is not left behind. Only I/O failures are caught, and they are
# reported as such.
if snowflake_only.empty:
    print("No snowflake only records")
try:
    snowflake_only.to_csv("Refunds/snowflake_only_record.csv", index=False)
except OSError as exc:
    print(f"Could not write snowflake only records: {exc}")

if report_only.empty:
    print("No report only records")
try:
    report_only.to_csv("Refunds/report_only_record.csv", index=False)
except OSError as exc:
    print(f"Could not write report only records: {exc}")

In [22]:
# Distinct reservation numbers seen on each side, as plain lists.
snow_res = [*snowflake_pd_df['Reservation Nmbr'].unique()]
report_res = [*report_df['Reservation Nmbr'].unique()]

In [23]:
# Reservation numbers that appear on one side only.
elements_in_snow_not_in_report = list(set(snow_res) - set(report_res))
elements_in_report_not_in_snow = list(set(report_res) - set(snow_res))

snow_file_path = "Refunds/snowonly_reservation.txt"
report_file_path = "Refunds/reportonly_reservation.txt"

# Write each list as comma-terminated values ("a,b,c,").
with open(snow_file_path, "w") as out_file:
    out_file.write("".join(f"{item}," for item in elements_in_snow_not_in_report))
# The report-only file must contain the report-only reservations; the previous
# version wrote the Snowflake-only list to both files.
with open(report_file_path, "w") as out_file:
    out_file.write("".join(f"{item}," for item in elements_in_report_not_in_snow))