## Accuracy: 100%
Things to note

1.	08/07/2023	Reservation number 8,959,169: PNR is null in Snowflake.

In [1]:
from snowflake.snowpark.session import Session
from snowflake.snowpark.types import *
from snowflake.snowpark.functions import col, lit
import getpass
import pandas as pd
import json
from get_secret import get_secret
from datetime import datetime
import snowflake.snowpark.functions as F
import numpy as np

In [2]:
# Start from the stored credentials, then pin the warehouse/database/schema
# (and log level) this validation run should use before opening the session.
connection_parameters = json.loads(get_secret())
connection_parameters.update(
    warehouse='COMPUTE_WH',
    database='ANALYTICS_PROD',
    schema='IOATAWARE',
    loglevel='DEBUG',
)
session = Session.builder.configs(connection_parameters).create()

In [3]:
# Echo the context the session actually resolved to, one item per line.
print(
    f"Current Database and schema: {session.get_fully_qualified_current_schema()}",
    f"Current Warehouse: {session.get_current_warehouse()}",
    sep="\n",
)

Current Database and schema: "ANALYTICS_PROD"."IOATAWARE"
Current Warehouse: "COMPUTE_WH"


# Import and clean Snowflake views

In [4]:
# Name of the Snowflake view to validate against the exported report.
view_name = "VW_PAYMENTS_RAW_DATA"
# Value matched against the view's "Payment Date" column when filtering.
# NOTE(review): presumably MM/DD/YYYY, matching the "0802" report file — confirm.
date = "08/02/2023"

In [5]:
# Reference the Snowflake view named by `view_name` through the open session.
snowpark_df = session.table(view_name)

In [6]:
# Keep only the rows whose "Payment Date" equals the date under test, then
# bring the result into pandas for the comparison below.
snowflake_pd_df = (
    snowpark_df
    .filter(F.col("Payment Date") == date)
    .to_pandas()
)

In [7]:
# Inspect the column dtypes of the Snowflake extract before cleaning.
snowflake_pd_df.dtypes

Payment Date             object
Reservation Nmbr          int32
PNR                      object
Payment Amount(CAN)     float64
Payment Method           object
Payment Description      object
Payer                    object
Payment Notes            object
Receipt Number            int32
Base Currency            object
Base Currency Amount    float64
Exchange Rate           float64
dtype: object

In [8]:
#snowflake_pd_df.to_csv("temp_snowflake.csv")

In [9]:
# Bring the Snowflake extract into a comparable shape:
#   1. parse "Payment Date" into datetimes,
#   2. turn empty strings into NaN,
#   3. round "Exchange Rate" to 6 decimal places.
snowflake_pd_df['Payment Date'] = pd.to_datetime(snowflake_pd_df['Payment Date'])
snowflake_pd_df = snowflake_pd_df.replace('', np.nan)
snowflake_pd_df['Exchange Rate'] = snowflake_pd_df['Exchange Rate'].round(6)

In [10]:
# Row count of the Snowflake extract, for comparison with the report's row count.
len(snowflake_pd_df)

16343

# Import and clean report data

In [13]:
# Load the exported payments report CSV; skiprows=0 means no leading lines are skipped.
report_df = pd.read_csv('Payment/Payments_Raw_Data_0802.csv', skiprows=0)

In [14]:
# Inspect the raw report dtypes (the amount columns load as object, i.e. text).
report_df.dtypes

Source                              object
dtm_GL_Payments_Date                object
lng_Reservation_Nmbr                 int64
PNR                                 object
PaymentsTotal                       object
str_GL_Payment_Method_Desc          object
str_GL_Payments_Desc                object
str_GL_Payments_Payer               object
str_GL_Payments_Notes               object
lng_GL_Payments_Receipt_Nmbr         int64
str_Currency_Ident1                 object
mny_GL_Currency_Payments_Amount     object
mny_Exchange_Rate                  float64
CC_Num                              object
dtype: object

In [15]:
def process_fee_column(value):
    """Convert an accounting-style currency value to a float rounded to 6 decimals.

    Text is normalised by dropping ``$``, ``,`` and ``)`` and turning ``(``
    into a minus sign, so ``"($1,234.50)"`` becomes ``-1234.5``.

    Args:
        value: Raw cell value. Normally a string such as ``"$1,234.50"``.
            Numeric values and missing values (``None`` / NaN) are accepted
            too, because ``pd.read_csv`` yields NaN floats for empty cells and
            a re-run of the cleaning cell passes already-converted floats.

    Returns:
        float: The parsed amount rounded to 6 decimal places, or NaN when the
        value is missing or blank.

    Raises:
        ValueError: If the cleaned text is not a valid number.
    """
    if value is None:
        return float('nan')

    if not isinstance(value, str):
        # Already numeric (including NaN from an empty CSV cell): there is no
        # currency formatting to strip, so only apply the rounding.
        return round(float(value), 6)

    # Remove "$", "," and ")" and map "(" to "-" in a single pass.
    currency_table = str.maketrans({'$': None, ',': None, ')': None, '(': '-'})
    cleaned = value.translate(currency_table).strip()
    if not cleaned:
        # Blank text is treated as missing rather than as a parse error.
        return float('nan')

    return round(float(cleaned), 6)

In [16]:
#'LastMod_SalesUser'
# Columns that exist only in the report export and have no counterpart in the
# Snowflake view, so they are dropped before comparing.
columns_to_remove = ['Source', 'CC_Num']
report_df = report_df.drop(columns=columns_to_remove)

# Amount columns arrive as currency-formatted text and must become floats.
columns_to_process = ['PaymentsTotal', 'mny_GL_Currency_Payments_Amount']

# The loop variable is deliberately not called `col`: that name is imported
# from snowflake.snowpark.functions at the top of the notebook and would be
# overwritten for every later cell.
for amount_column in columns_to_process:
    report_df[amount_column] = report_df[amount_column].apply(process_fee_column)

# Parse the payment date and discard any time-of-day component so it compares
# at day granularity (same result as formatting to %m/%d/%Y and re-parsing).
report_df['dtm_GL_Payments_Date'] = pd.to_datetime(
    report_df['dtm_GL_Payments_Date']
).dt.normalize()


In [17]:
# Re-check dtypes after cleaning: dates should be datetime64 and amounts float64.
report_df.dtypes

dtm_GL_Payments_Date               datetime64[ns]
lng_Reservation_Nmbr                        int64
PNR                                        object
PaymentsTotal                             float64
str_GL_Payment_Method_Desc                 object
str_GL_Payments_Desc                       object
str_GL_Payments_Payer                      object
str_GL_Payments_Notes                      object
lng_GL_Payments_Receipt_Nmbr                int64
str_Currency_Ident1                        object
mny_GL_Currency_Payments_Amount           float64
mny_Exchange_Rate                         float64
dtype: object

In [18]:
# Row count of the cleaned report, for comparison with the Snowflake row count.
len(report_df)

16347

# Compare two dataframes

In [19]:
# Relabel the report's columns with the Snowflake column names, by position, so
# the two frames share column names for the merge below.
# NOTE(review): this relies on both frames listing their columns in the same
# order — confirm whenever either schema changes.
report_df.columns = snowflake_pd_df.columns

In [20]:
# Outer-join the two frames; the `_merge` indicator column records whether each
# row was found on the Snowflake side, the report side, or both.
merged = pd.merge(snowflake_pd_df, report_df, how='outer', indicator=True)
snowflake_only = merged.loc[merged['_merge'].eq("left_only")]
report_only = merged.loc[merged['_merge'].eq("right_only")]

In [21]:
def calculate_df2_accuracy(df1, df2):
    """Return the percentage of rows in ``df2`` that also appear in ``df1``.

    The frames are outer-merged on their shared columns; a ``df2`` row counts
    as unmatched when the merge indicator marks it ``right_only``.

    Args:
        df1: Reference DataFrame.
        df2: DataFrame whose rows are being checked against ``df1``.

    Returns:
        float: ``(matched df2 rows / total df2 rows) * 100``, or NaN when
        ``df2`` has no rows (the ratio is undefined, and reporting 100% for
        an empty extract would be misleading).
    """
    total_elements = len(df2)
    if total_elements == 0:
        # Avoid ZeroDivisionError and make the "nothing to compare" case explicit.
        return float('nan')

    merged = df1.merge(df2, how='outer', indicator=True)
    unmatched = int((merged['_merge'] == "right_only").sum())

    return ((total_elements - unmatched) / total_elements) * 100


In [22]:
# Share of Snowflake rows (second argument) that have an identical row in the report.
accuracy_percentage = calculate_df2_accuracy(report_df,snowflake_pd_df)
print(f"Accuracy: {accuracy_percentage:.2f}%")

Accuracy: 100.00%


In [23]:
# Reverse direction: share of report rows (second argument) that have an
# identical row in the Snowflake extract.
accuracy_percentage = calculate_df2_accuracy(snowflake_pd_df, report_df)
print(f"Accuracy: {accuracy_percentage:.2f}%")

Accuracy: 99.98%


# Output the difference file

In [24]:
def _write_difference_file(records, path, empty_message):
    """Write unmatched rows to ``path`` as CSV and say so when there are none.

    ``DataFrame.to_csv`` does not raise for an empty frame (it writes a
    header-only file), so emptiness is checked explicitly instead of being
    inferred from an exception. The file is still written when empty so that a
    stale file from an earlier run cannot be mistaken for current differences.

    Args:
        records: DataFrame of rows found on only one side of the comparison.
        path: Destination CSV path.
        empty_message: Text printed when ``records`` has no rows.
    """
    if records.empty:
        print(empty_message)
    try:
        records.to_csv(path, index=False)
    except OSError as exc:
        # Report the real I/O problem (missing folder, locked file, ...)
        # instead of disguising it as "no records".
        print(f"Could not write {path}: {exc}")


_write_difference_file(
    snowflake_only, "Payment/" + "snowflake_only_record.csv", "No snowflake only records"
)
_write_difference_file(
    report_only, "Payment/" + "report_only_record.csv", "No report only records"
)

No snowflake only records
