In [1]:
from snowflake.snowpark.session import Session
from snowflake.snowpark.types import *
from snowflake.snowpark.functions import col, lit
import getpass
import pandas as pd
import json
from get_secret import get_secret
from datetime import datetime
import snowflake.snowpark.functions as F

In [2]:
# Base connection settings are parsed from the JSON text returned by get_secret()
# (presumably account credentials — confirm against get_secret's implementation).
# The session context below is layered on top and wins on any key clash.
session_context = {
    'warehouse': 'COMPUTE_WH',
    'database': 'ANALYTICS_PROD',
    'schema': 'IOATAWARE',
    "loglevel": 'DEBUG',
}
connection_parameters = {**json.loads(get_secret()), **session_context}
session = Session.builder.configs(connection_parameters).create()

In [3]:
# Echo the active database/schema and warehouse so the reader can confirm
# which environment the rest of the notebook runs against.
current_schema = session.get_fully_qualified_current_schema()
current_warehouse = session.get_current_warehouse()
print(f"Current Database and schema: {current_schema}")
print(f"Current Warehouse: {current_warehouse}")

Current Database and schema: "ANALYTICS_PROD"."IOATAWARE"
Current Warehouse: "COMPUTE_WH"


In [4]:
import warnings

# Ignore all warnings
warnings.filterwarnings('ignore', category=Warning)

# For snowflake views

In [5]:
# Name of the Snowflake view that is compared against the exported report.
view_name = "VW_SALES_RAW_DATA"
# Charge date under test; used to filter the view and to label the summary row.
date1 = "07/30/2023"
# NOTE(review): date2 is not read by any active cell visible in this notebook.
date2 = "07/31/2023"

In [6]:
# Handle on the view inside Snowflake; rows are only pulled locally when
# .to_pandas() is called in a later cell.
snowpark_df = session.table(view_name)

In [7]:
# Keep only the rows whose 'Charge Date' equals date1, then materialise the
# result locally as a pandas DataFrame.
on_charge_date = F.col('Charge Date') == date1
snowflake_pd_df = snowpark_df.filter(on_charge_date).to_pandas()

In [8]:
# Number of rows pulled from Snowflake for the charge date.
snowflake_pd_df.shape[0]

97945

In [9]:
# Flag rows that repeat an earlier row in every column; the first occurrence
# of each repeated row is not flagged (pandas default keep='first').
is_repeat = snowflake_pd_df.duplicated()
duplicates = snowflake_pd_df[is_repeat]

# Share of the extract made up of such repeats, reported as a percentage.
duplicate_share = len(duplicates) / len(snowflake_pd_df)
print( "Exact Duplicate records percentage: ", duplicate_share * 100, "%")

Exact Duplicate records percentage:  0.6197355658788095 %


In [10]:
# Display the exact-duplicate rows found in the Snowflake extract.
duplicates

Unnamed: 0,Charge Date,Reservation Nmbr,Sked Detail Id Nmbr,Flight Date,Flight Time,Departure,Arrival,Legs Id Nmbr,Charge Type,Net Charge,...,Sales Username,Ancillary Category,Purchase Cnt,Classification,Channel,Residency,Resident Exchange Rate,Resident Base Charge,Resident Base Discount,Resident Base Taxes
1614,07/30/2023,7651807,107482.0,07/31/2023,17:30:00,YYC,YKF,15746526,1002,100.00,...,ABBAPI,Basic Bundle,1,Long Haul,InDirect,CAD,1.0,100.00,0.0,5.00
1918,07/30/2023,8137354,108774.0,08/03/2023,11:40:00,YYC,YOW,16659966,1,-35.00,...,ABBAPI,Airport Improvement Fee,0,Long Haul,Direct,CAD,1.0,-35.00,0.0,-1.75
2083,07/30/2023,7169789,104516.0,08/08/2023,13:15:00,YOW,YHZ,14770740,1002,82.00,...,ABBAPI,Basic Bundle,1,Mid Stage,Direct,CAD,1.0,82.00,0.0,12.30
2090,07/30/2023,7169789,104516.0,08/08/2023,13:15:00,YOW,YHZ,14770740,1002,82.00,...,ABBAPI,Basic Bundle,1,Mid Stage,Direct,CAD,1.0,82.00,0.0,12.30
2319,07/30/2023,8137354,108774.0,08/03/2023,11:40:00,YYC,YOW,16659966,5,7.12,...,ABBAPI,Air Traveller Security Charge,0,Long Haul,Direct,CAD,1.0,7.12,0.0,0.93
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97444,07/30/2023,7526982,127725.0,08/04/2023,19:15:00,YXU,YVR,15492896,1,12.00,...,ABBAPI,Airport Improvement Fee,0,Long Haul,Direct,CAD,1.0,12.00,0.0,1.56
97528,07/30/2023,8829722,145875.0,08/31/2023,07:00:00,YVR,LAX,18260137,1018,5.15,...,ABBAPI,US APHIS User Fee,0,Sun,Direct,CAD,1.0,5.15,0.0,0.00
97737,07/30/2023,7169789,104516.0,08/08/2023,13:15:00,YOW,YHZ,14770740,1002,82.00,...,ABBAPI,Basic Bundle,1,Mid Stage,Direct,CAD,1.0,82.00,0.0,12.30
97757,07/30/2023,7651807,107482.0,07/31/2023,17:30:00,YYC,YKF,15746526,1002,100.00,...,ABBAPI,Basic Bundle,1,Long Haul,InDirect,CAD,1.0,100.00,0.0,5.00


In [11]:
#snowflake_pd_df = pd.concat([snowflake_pd_df1,snowflake_pd_df2]).reset_index()

In [12]:
#snowflake_pd_df = snowflake_pd_df[snowflake_pd_df['Flight Date'] == "07/30/2023"]

In [13]:

#columns_check = ['Legs Id Nmbr','Charge Date','Total Charge' ,'Charges Desc']

In [14]:
#duplicate_mask = snowflake_pd_df.duplicated(subset=columns_check, keep=False)

# Filter the DataFrame to show only the duplicate rows
#duplicates_df = snowflake_pd_df[duplicate_mask]

In [15]:
#duplicates_df.to_csv("Duplicate_records.csv")

In [16]:
#print( "Duplicate records percentage: ",(len(duplicates_df)/len(snowflake_pd_df))*100,"%")

# For report

In [17]:
# Relative path of the exported report CSV that is compared against Snowflake.
# NOTE(review): the variable name is a misspelling of "report_path"; it is kept
# because the read_csv cell that follows reads this exact name.
# NOTE(review): the "0730" in the file name presumably has to match date1 — confirm.
reprot_path= 'Sales/Sales_Raw_Data_0730.csv'

In [18]:
# Load the exported report; the first line of the file is the header row.
report_df = pd.read_csv(reprot_path)

In [19]:
# Number of rows in the exported report.
report_df.shape[0]

57728

In [20]:
# Work on an explicit copy so the cleaned column is written to an independent
# frame instead of to a slice of report_df (avoids SettingWithCopy ambiguity).
report_select = report_df[['lng_Reservation_Nmbr', 'TotalCharge']].copy()

# Convert currency text to floats: drop '$', and turn a parenthesised amount
# into a negative one, e.g. "($35.00)" -> "-35.00".
# regex=False is required: '$', '(' and ')' are regex metacharacters, and
# pandas versions before 2.0 treat the pattern as a regex by default — '$'
# would then match only the end of the string and a lone '(' or ')' is an
# invalid pattern.
report_select['TotalCharge'] = (
    report_select['TotalCharge']
    .str.replace('$', '', regex=False)
    .str.replace(')', '', regex=False)
    .str.replace('(', '-', regex=False)
    .astype(float)
)

In [21]:
# Aggregate the report per reservation: summed charge and number of rows.
report_by_reservation = report_select.groupby('lng_Reservation_Nmbr')
iot_sum_df = report_by_reservation.sum()
iot_count_df = report_by_reservation.size().to_frame()

# Both frames are indexed by reservation number, so an index join lines them up.
iot_df = iot_sum_df.join(iot_count_df, how='left')
iot_df.columns = ['TotalCharge', 'Count']

In [22]:
# Move the reservation number from the index into a trailing column and fall
# back to a plain positional index.
iot_df = iot_df.assign(**{'Reservation Number': iot_df.index}).reset_index(drop=True)

# Check duplicates for report

In [23]:
# Candidate key columns for a subset-based duplicate check on the report.
# NOTE(review): no active cell visible here reads this list — the duplicate
# check that follows compares all columns.
columns_check = ['lng_Res_Legs_Id_Nmbr','ChargeDate','TotalCharge' ,'str_GL_Charges_Desc']

In [24]:
# Rebinds `duplicates` to the report's exact-duplicate rows (it previously held
# the Snowflake duplicates); the cells that follow read the report version.
is_repeat = report_df.duplicated()
duplicates = report_df[is_repeat]

# Share of the report made up of such repeats, reported as a percentage.
duplicate_share = len(duplicates) / len(report_df)
print( "Exact Duplicate records percentage: ", duplicate_share * 100, "%")

Exact Duplicate records percentage:  0.22172949002217296 %


# Missing records

In [25]:
# Take an explicit copy of the two columns needed for the comparison so the
# dtype conversion below is written to an independent frame rather than to a
# slice of snowflake_pd_df (avoids SettingWithCopy ambiguity).
snow_df = snowflake_pd_df[['Reservation Nmbr', 'Total Charge']].copy()

# Charges must be plain floats before they are summed per reservation.
snow_df['Total Charge'] = snow_df['Total Charge'].astype(float)

In [26]:
# Aggregate the Snowflake rows per reservation: summed charge and number of rows.
snow_by_reservation = snow_df.groupby('Reservation Nmbr')
snow_sum_df = snow_by_reservation.sum()
snow_count_df = snow_by_reservation.size().to_frame()

# Both frames are indexed by reservation number, so an index join lines them up.
snow_result_df = snow_sum_df.join(snow_count_df, how='left')
snow_result_df.columns = ['Total Charge', 'Count']

In [27]:
# Move the reservation number from the index into a trailing column and fall
# back to a plain positional index.
snow_result_df = (
    snow_result_df
    .assign(**{'Reservation Number': snow_result_df.index})
    .reset_index(drop=True)
)

In [28]:
# Relabel the report aggregate with the Snowflake aggregate's column names so
# the merge below can suffix matching names. The relabelling is positional, so
# it relies on both frames holding (charge, count, reservation) in that order.
iot_df.columns = snow_result_df.columns

In [29]:
# Round both per-reservation totals to cents so float summation noise does not
# show up as a charge difference.
for totals_frame in (iot_df, snow_result_df):
    totals_frame['Total Charge'] = totals_frame['Total Charge'].round(2)

In [30]:
# Column totals for the report aggregate. Only 'Total Charge' and 'Count' are
# meaningful; the 'Reservation Number' total is just a sum of identifiers.
iot_df.sum()

Total Charge          2.101863e+06
Count                 5.772800e+04
Reservation Number    6.114632e+10
dtype: float64

In [31]:
# Column totals for the Snowflake aggregate. Only 'Total Charge' and 'Count'
# are meaningful; the 'Reservation Number' total is just a sum of identifiers.
snow_result_df.sum()

Total Charge          2.101717e+06
Count                 9.794500e+04
Reservation Number    8.067182e+10
dtype: float64

In [32]:
#iot_df

In [33]:
# Full outer join of both aggregates on reservation number. A reservation
# present on one side only gets 0 for the other side's charge and count.
missing_in_report = snow_result_df.merge(
    iot_df,
    how='outer',
    on='Reservation Number',
    suffixes=('_snowflake', '_iot'),
).fillna(0)

In [34]:
# Signed gaps are Snowflake minus report, so a positive count gap means
# Snowflake holds more rows for that reservation than the report does.
charge_gap = missing_in_report['Total Charge_snowflake'] - missing_in_report['Total Charge_iot']
count_gap = missing_in_report['Count_snowflake'] - missing_in_report['Count_iot']

missing_in_report['Difference abs Charge'] = charge_gap.abs()
missing_in_report['Difference abs Count'] = count_gap.abs()
missing_in_report['Difference Count'] = count_gap

In [35]:
# Keep reservations where the two sources disagree on charge, on row count, or both.
charge_matches = missing_in_report['Difference abs Charge'] == 0
count_matches = missing_in_report['Difference abs Count'] == 0
missing = missing_in_report[~(charge_matches & count_matches)]

In [36]:
# Reservations that have exact-duplicate rows in the report are excluded, so a
# mismatch caused by duplication is not counted as a missing record.
reservation_number = duplicates['lng_Reservation_Nmbr'].unique()
has_report_duplicate = missing['Reservation Number'].isin(reservation_number)
final_missing_df = missing[~has_report_duplicate]

In [37]:
# Key the mismatch table by reservation number.
final_missing_df = final_missing_df.set_index('Reservation Number')

In [38]:
# Append a 'Sum' row holding the column totals of the mismatch table.
# An existing 'Sum' row (from a previous run of this cell) is left out of the
# total, so re-running the cell overwrites the row instead of doubling it.
final_df_sum = final_missing_df.drop(index='Sum', errors='ignore').sum()
final_missing_df.loc['Sum'] = final_df_sum

In [39]:
#final_missing_df

In [40]:
# Write the per-reservation mismatches (including the 'Sum' row) to the
# current working directory.
final_missing_df.to_csv("final_missing_records.csv")

# Summary Table

In [41]:
# One-row summary of the comparison for date1.
#
# The 'Sum' row appended to final_missing_df is a total, not a reservation, so
# it is excluded before any per-reservation figure is computed. Leaving it in
# would make the report/snowflake split depend on the sign of the net count
# difference, because the row falls into whichever filter its own sign matches.
reservation_rows = final_missing_df.drop(index='Sum', errors='ignore')
abs_count_gap = reservation_rows['Difference abs Count']
signed_count_gap = reservation_rows['Difference Count']

report_row_count = len(report_df)
duplicate_row_count = len(duplicates)
total_missing = abs_count_gap.sum()

df = pd.DataFrame({
    'Date': [date1],
    'Total count of records': [report_row_count],
    'Total charge of records': [report_select['TotalCharge'].sum()],
    'Percentage of duplicate records': [(duplicate_row_count / report_row_count) * 100],
    'Number of duplicate records': [duplicate_row_count],
    'Percentage of missing records': [(total_missing / report_row_count) * 100],
    'Number of missing records': [total_missing],
    # Difference Count is Snowflake minus report: >= 0 means the report is short.
    'Number of missing records in report': [abs_count_gap[signed_count_gap >= 0].sum()],
    'Number of missing records in snowflake': [abs_count_gap[signed_count_gap < 0].sum()],
})

In [42]:
# Display the one-row summary table.
df

Unnamed: 0,Date,Total count of records,Total charge of records,Percentage of duplicate records,Number of duplicate records,Percentage of missing records,Number of missing records,Number of missing records in report,Number of missing records in snowflake
0,07/30/2023,57728,2101863.01,0.221729,128,69.683689,40227.0,40222.0,5.0


In [43]:
# Write the summary table to the current working directory.
df.to_csv("result.csv")