In [251]:
from snowflake.snowpark.session import Session
from snowflake.snowpark.types import *
from snowflake.snowpark.functions import col, lit
import getpass
import pandas as pd
import json
from get_secret import get_secret
from datetime import datetime
import snowflake.snowpark.functions as F

In [252]:
# Parse the JSON returned by get_secret() and layer the session context
# (warehouse, database, schema, log level) on top of it.
connection_parameters = json.loads(get_secret())
connection_parameters.update(
    {
        'warehouse': 'COMPUTE_WH',
        'database': 'ANALYTICS_PROD',
        'schema': 'IOATAWARE',
        "loglevel": 'DEBUG',
    }
)

# Open the Snowpark session used by every later cell.
session_builder = Session.builder.configs(connection_parameters)
session = session_builder.create()

In [253]:
# Echo the active database/schema and warehouse as a sanity check on the connection.
current_schema = session.get_fully_qualified_current_schema()
print(f"Current Database and schema: {current_schema}")
current_warehouse = session.get_current_warehouse()
print(f"Current Warehouse: {current_warehouse}")

Current Database and schema: "ANALYTICS_PROD"."IOATAWARE"
Current Warehouse: "COMPUTE_WH"


In [254]:
import warnings

# Ignore all warnings
warnings.filterwarnings('ignore', category=Warning)

# Snowflake view data

In [255]:
# Source view queried through Snowpark.
view_name = "VW_REVENUE_RAW_DATA"

# Flight dates (MM/DD/YYYY text) used to filter the view's 'Flight Date' column.
flight_date1, flight_date2 = "07/30/2023", "07/31/2023"

In [256]:
# Snowpark DataFrame over the view named by view_name.
snowpark_df = session.table(view_name)

In [257]:
# Pull the rows for each requested flight date into pandas, one query per date.
snowflake_pd_df1 = snowpark_df.filter(F.col('Flight Date') == flight_date1).to_pandas()
snowflake_pd_df2 = snowpark_df.filter(F.col('Flight Date') == flight_date2).to_pandas()

# Stack both results. ignore_index=True renumbers the rows 0..n-1 directly;
# a bare reset_index() would instead keep the old per-query row numbers
# around as an extra "index" column.
snowflake_pd_df = pd.concat([snowflake_pd_df1, snowflake_pd_df2], ignore_index=True)

In [258]:
# Rebuild 'Flight Date' as a midnight timestamp taken from the date part of 'Flight MST Date'.
snowflake_pd_df['Flight Date'] = snowflake_pd_df['Flight MST Date'].dt.date
snowflake_pd_df['Flight Date'] = pd.to_datetime(snowflake_pd_df['Flight Date'], format='%Y-%m-%d')

# Keep only the first requested flight date. The target is parsed from
# flight_date1 (MM/DD/YYYY) instead of being repeated as a separate date
# literal, so changing the configured date cannot leave this filter stale.
target_flight_date = pd.to_datetime(flight_date1, format='%m/%d/%Y')
snowflake_pd_df = snowflake_pd_df[snowflake_pd_df['Flight Date'] == target_flight_date]

# Check for duplicates in the Snowflake data

In [259]:
# Columns that together identify a charge row in the Snowflake data;
# rows that agree on all of them are treated as duplicates.
columns_check = [
    'Legs Id Nmbr',
    'Charge Date',
    'Total Charge',
    'Charges Desc',
]

In [260]:
# keep=False marks every member of a duplicate group, not only the repeats.
duplicate_mask = snowflake_pd_df.duplicated(
    subset=columns_check,
    keep=False,
)

# Rows of the Snowflake data that have at least one twin on columns_check.
duplicates_df = snowflake_pd_df.loc[duplicate_mask]

In [261]:
#duplicates_df.to_csv("Duplicate_records.csv")

In [262]:
# Share of Snowflake rows that belong to a duplicate group, as a percentage.
snowflake_duplicate_pct = (len(duplicates_df) / len(snowflake_pd_df)) * 100
print("Duplicate records percentage: ", snowflake_duplicate_pct, "%")

Duplicate records percentage:  2.855145656320444 %


# Report data

In [263]:
# Load the report extract; the file is read from its first line (no rows skipped).
report_df = pd.read_csv('Revenue/Revenue_Raw_Data_0730.csv')

In [264]:
#report_df.columns

In [265]:
# Take an explicit copy so the column assignment below modifies report_select
# itself rather than a slice of report_df (the source of SettingWithCopyWarning).
report_select = report_df[['lng_Reservation_Nmbr', 'TotalCharge']].copy()

# Turn currency text into a signed float: "$" and ")" are removed and "(" becomes
# "-", so text such as "($12.34)" parses as -12.34.
# regex=False forces literal matching on every pandas version; read as regular
# expressions, "$" is an end-of-string anchor and a lone "(" or ")" is invalid.
report_select['TotalCharge'] = (
    report_select['TotalCharge']
    .str.replace('$', '', regex=False)
    .str.replace(')', '', regex=False)
    .str.replace('(', '-', regex=False)
    .astype(float)
)

In [266]:
# Group the report charges once per reservation, then derive both aggregates from it.
charges_by_reservation = report_select.groupby(by=['lng_Reservation_Nmbr'])

# Number of charge rows per reservation (single unnamed column).
iot_count_df = charges_by_reservation.size().to_frame()
# Summed charge per reservation.
iot_sum_df = charges_by_reservation.sum()

In [267]:
# Attach the per-reservation row count to the per-reservation charge total.
iot_df = iot_sum_df.join(
    iot_count_df,
    on='lng_Reservation_Nmbr',
    how='left',
    lsuffix='sum',
)

In [268]:
# Positional rename: the first column becomes 'TotalCharge', the second 'Count'.
iot_df.columns = ['TotalCharge','Count']

# Check for duplicates in the report data

In [270]:
# Report-side columns used to detect duplicate charge rows.
# This rebinds columns_check, replacing the Snowflake column list.
columns_check = [
    'lng_Res_Legs_Id_Nmbr',
    'ChargeDate',
    'TotalCharge',
    'str_GL_Charges_Desc',
]

In [271]:
# keep=False marks every member of a duplicate group, not only the repeats.
duplicate_mask = report_df.duplicated(
    subset=columns_check,
    keep=False,
)

# Rows of the report that have at least one twin on columns_check.
# This rebinds duplicates_df, replacing the Snowflake duplicates.
duplicates_df = report_df.loc[duplicate_mask]

In [272]:
#duplicates_df

In [45]:
#duplicates_df.to_csv("Duplicate_report_records.csv")

In [273]:
# Share of report rows that belong to a duplicate group, as a percentage.
report_duplicate_pct = (len(duplicates_df) / len(report_df)) * 100
print("Duplicate records percentage: ", report_duplicate_pct, "%")

Duplicate records percentage:  1.0773604668562022 %


# Missing records

In [275]:
# Take an explicit copy so the dtype conversion below modifies snow_df itself
# rather than a slice of snowflake_pd_df (the source of SettingWithCopyWarning).
snow_df = snowflake_pd_df[['Reservation Nmbr', 'Total Charge']].copy()

# Cast the charge to float before aggregating.
snow_df['Total Charge'] = snow_df['Total Charge'].astype(float)

# Per-reservation row count and summed charge.
snow_count_df = snow_df.groupby(by=['Reservation Nmbr']).size().to_frame()
snow_sum_df = snow_df.groupby(by=['Reservation Nmbr']).sum()

# Attach the count to the total, one row per reservation.
snow_result_df = snow_sum_df.join(snow_count_df, on='Reservation Nmbr', how='left', lsuffix='sum')

# Positional rename of the two resulting columns, then round the total to cents.
snow_result_df.columns = ['Total Charge', 'Count']
snow_result_df['Total Charge'] = snow_result_df['Total Charge'].round(2)

In [276]:
# Move the reservation number out of the index into a trailing 'Reservation Number'
# column. The guard keeps the cell safe to re-run: once the index has been reset,
# running it again would overwrite the key with the row numbers 0..n-1.
if 'Reservation Number' not in snow_result_df.columns:
    snow_result_df['Reservation Number'] = snow_result_df.index
    snow_result_df = snow_result_df.reset_index(drop=True)

In [277]:
# Move the reservation number out of the index into a trailing 'Reservation Number'
# column. The guard keeps the cell safe to re-run: once the index has been reset,
# running it again would overwrite the key with the row numbers 0..n-1.
if 'Reservation Number' not in iot_df.columns:
    iot_df['Reservation Number'] = iot_df.index
    iot_df = iot_df.reset_index(drop=True)

# Align the charge column name with the Snowflake frame by an explicit rename,
# instead of copying snow_result_df's column labels over by position.
iot_df = iot_df.rename(columns={'TotalCharge': 'Total Charge'})

In [282]:
# Round both per-reservation totals to two decimals so they compare at cent precision.
iot_df['Total Charge'] = iot_df['Total Charge'].round(decimals=2)
snow_result_df['Total Charge'] = snow_result_df['Total Charge'].round(decimals=2)

In [283]:
# Replace any missing values with 0 in both frames before they are merged.
iot_df, snow_result_df = iot_df.fillna(0), snow_result_df.fillna(0)

In [284]:
# Left join from the Snowflake side: every Snowflake reservation is kept, and
# reservations that exist only in iot_df do not appear in the result.
left_join_df = pd.merge(
    snow_result_df,
    iot_df,
    on='Reservation Number',
    how='left',
    suffixes=('_snowflake', '_iot'),
)

In [287]:
# Snowflake minus report, for both the summed charge and the row count.
charge_gap = left_join_df['Total Charge_snowflake'] - left_join_df['Total Charge_iot']
left_join_df['Difference Charge'] = charge_gap
count_gap = left_join_df['Count_snowflake'] - left_join_df['Count_iot']
left_join_df['Difference Count'] = count_gap

In [288]:
# Reservations whose charge totals disagree. A NaN difference also compares
# as "not equal to 0", so rows without a report match are kept as well.
missing = left_join_df[left_join_df['Difference Charge'].ne(0)]

In [289]:
# Distinct reservation numbers found in duplicates_df.
# NOTE(review): duplicates_df is assigned by both the Snowflake and the report
# duplicate checks; the 'lng_Reservation_Nmbr' column implies the report version
# is expected here - confirm the cells ran in order.
reservation_number = duplicates_df['lng_Reservation_Nmbr'].unique()

In [290]:
# Drop the mismatched reservations whose number is listed in reservation_number.
final_df = missing[~missing['Reservation Number'].isin(reservation_number)]

In [291]:
# TODO: confirm with Jeremy which table should be on the left side of the join.
final_df

Unnamed: 0,Total Charge_snowflake,Count_snowflake,Reservation Number,Total Charge_iot,Count_iot,Difference Charge,Difference Count
3,189.46,30,2656739,90.01,8.0,99.45,22.0
7,160.81,10,3485044,,,,
8,507.27,42,3598360,320.21,18.0,187.06,24.0
9,474.27,60,3598556,193.68,24.0,280.59,36.0
13,0.00,110,4049975,-1034.90,85.0,1034.90,25.0
...,...,...,...,...,...,...,...
17759,0.00,10,8831540,,,,
17760,0.00,6,8831561,,,,
17769,0.00,20,8831842,,,,
17777,0.00,8,8832374,,,,
