## Accuracy: 98.33% 

Things to note:
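1. `Purchase Cnt`, `Ancillary Category`, and `Resident Base Discount` differ between the Snowflake view and the report.
2. `Sked Detail Id Nmbr` is null for some Snowflake rows; those rows are filtered out before the comparison.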

In [4]:
from snowflake.snowpark.session import Session
from snowflake.snowpark.types import *
from snowflake.snowpark.functions import col, lit
import getpass
import pandas as pd
import json
from get_secret import get_secret
from datetime import datetime
import snowflake.snowpark.functions as F

In [5]:
# Credentials come from get_secret() as JSON; the warehouse, database, schema
# and log level are fixed here and override any same-named keys in the secret.
connection_parameters = {
    **json.loads(get_secret()),
    'warehouse': 'COMPUTE_WH',
    'database': 'ANALYTICS_PROD',
    'schema': 'IOATAWARE',
    "loglevel": 'DEBUG',
}
session = Session.builder.configs(connection_parameters).create()

In [6]:
# Sanity check: show which database/schema and warehouse the session is using.
print(
    f"Current Database and schema: {session.get_fully_qualified_current_schema()}",
    f"Current Warehouse: {session.get_current_warehouse()}",
    sep="\n",
)

Current Database and schema: "ANALYTICS_PROD"."IOATAWARE"
Current Warehouse: "COMPUTE_WH"


# Import and clean snowflake views

In [7]:
# Name of the Snowflake view to validate (looked up via session.table below).
view_name = "VW_SALES_RAW_DATA"
# Charge date used to filter the view's 'Charge Date' column.
charge_date = "07/30/2023"

In [8]:
# Snowpark handle on the view named by `view_name`.
snowpark_df = session.table(view_name)

In [9]:
# Keep only the rows for the configured charge date, then pull them into pandas.
charge_date_filter = F.col('Charge Date') == charge_date
snowflake_pd_df = snowpark_df.filter(charge_date_filter).to_pandas()
len(snowflake_pd_df)

57905

In [10]:
#snowflake_pd_df = pd.concat([snowflake_pd_df1,snowflake_pd_df2]).reset_index()

In [11]:
# Discard rows where "Sked Detail Id Nmbr" is missing, then show the new row count.
has_sked_detail_id = snowflake_pd_df["Sked Detail Id Nmbr"].notna()
snowflake_pd_df = snowflake_pd_df[has_sked_detail_id]
len(snowflake_pd_df)

57746

In [12]:
#snowflake_pd_df[snowflake_pd_df['Reservation Nmbr'] == 8291127]

In [15]:
# 'Sales Username' is removed below because it is not being tested for now.

In [16]:
# TODO (original note): 'Purchase Cnt', 'Ancillary Category' and
# 'Resident Base Taxes' need to change in the future.
columns_to_remove = ['\tPercent of Full Leg', 'Leg Status', 'Flight Time', 'Sales Username']
snowflake_pd_df = snowflake_pd_df.drop(columns=columns_to_remove)

# Dates become datetime64 values.
for date_column in ('Charge Date', 'Flight Date'):
    snowflake_pd_df[date_column] = pd.to_datetime(snowflake_pd_df[date_column])

# Monetary amounts become plain floats.
for amount_column in ('Net Charge', 'Taxes', 'Total Charge'):
    snowflake_pd_df[amount_column] = snowflake_pd_df[amount_column].astype(float)

# Flight numbers become integers.
snowflake_pd_df['Flight Nmbr'] = snowflake_pd_df['Flight Nmbr'].astype(int)

In [17]:
# Display the column dtypes of the Snowflake frame after the conversions above.
snowflake_pd_df.dtypes

Charge Date               datetime64[ns]
Reservation Nmbr                   int32
Sked Detail Id Nmbr              float64
Flight Date               datetime64[ns]
Departure                         object
Arrival                           object
Legs Id Nmbr                       int32
Charge Type                        int16
Net Charge                       float64
Taxes                            float64
Total Charge                     float64
Charge Type Desc                  object
Flight Nmbr                        int32
Charges Desc                      object
User Id Nmbr                       int16
Leg Nmbr                         float64
Segments Id Nmbr                 float64
Agency Id Nmbr                   float64
Reference                         object
Agency Name                       object
Transborder                       object
Category                          object
Ancillary Category                object
Purchase Cnt                        int8
Classification  

# Import and clean report data

In [18]:
# Load the exported sales report for the same charge date (no rows skipped).
report_df = pd.read_csv('Sales/Sales_Raw_Data_0730.csv')

In [19]:
# Display the raw report column names before any columns are dropped.
report_df.columns 

Index(['Source', 'ChargeDate', 'lng_Reservation_Nmbr',
       'lng_Sked_Detail_Id_Nmbr', 'FlightDate', 'ActualFlightDate',
       'Departure', 'Arrival', 'lng_Res_Legs_Id_Nmbr', 'ChargeType',
       'NetCharge', 'Taxes', 'mny_Tax_1_Percentage', 'TotalCharge',
       'str_GL_Charge_Type_Desc', 'str_Flight_Nmbr', 'Percent',
       'str_GL_Charges_Desc', 'lng_Creation_User_Id_Nmbr', 'lng_Leg_Nmbr',
       'lng_Res_Segments_Id_Nmbr', 'lng_Agency_Id_Nmbr', 'Reference',
       'str_Agency_Name', 'Transborder', 'str_Leg_Status', 'Category',
       'LastMod_SalesUser', 'AncillaryCategory', 'PurchaseCnt',
       'Classification', 'Channel', 'str_Currency_Ident1', 'Exchange_Rate',
       'Base_Charge', 'Base_Discount', 'Base_Taxes', 'CancellationDate',
       'PaxStatus', 'ChargeStatus', 'SeatSold'],
      dtype='object')

In [20]:
def process_fee_column(value):
    """Convert an accounting-formatted currency cell to a float.

    Strips the ``$`` sign, thousands separators and surrounding whitespace,
    and treats a parenthesised amount as negative, e.g. ``"($1,234.50)"``
    becomes ``-1234.5``.

    Args:
        value: The cell value. Usually a string such as ``"$12.50"``, but
            numeric cells and missing values (``None`` / NaN) are accepted too.

    Returns:
        The amount as a float; NaN when the cell is missing.

    Raises:
        ValueError: If a string cell is not a number after the currency
            formatting has been removed.
    """
    if value is None:
        return float('nan')
    # Non-string cells (already numeric, or NaN for a blank cell) have no
    # currency formatting to strip; calling .replace on them would fail.
    if not isinstance(value, str):
        return float(value)

    cleaned = value.strip()
    cleaned = cleaned.replace('$', '')   # Remove $
    cleaned = cleaned.replace(',', '')   # Remove thousands separators
    cleaned = cleaned.replace(')', '')   # Remove )
    cleaned = cleaned.replace('(', '-')  # Replace ( with -
    return float(cleaned)

In [21]:
# Original note listed: 'LastMod_SalesUser', 'PurchaseCnt', 'AncillaryCategory', 'Base_Taxes'
# (presumably columns still to be reviewed — TODO confirm).
# Report columns that are not part of the comparison.
columns_to_remove = ['Source','mny_Tax_1_Percentage','ActualFlightDate','Percent','str_Leg_Status','LastMod_SalesUser','CancellationDate','PaxStatus', 'ChargeStatus', 'SeatSold']
report_df = report_df.drop(columns=columns_to_remove)

columns_to_process = ['NetCharge', 'Taxes','TotalCharge','Base_Charge','Base_Discount','Base_Taxes']

# Convert the currency-formatted columns to floats.
# The loop variable is deliberately not named `col`: that name is imported from
# snowflake.snowpark.functions at the top of the notebook and would be
# overwritten with a plain string for every later cell.
for fee_column in columns_to_process:
    report_df[fee_column] = report_df[fee_column].apply(process_fee_column)

# Parse the date columns and truncate them to midnight. This yields the same
# values as parsing, formatting as '%m/%d/%Y' and parsing again, without the
# string round trip.
for date_column in ('ChargeDate', 'FlightDate'):
    report_df[date_column] = pd.to_datetime(report_df[date_column]).dt.normalize()

  report_df['ChargeDate'] = pd.to_datetime(report_df['ChargeDate'])


In [22]:
# Row count of the cleaned report, for comparison with the Snowflake row count.
len(report_df)

57728

In [23]:
# Display the report column dtypes after cleaning.
report_df.dtypes

ChargeDate                   datetime64[ns]
lng_Reservation_Nmbr                  int64
lng_Sked_Detail_Id_Nmbr               int64
FlightDate                   datetime64[ns]
Departure                            object
Arrival                              object
lng_Res_Legs_Id_Nmbr                  int64
ChargeType                            int64
NetCharge                           float64
Taxes                               float64
TotalCharge                         float64
str_GL_Charge_Type_Desc              object
str_Flight_Nmbr                       int64
str_GL_Charges_Desc                  object
lng_Creation_User_Id_Nmbr             int64
lng_Leg_Nmbr                          int64
lng_Res_Segments_Id_Nmbr              int64
lng_Agency_Id_Nmbr                    int64
Reference                            object
str_Agency_Name                      object
Transborder                          object
Category                             object
AncillaryCategory               

In [24]:
# Relabel the report columns with the Snowflake column names, by position.
# NOTE(review): this assumes both frames hold the same columns in the same
# order at this point — confirm whenever either drop list changes.
report_df = report_df.set_axis(snowflake_pd_df.columns, axis=1)

# Compare two dataframes

In [26]:
def calculate_df2_accuracy(df1, df2):
    """Return the percentage of ``df2`` rows that also appear in ``df1``.

    The frames are outer-merged with ``indicator=True``; rows flagged
    ``right_only`` are the ``df2`` rows with no match in ``df1``.

    Args:
        df1: Reference DataFrame.
        df2: DataFrame whose rows are checked against ``df1``.

    Returns:
        Accuracy as a float percentage in [0, 100], or NaN when ``df2`` has
        no rows (the ratio is undefined and would otherwise divide by zero).
    """
    total_elements = len(df2)
    if total_elements == 0:
        return float("nan")

    merged = df1.merge(df2, how='outer', indicator=True)
    df2_only = merged[merged['_merge'] == "right_only"]

    # Accuracy = share of df2 rows that found a match, as a percentage.
    return ((total_elements - len(df2_only)) / total_elements) * 100


In [27]:
# Share of Snowflake rows that have an exact match in the report.
accuracy_percentage = calculate_df2_accuracy(report_df, snowflake_pd_df)

# `snowflake_only` was referenced here without ever being assigned, so the cell
# failed on a fresh kernel. Build it explicitly: rows flagged 'right_only' in
# the outer merge exist in the Snowflake frame but not in the report. The
# constant '_merge' indicator column is dropped.
snowflake_vs_report = report_df.merge(snowflake_pd_df, how='outer', indicator=True)
snowflake_only = snowflake_vs_report[snowflake_vs_report['_merge'] == "right_only"].drop(columns='_merge')

print(f"Accuracy: {accuracy_percentage:.2f}%" " snowflake records in report\nThe number of records in snowflake but not in report is", len(snowflake_only))

Accuracy: 94.16% snowflake records in report
The number of records in snowflake but not in report is 3375


In [28]:
# Share of report rows that have an exact match in the Snowflake frame.
accuracy_percentage = calculate_df2_accuracy(snowflake_pd_df, report_df)

# `report_only` was referenced here without ever being assigned, so the cell
# failed on a fresh kernel. Build it explicitly: rows flagged 'right_only' in
# the outer merge exist in the report but not in the Snowflake frame. The
# constant '_merge' indicator column is dropped.
report_vs_snowflake = snowflake_pd_df.merge(report_df, how='outer', indicator=True)
report_only = report_vs_snowflake[report_vs_snowflake['_merge'] == "right_only"].drop(columns='_merge')

print(f"Accuracy: {accuracy_percentage:.2f}%"" report records in snowflake\nThe number of records in report but not in snowflake is", len(report_only))

Accuracy: 94.19% report records in snowflake
The number of records in report but not in snowflake is 3356


# Output the difference files

In [29]:
# Write the rows found in only one source to CSV.
# Only NameError is caught (the frame was never computed in this session).
# A bare `except:` also hid real failures, such as a missing "Sales/" folder
# or a permission error, behind the "No ... records" message.
try:
    snowflake_only.to_csv("Sales/"+"snowflake_only_record.csv", index=False)
except NameError:
    print("No snowflake only records")
try:
    report_only.to_csv("Sales/"+"report_only_record.csv", index=False)
except NameError:
    print("No report only records")

In [27]:
# Distinct reservation numbers present in each source.
snow_res = [*snowflake_pd_df['Reservation Nmbr'].unique()]
report_res = [*report_df['Reservation Nmbr'].unique()]

In [28]:
# Reservation numbers that appear in only one of the two sources.
# Sorted so the output files are identical from run to run (set order is arbitrary).
elements_in_snow_not_in_report = sorted(set(snow_res) - set(report_res))
elements_in_report_not_in_snow = sorted(set(report_res) - set(snow_res))

snow_file_path = "Sales/snowonly_reservation.txt"
report_file_path = "Sales/reportonly_reservation.txt"

# Each file is a flat list in which every reservation number is followed by a comma.
with open(snow_file_path, "w") as file:
    for item in elements_in_snow_not_in_report:
        file.write("%s," % item)
# Fix: this file previously received the snow-only list a second time, so the
# report-only reservations were never written anywhere.
with open(report_file_path, "w") as file:
    for item in elements_in_report_not_in_snow:
        file.write("%s," % item)