In [8]:
import getpass
import pandas as pd
import json
from get_secret import get_secret
import snowflake.connector

# Import and clean Snowflake view data

In [22]:
# Connection settings come from get_secret() (parsed as JSON); the session
# context and log level used by this comparison are layered on top.
config_json = json.loads(get_secret())
config_json.update({'warehouse':'COMPUTE_WH', 'database':'ANALYTICS_PROD','schema': 'IOATAWARE',"loglevel":'DEBUG'})

# Rows of the detail view for the single flight being validated.
# For a quick smoke test, replace the WHERE clause with `limit 5`.
query = """
select * from ANALYTICS_PROD.IOATAWARE.VW_AIF_REPORT_DETAIL where "Flight Date"='2023/06/03' and "Flight ID" = 111554
"""

# Keep the connection and cursor open only for the duration of the query so the
# Snowflake session is always released, even if execute/fetch raises.
conn = snowflake.connector.connect(**config_json)
try:
    cur = conn.cursor()
    try:
        cur.execute(query)
        # Materialise the full result set as a pandas DataFrame.
        results_df = cur.fetch_pandas_all()
    finally:
        cur.close()
finally:
    conn.close()

In [26]:
# Work on a copy: the cleaning cells below assign columns in place, and a plain
# alias would silently rewrite the raw query result held in results_df as well.
snowflake_pd_df = results_df.copy()

In [27]:
# Join the date and time text columns into one string and parse it.
flight_ts = pd.to_datetime(
    snowflake_pd_df['Flight Date'] + ' ' + snowflake_pd_df['Flight time']
)
snowflake_pd_df['combined_timestamp'] = flight_ts

# Label the parsed timestamps as UTC, then express them in US/Mountain time.
flight_ts_mt = flight_ts.dt.tz_localize('UTC').dt.tz_convert('US/Mountain')
snowflake_pd_df['combined_timestamp_mt'] = flight_ts_mt

# Replace the original flight date with the calendar date in Mountain time.
snowflake_pd_df['Flight Date'] = flight_ts_mt.dt.date

In [29]:
# Columns excluded from the comparison (including Charge Date) plus the helper
# timestamp columns created earlier.
columns_to_remove = ['First Name','Flight ID', 'Last Name','Charge Date', 'combined_timestamp','combined_timestamp_mt','Flight time']

# Drop the excluded columns, then normalise the dtypes of the remaining ones in
# a single chain: dates to datetime64, identifiers to int, charge to float.
snowflake_pd_df = (
    snowflake_pd_df
    .drop(columns=columns_to_remove)
    .assign(**{'Flight Date': lambda frame: pd.to_datetime(frame['Flight Date'])})
    .astype({'Reservation Number': int, 'Leg Number': int, 'Net Charge': float})
)

In [30]:
# Display the per-column dtypes of the cleaned Snowflake frame as a sanity check.
snowflake_pd_df.dtypes

Flight Date           datetime64[ns]
Departure                     object
Arrival                       object
Reservation Number             int32
Leg Number                     int32
PNR                           object
Check In Boarded              object
Status                        object
Charge Type                   object
Net Charge                   float64
Leg Start                     object
Leg End                       object
dtype: object

# Import and clean report data

In [31]:
# Load the exported report for the same flight. The first three lines of the
# file are skipped (presumably a report preamble ahead of the header row —
# TODO confirm against the export format).
report_df = pd.read_csv(
    'AIF_DETAIL/AIF_Report_Detail_111554.csv',
    skiprows=3,
)

In [32]:
# Drop the report columns that are excluded from the comparison.
columns_to_remove = ['str_First_Name', 'str_Last_Name','ChargeDate']
report_df = report_df.drop(columns=columns_to_remove)

# Normalise dtypes: flight date to datetime64, identifiers to int.
report_df['FlightDate1'] = pd.to_datetime(report_df['FlightDate1'])
report_df['lng_Reservation_Nmbr'] = report_df['lng_Reservation_Nmbr'].astype(int)
report_df['lng_Leg_Nmbr'] = report_df['lng_Leg_Nmbr'].astype(int)

# NetCharge is currency text where parentheses mark a negative amount,
# e.g. "($5.00)" -> -5.00. Strip the symbols literally before casting:
#   * regex=False is explicit because '$', '(' and ')' are regex metacharacters
#     and the default value of `regex` differs between pandas versions;
#   * ',' is removed so thousands separators do not break the float cast.
net_charge = report_df['NetCharge']
for token, replacement in (('$', ''), (',', ''), (')', ''), ('(', '-')):
    net_charge = net_charge.str.replace(token, replacement, regex=False)
report_df['NetCharge'] = net_charge.astype(float)

# Compare two dataframes

In [35]:
# Give the Snowflake frame the report's column names so the merges below, which
# join on all shared column names, compare every column.
# NOTE(review): this is a purely positional rename — it assumes both frames hold
# the same columns in the same order; confirm if either source's layout changes.
snowflake_pd_df.columns = report_df.columns

In [36]:
# Outer-join the two frames on all shared columns; the `_merge` indicator
# records which side(s) each row came from.
merged = pd.merge(snowflake_pd_df, report_df, how='outer', indicator=True)
merge_side = merged['_merge']

# Split the joined rows by origin.
snowflake_only = merged.loc[merge_side == "left_only"]
common = merged.loc[merge_side == "both"]
report_only = merged.loc[merge_side == "right_only"]

In [41]:
def calculate_df2_accuracy(df1, df2):
    """Measure how many rows of each frame also appear in the other.

    The frames are outer-merged on all shared columns, so a row only counts as
    matched when every shared column is equal.

    Args:
        df1: Left DataFrame.
        df2: Right DataFrame.

    Returns:
        A 4-tuple ``(accuracy_df1, accuracy_df2, df1_only, df2_only)``:
          * accuracy_df1: percentage (0-100) of df1 rows that have a match in
            df2, or NaN when df1 has no rows.
          * accuracy_df2: percentage (0-100) of df2 rows that have a match in
            df1, or NaN when df2 has no rows.
          * df1_only: merged rows found only in df1 (includes ``_merge``).
          * df2_only: merged rows found only in df2 (includes ``_merge``).
    """
    merged = df1.merge(df2, how='outer', indicator=True)
    df1_only = merged[merged['_merge'] == "left_only"]
    df2_only = merged[merged['_merge'] == "right_only"]

    def _matched_percentage(total_rows, unmatched_rows):
        # A percentage of zero rows is undefined; return NaN instead of
        # raising ZeroDivisionError so an empty input is reported, not fatal.
        if total_rows == 0:
            return float('nan')
        return ((total_rows - unmatched_rows) / total_rows) * 100

    accuracy_df1 = _matched_percentage(len(df1), len(df1_only))
    accuracy_df2 = _matched_percentage(len(df2), len(df2_only))

    return accuracy_df1, accuracy_df2, df1_only, df2_only


In [42]:
# The report is passed as df1 and Snowflake as df2, so the first accuracy and
# first "only" frame describe the report and the second pair describe Snowflake.
(
    report_accuracy,
    snowflake_accuracy,
    report_only,
    snowflake_only,
) = calculate_df2_accuracy(df1=report_df, df2=snowflake_pd_df)

In [43]:
# Summarise each direction of the comparison: match percentage followed by the
# count of unmatched rows.
print(
    f"Accuracy: {snowflake_accuracy:.2f}% snowflake records in report\n"
    "The number of records in snowflake but not in report is",
    len(snowflake_only),
)
print(
    f"Accuracy: {report_accuracy:.2f}% report records in snowflake\n"
    "The number of records in report but not in snowflake is",
    len(report_only),
)

Accuracy: 100.00% snowflake records in report
The number of records in snowflake but not in report is 0
Accuracy: 100.00% report records in snowflake
The number of records in report but not in snowflake is 0


# Output the difference files

In [29]:
# Write each set of unmatched rows to its own CSV.
# DataFrame.to_csv does not raise for an empty frame (it writes a header-only
# file), so emptiness is checked explicitly rather than inferred from an
# exception. The file is still written when empty so a stale diff from a
# previous run is overwritten. Only genuine write failures are caught, and they
# are reported as such instead of being mislabelled as "no records".
for label, unmatched, out_path in (
    ("snowflake", snowflake_only, "snowflake_only_record.csv"),
    ("report", report_only, "report_only_record.csv"),
):
    if unmatched.empty:
        print(f"No {label} only records")
    try:
        unmatched.to_csv(out_path, index=False)
    except OSError as exc:
        print(f"Could not write {out_path}: {exc}")