In [1]:
import os
import pandas as pd

In [2]:
# Full path to the folder containing your CSV files
folder_path = r"C:\Users\vig10\OneDrive\Desktop\Final project"

In [3]:
# Dictionary to store DataFrames
dfs = {}

# Iterate over each file in the folder
for file_name in os.listdir(folder_path):
    if file_name.endswith('.csv'):  # Check if the file is a CSV file
        # Read the CSV file into a DataFrame
        file_path = os.path.join(folder_path, file_name)
        df = pd.read_csv(file_path)
        # Store the DataFrame in the dictionary with the file name as part of the key
        dfs[file_name.split('.')[0]] = df  # Use split to remove the '.csv' extension

# Display the shape and basic info of each DataFrame
for file_name, df in dfs.items():
    print(f"File: {file_name}")
    print(f"Shape: {df.shape}")
    print(df.info())  # Basic information about the DataFrame
    print("=" * 50)  # Separating line for clarity

File: olist_customers_dataset
Shape: (99441, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   customer_id               99441 non-null  object
 1   customer_unique_id        99441 non-null  object
 2   customer_zip_code_prefix  99441 non-null  int64 
 3   customer_city             99441 non-null  object
 4   customer_state            99441 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.8+ MB
None
File: olist_geolocation_dataset
Shape: (1000163, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000163 entries, 0 to 1000162
Data columns (total 5 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   geolocation_zip_code_prefix  1000163 non-null  int64  
 1   geolocation_lat              1000163 non-null  float64
 2   ge

In [4]:
# Data cleaning checks for each DataFrame
for file_name, df in dfs.items():
    print(f"File: {file_name}")
    print(f"Shape: {df.shape}")
    print(df.info())  # Basic information about the DataFrame
    print("=" * 50)  # Separating line for clarity

    # Data Completeness Check
    print("Data Completeness Check:")
    print(df.isnull().sum())  # Check for missing values

    # Duplicates Check
    print("Duplicates Check:")
    print("Number of duplicate rows:", df.duplicated().sum())  # Check for duplicates



File: olist_customers_dataset
Shape: (99441, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   customer_id               99441 non-null  object
 1   customer_unique_id        99441 non-null  object
 2   customer_zip_code_prefix  99441 non-null  int64 
 3   customer_city             99441 non-null  object
 4   customer_state            99441 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.8+ MB
None
Data Completeness Check:
customer_id                 0
customer_unique_id          0
customer_zip_code_prefix    0
customer_city               0
customer_state              0
dtype: int64
Duplicates Check:
Number of duplicate rows: 0
File: olist_geolocation_dataset
Shape: (1000163, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000163 entries, 0 to 1000162
Data columns (total 5 columns):
 #   Column  

In [5]:
# Data Completeness Check
order_items_missing_values = dfs['olist_order_items_dataset'].isnull().sum()
print("Missing values in olist_order_items_dataset:")
print(order_items_missing_values)


Missing values in olist_order_items_dataset:
order_id               0
order_item_id          0
product_id             0
seller_id              0
shipping_limit_date    0
price                  0
freight_value          0
dtype: int64


In [6]:
# Data Completeness Check
order_payments_missing_values = dfs['olist_order_payments_dataset'].isnull().sum()
print("Missing values in olist_order_payments_dataset:")
print(order_payments_missing_values)


Missing values in olist_order_payments_dataset:
order_id                0
payment_sequential      0
payment_type            0
payment_installments    0
payment_value           0
dtype: int64


In [7]:
# Duplicates Check
order_items_duplicates = dfs['olist_order_items_dataset'].duplicated().sum()
print("Number of duplicate rows in olist_order_items_dataset:", order_items_duplicates)


Number of duplicate rows in olist_order_items_dataset: 0


In [8]:
# Duplicates Check
order_payments_duplicates = dfs['olist_order_payments_dataset'].duplicated().sum()
print("Number of duplicate rows in olist_order_payments_dataset:", order_payments_duplicates)


Number of duplicate rows in olist_order_payments_dataset: 0


In [9]:
# Outliers Check with olist_orders_dataset
order_items_orders_check = dfs['olist_order_items_dataset']['order_id'].isin(dfs['olist_orders_dataset']['order_id']).all()
print("All order_id values in olist_order_items exist in olist_orders:", order_items_orders_check)


All order_id values in olist_order_items exist in olist_orders: True


In [10]:
# Outliers Consistency Check with olist_products_dataset
order_items_products_check = dfs['olist_order_items_dataset']['product_id'].isin(dfs['olist_products_dataset']['product_id']).all()
print("All product_id values in olist_order_items exist in olist_products:", order_items_products_check)


All product_id values in olist_order_items exist in olist_products: True


In [11]:
# Outliers Consistency Check with olist_sellers_dataset
order_items_sellers_check = dfs['olist_order_items_dataset']['seller_id'].isin(dfs['olist_sellers_dataset']['seller_id']).all()
print("All seller_id values in olist_order_items exist in olist_sellers:", order_items_sellers_check)


All seller_id values in olist_order_items exist in olist_sellers: True


In [12]:
# Count the unique seller IDs in each dataset
order_items_unique_count = len(dfs['olist_order_items_dataset']['seller_id'].unique())
sellers_unique_count = len(dfs['olist_sellers_dataset']['seller_id'].unique())

# Check if all unique seller IDs in olist_order_items match unique seller IDs in olist_sellers
unique_ids_match = order_items_unique_count == sellers_unique_count

print("Count of unique seller IDs in olist_order_items:", order_items_unique_count)
print("Count of unique seller IDs in olist_sellers:", sellers_unique_count)
print("All unique seller IDs in olist_order_items match unique seller IDs in olist_sellers:", unique_ids_match)


Count of unique seller IDs in olist_order_items: 3095
Count of unique seller IDs in olist_sellers: 3095
All unique seller IDs in olist_order_items match unique seller IDs in olist_sellers: True


In [13]:
# Data Consistency Check with olist_orders_dataset including uniqueness
order_items_orders_check = dfs['olist_order_items_dataset']['order_id'].isin(dfs['olist_orders_dataset']['order_id']).all()
order_items_unique_orders_check = dfs['olist_order_items_dataset']['order_id'].nunique() == dfs['olist_orders_dataset']['order_id'].nunique()

print("All order_id values in olist_order_items exist in olist_orders:", order_items_orders_check)
print("Number of unique order IDs in olist_order_items matches olist_orders:", order_items_unique_orders_check)


All order_id values in olist_order_items exist in olist_orders: True
Number of unique order IDs in olist_order_items matches olist_orders: False


In [14]:
# Unique values of 'order_item_id'
unique_order_item_ids = dfs['olist_order_items_dataset']['order_item_id'].unique()

# Unique values of 'payment_sequential'
unique_payment_sequential = dfs['olist_order_payments_dataset']['payment_sequential'].unique()

print("Unique values of 'order_item_id':", unique_order_item_ids)
print("Unique values of 'payment_sequential':", unique_payment_sequential)


Unique values of 'order_item_id': [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21]
Unique values of 'payment_sequential': [ 1  2  4  5  3  8  6  7 10 11 17 19 27 12  9 15 13 14 16 25 22 26 29 28
 18 21 24 23 20]


In [16]:
# Calculate the number of unique order IDs
unique_order_ids = dfs['olist_order_payments_dataset']['order_id'].nunique()
print("Number of unique order IDs in olist_order_payments_dataset:", unique_order_ids)


Number of unique order IDs in olist_order_payments_dataset: 99440


In [17]:
# Number of unique order IDs in olist_order_items_dataset
unique_order_ids_order_items = dfs['olist_order_items_dataset']['order_id'].nunique()

# Number of unique order IDs in olist_order_payments_dataset
unique_order_ids_payments = dfs['olist_order_payments_dataset']['order_id'].nunique()

print(f"Number of unique order IDs in olist_order_items_dataset: {unique_order_ids_order_items}")
print(f"Number of unique order IDs in olist_order_payments_dataset: {unique_order_ids_payments}")


Number of unique order IDs in olist_order_items_dataset: 98666
Number of unique order IDs in olist_order_payments_dataset: 99440


In [18]:
# Number of unique order IDs in olist_orders_dataset
unique_order_ids_orders = dfs['olist_orders_dataset']['order_id'].nunique()

print(f"Number of unique order IDs in olist_orders_dataset: {unique_order_ids_orders}")


Number of unique order IDs in olist_orders_dataset: 99441


In [19]:
# Total number of order IDs in olist_orders_dataset
total_order_ids = len(dfs['olist_orders_dataset']['order_id'])

print(f"Total number of order IDs in olist_orders_dataset: {total_order_ids}")


Total number of order IDs in olist_orders_dataset: 99441


In [20]:


# Get unique order IDs from order items and order payments datasets
order_ids_order_items = set(dfs['olist_order_items_dataset']['order_id'])
order_ids_payments = set(dfs['olist_order_payments_dataset']['order_id'])

# Find missing order IDs
missing_order_ids = order_ids_payments - order_ids_order_items

# Create a DataFrame to store missing order IDs
missing_order_ids_df = pd.DataFrame({'missing_order_id': list(missing_order_ids)})

# Save the DataFrame to a CSV file
missing_order_ids_df.to_csv('missing_order_ids.csv', index=False)

# Display the missing order IDs
print("Missing Order IDs:")
print(missing_order_ids_df)


Missing Order IDs:
                     missing_order_id
0    ef32bb24f4e81a29a305e5285c8d3d34
1    b90cc9e10252911c2092a1e49794aa13
2    186d3cd768be1890d832106f914ba37c
3    a0c1632c3bd45c48bed924a7dae3a664
4    ca9d82b594244464dddfac0959180268
..                                ...
770  a3777b94ef07749f031ade4ae824ddb2
771  0130f0f71fb0e831d18e6a3b33a3a50c
772  43c8d05a1478e794217ad3b39398022a
773  ea844c92cf978ea23321fa7fe5871761
774  791d454dd290baaf30d599c6183d7489

[775 rows x 1 columns]


In [21]:
# Get unique order IDs from order and order payments datasets
order_ids_orders = set(dfs['olist_orders_dataset']['order_id'])
order_ids_payments = set(dfs['olist_order_payments_dataset']['order_id'])

# Find the missing order ID (assuming there's only one)
missing_order_id = order_ids_orders-order_ids_payments

# Print the missing order ID
print("Missing Order ID:", missing_order_id)


Missing Order ID: {'bfbd0f9bdef84302105ad712db648a6c'}


In [15]:


# Merge order items and payments on order_id
merged_df = pd.merge(dfs['olist_order_items_dataset'], dfs['olist_order_payments_dataset'], on='order_id', how='inner')

# Calculate total payment value in olist_order_payments_dataset
total_payment = dfs['olist_order_payments_dataset'].groupby('order_id')['payment_value'].sum()

# Calculate total order value in olist_order_items_dataset (sum of price and freight_value)
total_order_value = merged_df.groupby('order_id').apply(lambda x: (x['price'] + x['freight_value']).sum())

# Compare total payment value with total order value for each order ID
matched_count = 0
unmatched_count = 0
for order_id in merged_df['order_id'].unique():
    payment_value = total_payment.get(order_id, 0)
    order_value = total_order_value.get(order_id, 0)
    if payment_value == order_value:
        matched_count += 1
    else:
        unmatched_count += 1

print(f"Number of order IDs where Payment Value matches Order Value: {matched_count}")
print(f"Number of order IDs where Payment Value does not match Order Value: {unmatched_count}")


Number of order IDs where Payment Value matches Order Value: 76666
Number of order IDs where Payment Value does not match Order Value: 21999


In [22]:
# Merge order items and payments on order_id
merged_df = pd.merge(dfs['olist_order_items_dataset'], dfs['olist_order_payments_dataset'], on='order_id', how='inner')

# Calculate total payment value in olist_order_payments_dataset
total_payment = dfs['olist_order_payments_dataset'].groupby('order_id')['payment_value'].sum()

# Calculate total order value in olist_order_items_dataset (sum of price and freight_value)
total_order_value = merged_df.groupby('order_id').apply(lambda x: (x['price'] + x['freight_value']).sum())

# Get the payment sequential count for each order
payment_sequential_count = dfs['olist_order_payments_dataset'].groupby('order_id')['payment_sequential'].nunique()

# Compare total payment value with total order value for each order ID
unmatched_count = 0
for order_id in merged_df['order_id'].unique():
    payment_value = total_payment.get(order_id, 0)
    order_value = total_order_value.get(order_id, 0)
    if payment_value != order_value and payment_sequential_count.get(order_id, 0) > 1:
        unmatched_count += 1

print(f"Number of orders with unmatched payment and order values and payment sequential count > 1: {unmatched_count}")


Number of orders with unmatched payment and order values and payment sequential count > 1: 2936


In [23]:
# Merge order items and payments on order_id
merged_df = pd.merge(dfs['olist_order_items_dataset'], dfs['olist_order_payments_dataset'], on='order_id', how='inner')

# Calculate total payment value in olist_order_payments_dataset
total_payment = dfs['olist_order_payments_dataset'].groupby('order_id')['payment_value'].sum()

# Calculate total order value in olist_order_items_dataset (sum of price and freight_value)
total_order_value = merged_df.groupby('order_id').apply(lambda x: (x['price'] + x['freight_value']).sum())

# Get the payment type for each order
payment_type = dfs['olist_order_payments_dataset'].groupby('order_id')['payment_type'].first()

# Compare total payment value with total order value for each order ID
unmatched_payment_info = {}
for order_id in merged_df['order_id'].unique():
    payment_value = total_payment.get(order_id, 0)
    order_value = total_order_value.get(order_id, 0)
    if payment_value != order_value:
        payment_type_order = payment_type.get(order_id)
        if payment_type_order not in unmatched_payment_info:
            unmatched_payment_info[payment_type_order] = 1
        else:
            unmatched_payment_info[payment_type_order] += 1

print("Breakdown of payment type for unmatched payments:")
for payment_type, count in unmatched_payment_info.items():
    print(f"{payment_type}: {count}")


Breakdown of payment type for unmatched payments:
credit_card: 15984
boleto: 3947
voucher: 1759
debit_card: 309


In [24]:
# Merge order items and payments on order_id
merged_df = pd.merge(dfs['olist_order_items_dataset'], dfs['olist_order_payments_dataset'], on='order_id', how='inner')

# Calculate total payment value in olist_order_payments_dataset
total_payment = dfs['olist_order_payments_dataset'].groupby('order_id')['payment_value'].sum()

# Calculate total order value in olist_order_items_dataset (sum of price and freight_value)
total_order_value = merged_df.groupby('order_id').apply(lambda x: (x['price'] + x['freight_value']).sum())

# Get the payment type for each order
payment_type = dfs['olist_order_payments_dataset'].groupby('order_id')['payment_type'].first()

# Get the order status for each order
order_status = dfs['olist_orders_dataset'].set_index('order_id')['order_status']

# Get the delivery status for each order
delivery_status = dfs['olist_orders_dataset'].set_index('order_id')['order_status']

# Initialize counters for unmatched payments
unmatched_payment_count = 0

# Initialize dictionaries to store outcomes for each unmatched payment
unmatched_payment_outcomes = {'DELIVERY_STATUS': []}

# Compare total payment value with total order value for each order ID
for order_id in merged_df['order_id'].unique():
    payment_value = total_payment.get(order_id, 0)
    order_value = total_order_value.get(order_id, 0)
    if payment_value != order_value:
        unmatched_payment_count += 1
        order_status_order = order_status.get(order_id)
        delivery_status_order = delivery_status.get(order_id)
        unmatched_payment_outcomes['DELIVERY_STATUS'].append(delivery_status_order)
        
print(f"Number of unmatched payments: {unmatched_payment_count}")
print("Outcomes for unmatched payments:")
for outcome_type, outcomes in unmatched_payment_outcomes.items():
    outcome_counts = pd.Series(outcomes).value_counts()
    print(f"{outcome_type}:")
    for outcome, count in outcome_counts.items():
        print(f"{outcome}: {count}")


Number of unmatched payments: 21999
Outcomes for unmatched payments:
DELIVERY_STATUS:
delivered: 21514
shipped: 227
canceled: 113
invoiced: 84
processing: 60
approved: 1


In [25]:
# Merge order items and payments on order_id
merged_df = pd.merge(dfs['olist_order_items_dataset'], dfs['olist_order_payments_dataset'], on='order_id', how='inner')

# Calculate total payment value in olist_order_payments_dataset
total_payment = dfs['olist_order_payments_dataset'].groupby('order_id')['payment_value'].sum()

# Calculate total order value in olist_order_items_dataset (sum of price and freight_value)
total_order_value = merged_df.groupby('order_id').apply(lambda x: (x['price'] + x['freight_value']).sum())

# Get the count of items for each order ID
order_item_counts = dfs['olist_order_items_dataset']['order_id'].value_counts()

# Initialize counter for unmatched payments and orders appearing more than once
unmatched_payment_count = 0
orders_appearing_more_than_once = 0

# Compare total payment value with total order value for each order ID
for order_id in merged_df['order_id'].unique():
    payment_value = total_payment.get(order_id, 0)
    order_value = total_order_value.get(order_id, 0)
    if payment_value != order_value:
        unmatched_payment_count += 1
        if order_item_counts.get(order_id, 0) > 1:
            orders_appearing_more_than_once += 1

print(f"Number of unmatched payments: {unmatched_payment_count}")
print(f"Number of orders appearing more than once in the order list with unmatched payments: {orders_appearing_more_than_once}")


Number of unmatched payments: 21999
Number of orders appearing more than once in the order list with unmatched payments: 2745


In [26]:
# Merge order items and payments on order_id
merged_df = pd.merge(dfs['olist_order_items_dataset'], dfs['olist_order_payments_dataset'], on='order_id', how='inner')

# Calculate total payment value in olist_order_payments_dataset
total_payment = dfs['olist_order_payments_dataset'].groupby('order_id')['payment_value'].sum()

# Calculate total order value in olist_order_items_dataset (sum of price and freight_value)
total_order_value = merged_df.groupby('order_id').apply(lambda x: (x['price'] + x['freight_value']).sum())

# Initialize counter for unmatched payments and unmatched payments with payment_installments > 1
unmatched_payment_count = 0
unmatched_payment_installments_count = 0

# Compare total payment value with total order value for each order ID
for order_id in merged_df['order_id'].unique():
    payment_value = total_payment.get(order_id, 0)
    order_value = total_order_value.get(order_id, 0)
    if payment_value != order_value:
        unmatched_payment_count += 1
        if dfs['olist_order_payments_dataset'].loc[dfs['olist_order_payments_dataset']['order_id'] == order_id, 'payment_installments'].iloc[0] > 1:
            unmatched_payment_installments_count += 1

print(f"Number of unmatched payments: {unmatched_payment_count}")
print(f"Number of unmatched payments with payment_installments > 1: {unmatched_payment_installments_count}")


Number of unmatched payments: 21999
Number of unmatched payments with payment_installments > 1: 10367


In [27]:
# Filter payments where payment_installments < 1
filtered_payments = dfs['olist_order_payments_dataset'][dfs['olist_order_payments_dataset']['payment_installments'] < 1]

# Get the count of payments with payment_installments < 1
count_payment_installments_less_than_1 = filtered_payments.shape[0]

print("Number of payment_installments < 1:", count_payment_installments_less_than_1)


Number of payment_installments < 1: 2


In [28]:
# Merge order items and payments on order_id
merged_df = pd.merge(dfs['olist_order_items_dataset'], dfs['olist_order_payments_dataset'], on='order_id', how='inner')

# Calculate total payment value in olist_order_payments_dataset
total_payment = dfs['olist_order_payments_dataset'].groupby('order_id')['payment_value'].sum()

# Calculate total order value in olist_order_items_dataset (sum of price and freight_value)
total_order_value = merged_df.groupby('order_id').apply(lambda x: (x['price'] + x['freight_value']).sum())

# Initialize counter for unmatched payments and unmatched payments with payment_installments >= 1
unmatched_payment_count = 0
unmatched_payment_installments_count = 0

# Compare total payment value with total order value for each order ID
for order_id in merged_df['order_id'].unique():
    payment_value = total_payment.get(order_id, 0)
    order_value = total_order_value.get(order_id, 0)
    if payment_value != order_value:
        unmatched_payment_count += 1
        if dfs['olist_order_payments_dataset'].loc[dfs['olist_order_payments_dataset']['order_id'] == order_id, 'payment_installments'].iloc[0] >= 1:
            unmatched_payment_installments_count += 1

print(f"Number of unmatched payments: {unmatched_payment_count}")
print(f"Number of unmatched payments with payment_installments >= 1: {unmatched_payment_installments_count}")


Number of unmatched payments: 21999
Number of unmatched payments with payment_installments >= 1: 21999


In [29]:
# Merge order items and payments on order_id
merged_df = pd.merge(dfs['olist_order_items_dataset'], dfs['olist_order_payments_dataset'], on='order_id', how='inner')

# Calculate total payment value in olist_order_payments_dataset
total_payment = dfs['olist_order_payments_dataset'].groupby('order_id')['payment_value'].sum()

# Calculate total order value in olist_order_items_dataset (sum of price and freight_value)
total_order_value = merged_df.groupby('order_id').apply(lambda x: (x['price'] + x['freight_value']).sum())

# Initialize counter for matched and unmatched payments with payment_installments > 1 and < 1
matched_payment_installments_gt1 = 0
unmatched_payment_installments_gt1 = 0
matched_payment_installments_lt1 = 0
unmatched_payment_installments_lt1 = 0

# Compare total payment value with total order value for each order ID
for order_id in merged_df['order_id'].unique():
    payment_value = total_payment.get(order_id, 0)
    order_value = total_order_value.get(order_id, 0)
    if payment_value == order_value:
        if dfs['olist_order_payments_dataset'].loc[dfs['olist_order_payments_dataset']['order_id'] == order_id, 'payment_installments'].iloc[0] >= 1:
            matched_payment_installments_gt1 += 1
        else:
            matched_payment_installments_lt1 += 1
    else:
        if dfs['olist_order_payments_dataset'].loc[dfs['olist_order_payments_dataset']['order_id'] == order_id, 'payment_installments'].iloc[0] >= 1:
            unmatched_payment_installments_gt1 += 1
        else:
            unmatched_payment_installments_lt1 += 1

print(f"Number of matched payments with payment_installments >= 1: {matched_payment_installments_gt1}")
print(f"Number of matched payments with payment_installments < 1: {matched_payment_installments_lt1}")
print(f"Number of unmatched payments with payment_installments >= 1: {unmatched_payment_installments_gt1}")
print(f"Number of unmatched payments with payment_installments < 1: {unmatched_payment_installments_lt1}")


Number of matched payments with payment_installments >= 1: 76664
Number of matched payments with payment_installments < 1: 2
Number of unmatched payments with payment_installments >= 1: 21999
Number of unmatched payments with payment_installments < 1: 0


In [31]:
import pandas as pd

# Merge order items and payments on order_id
merged_df = pd.merge(dfs['olist_order_items_dataset'], dfs['olist_order_payments_dataset'], on='order_id', how='inner')

# Calculate total payment value in olist_order_payments_dataset
total_payment = dfs['olist_order_payments_dataset'].groupby('order_id')['payment_value'].sum()

# Calculate total order value in olist_order_items_dataset (sum of price and freight_value)
total_order_value = merged_df.groupby('order_id').apply(lambda x: (x['price'] + x['freight_value']).sum())

# Initialize an empty list to store differences
differences = []

# Compare total payment value with total order value for each order ID
for order_id in merged_df['order_id'].unique():
    payment_value = total_payment.get(order_id, 0)
    order_value = total_order_value.get(order_id, 0)
    if payment_value != order_value:
        difference = payment_value - order_value
        differences.append({'order_id': order_id, 'difference': difference})

# Create a DataFrame from the list of differences
differences_df = pd.DataFrame(differences)

# Sort the differences DataFrame based on the "difference" column
differences_df = differences_df.sort_values(by='difference')

# Display head and tail of differences DataFrame
print("Head of differences DataFrame:")
print(differences_df.head())

print("\nTail of differences DataFrame:")
print(differences_df.tail())


Head of differences DataFrame:
                               order_id  difference
21507  fa65dad1b0e818e3ccc5cb0e39231352   -12823.72
6513   4bfcba9e084f46c8e3cb49b0fa6e6159   -10370.64
5989   465c2e1bee4561cb39e0db8c5993aafc    -9673.84
21008  f489949dbe23cf9313deb342913ece0c    -8618.76
6009   4689b1816de42507a7d63a4617383c59    -6884.15

Tail of differences DataFrame:
                               order_id  difference
9638   70b7e94ea46d3e8b5bc12a50186edaf0       61.69
13069  996c7e73600ad3723e8627ab7bef81e4       76.53
9637   70b742795bc441e94a44a084b6d9ce7a      111.89
9431   6e5fe7366a2e1bfbf3257dba0af1267f      119.01
17729  ce6d150fb29ada17d2082f4847107665      182.81


#key summary 

There are 775 missing entries in the olist order_items (take note the one missing id in olist order_payment is present in here)
There is one missing id/entry in the olist order_payment - bfbd0f9bdef84302105ad712db648a6c

The payments tally for 76666
The payments do not tally for 21999

So far data validation checks have shown that: 
The unmatched payment tally were not influenced by factors like interest from installment, the sequential modes of payment, the different payment type made by customer or due to the number of items being purchased. 

Further analysis showed that differences were not minute and were pretty large. 

Shipping Limit date will be dropped from the olist order_items

In [32]:
from sqlalchemy import create_engine

# Define the database connection parameters
db_params = {
    "host": "localhost",
    "dbname": "Fp",       # Replace with your desired database name
    "user": "postgres",       # Replace with your PostgreSQL username
    "password": "admin",      # Replace with your PostgreSQL password
    "port": "5432"            # Replace with your PostgreSQL port
}

# Construct the SQLAlchemy connection string using db_params
connection_string = f"postgresql+psycopg2://{db_params['user']}:{db_params['password']}@{db_params['host']}:{db_params['port']}/{db_params['dbname']}"

# Create the engine
engine = create_engine(connection_string)

In [33]:
#Unnecessary column dropped

dfs['olist_order_items_dataset'] = dfs['olist_order_items_dataset'].drop(columns=['shipping_limit_date'])


In [35]:
#Transfer the DataFrames to your PostgreSQL database
dfs['olist_order_items_dataset'].to_sql('olist_order_items', engine, if_exists='replace', index=False)
dfs['olist_order_payments_dataset'].to_sql('olist_order_payments', engine, if_exists='replace', index=False)
print("Data transfer completed successfully!")

Data transfer completed successfully!
