<h1>Data Cleaning</h1>

# Imports

In [None]:
import sys
from datetime import datetime
from pathlib import Path

import pandas as pd

sys.path.insert(0, r"C:\Users\vynde\PycharmProjects\dataanalysis")
from databridger import Database

# Issue Tracking

In [106]:
class IssueTracker:
    """In-memory log of data-quality issues found while working through the notebook.

    Every issue is a plain dict appended to ``self.issues``; the list is never
    reordered or shrunk by this class, so an issue's ID reflects its position.
    """

    def __init__(self):
        # Issue dicts in insertion order
        self.issues = []

    def add_issue(self, description, impact, potential_cause=None, relevant_data=None, notes=None):
        """Append a new issue with status "Open" and the current local time.

        Args:
            description: What was observed.
            impact: Consequence of the issue for the analysis.
            potential_cause: Optional suspected cause.
            relevant_data: Optional pointer to the affected data.
            notes: Optional free-form remarks.
        """
        issue = {
            # IDs are 1-based and derived from the current number of issues
            "ID": f"Issue{len(self.issues)+1}",
            "Description": description,
            "Status": "Open",
            "Resolution": None,
            "Impact": impact,
            "Potential Cause": potential_cause,
            "Relevant Data": relevant_data,
            # `datetime` comes from `from datetime import datetime` in the import cell
            "Timestamp": datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            "Notes": notes
        }
        self.issues.append(issue)

    def resolve_issue(self, issue_id, resolution):
        """Mark the first issue whose ID equals ``issue_id`` as resolved.

        Args:
            issue_id: ID string as generated by ``add_issue`` (e.g. "Issue1").
            resolution: Description of how the issue was resolved.

        Returns:
            True if a matching issue was found and updated, False otherwise
            (so a mistyped ID no longer goes unnoticed).
        """
        for issue in self.issues:
            if issue["ID"] == issue_id:
                issue["Status"] = "Resolved"
                issue["Resolution"] = resolution
                return True
        return False

    def display_issues(self):
        """Render all recorded issues as a DataFrame in the notebook output."""
        df = pd.DataFrame(self.issues)
        # `display` is the rich-display function IPython provides in notebook kernels
        display(df)

# create instance
issue_tracker = IssueTracker()

# Resources

In [2]:
# Input and output directories, given as relative paths
raw_data_folder = Path('..', 'data', 'raw')
report_folder = Path('..', 'reports')

# Target paths for the Excel exports, both located in the report folder
database_summary_file = report_folder.joinpath("database_summary.xlsx")
overlap_ratio_file = report_folder.joinpath("column_overlap_ratios.xlsx")

---

# Load Data

New method for loading a file-based database

In [None]:
# Build a Database (imported from the project-local databridger package) over the raw-data folder.
# NOTE(review): "csv" presumably selects a CSV-file backend — confirm against databridger.
db_raw = Database("csv", raw_data_folder)
# Last expression of the cell, so its value (the table_mapping attribute) is shown as output.
db_raw.table_mapping

Inspect file names

In [67]:
# Print the path of every CSV file found directly inside the raw-data folder
csv_paths = raw_data_folder.glob('*.csv')
for raw_csv_path in csv_paths:
    print(raw_csv_path)

..\data\raw\olist_customers_dataset.csv
..\data\raw\olist_geolocation_dataset.csv
..\data\raw\olist_orders_dataset.csv
..\data\raw\olist_order_items_dataset.csv
..\data\raw\olist_order_payments_dataset.csv
..\data\raw\olist_order_reviews_dataset.csv
..\data\raw\olist_products_dataset.csv
..\data\raw\olist_sellers_dataset.csv
..\data\raw\product_category_name_translation.csv


Load all data sets into a database (dictionary of dataframes)

In [70]:
# The "database": short table name -> DataFrame, one entry per CSV file.
# The table name is the file stem without the "olist_" / "_dataset" decorations,
# e.g. "olist_customers_dataset" -> "customers".
db = {
    csv_path.stem.replace("olist_", "").replace("_dataset", ""): pd.read_csv(csv_path)
    for csv_path in raw_data_folder.glob('*.csv')
}

db.keys()

dict_keys(['customers', 'geolocation', 'orders', 'order_items', 'order_payments', 'order_reviews', 'products', 'sellers', 'product_category_name_translation'])

Convert text columns whose values all match the format `yyyy-mm-dd HH:MM:SS` to datetime

In [48]:
# A text column is converted only when every non-missing value is *exactly* of the
# form "yyyy-mm-dd HH:MM:SS".
DATETIME_PATTERN = r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}"
DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"

for key in db:
    for column in db[key].columns:
        series = db[key][column]
        if series.dtype != 'object':
            continue

        # Drop missing values explicitly instead of relying on NaN being truthy
        # inside all(); a column without any value gives no evidence of being a date.
        present = series.dropna()
        if present.empty:
            continue

        # fullmatch rejects trailing garbage that an un-anchored match would accept;
        # astype(str) makes non-string entries fail the pattern instead of slipping through.
        if not present.astype(str).str.fullmatch(DATETIME_PATTERN).all():
            continue

        # An explicit format avoids per-value format inference; missing values become NaT.
        db[key][column] = pd.to_datetime(series, format=DATETIME_FORMAT)
        print(f"converted:   {key:15} / {column:30} from object to {db[key][column].dtype}")

# An earlier version wrapped pd.to_datetime in try/except for every object-type column;
# it behaved very strangely and messed up all types in the dataframe.


converted:   orders          / order_purchase_timestamp       from object to datetime64[ns]
converted:   orders          / order_approved_at              from object to datetime64[ns]
converted:   orders          / order_delivered_carrier_date   from object to datetime64[ns]
converted:   orders          / order_delivered_customer_date  from object to datetime64[ns]
converted:   orders          / order_estimated_delivery_date  from object to datetime64[ns]
converted:   order_items     / shipping_limit_date            from object to datetime64[ns]
converted:   order_reviews   / review_creation_date           from object to datetime64[ns]
converted:   order_reviews   / review_answer_timestamp        from object to datetime64[ns]


> **Summary**: 
>- Loaded 9 CSV files into a dictionary, effectively creating a database.
>- Identified 8 columns as datetime columns

---

# Inspect Data

In [83]:
# Report the shape (rows, columns) of every loaded table
for table_name, table in db.items():
    print(f"{table_name}: {table.shape}")


customers: (99441, 5)
geolocation: (1000163, 5)
orders: (99441, 8)
order_items: (112650, 7)
order_payments: (103886, 5)
order_reviews: (99224, 7)
products: (32951, 9)
sellers: (3095, 4)
product_category_name_translation: (71, 2)


## Database Column Characterization & Classification

In [99]:
# Text columns with at most this many distinct values are labelled "categorical";
# text columns with more distinct values are labelled "nominal".
MAX_CATEGORICAL_LEVELS = 10


def summarize_column(table, column, series):
    """Classify one column and compute the metrics that fit its class.

    Args:
        table: Name of the table the column belongs to.
        column: Column name.
        series: The column's values as a pandas Series.

    Returns:
        dict with "table", "column", "count", "unique_count", "type" and the
        type-specific metrics. "type" is one of "key", "temporal", "numeric",
        "nominal", "categorical" or "unknown"; the first matching rule wins.
    """
    unique_count = series.nunique()
    data = {
        "table": table,
        "column": column,
        "count": len(series),
        "unique_count": unique_count,
    }

    # Key: all values distinct, or the name follows the "*_id" convention
    if unique_count == len(series) or column.endswith("_id"):
        data["duplicated_count"] = series.duplicated().sum()
        data["type"] = "key"
    # Temporal: any datetime64 dtype, with or without timezone
    elif pd.api.types.is_datetime64_any_dtype(series):
        data["min_date"] = series.min()
        data["max_date"] = series.max()
        data["range"] = data["max_date"] - data["min_date"]
        data["type"] = "temporal"
    # Numeric: any integer/float dtype (booleans are deliberately not treated as numbers)
    elif pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series):
        data["min"] = series.min()
        data["max"] = series.max()
        data["mean"] = series.mean()
        data["type"] = "numeric"
    # Text: same metrics for both classes, only the label depends on the cardinality
    elif series.dtype == 'object':
        mode_data = series.mode()
        data["mode"] = mode_data.iloc[0] if not mode_data.empty else None
        data["mode_count"] = (series == data["mode"]).sum()
        data["type"] = "nominal" if unique_count > MAX_CATEGORICAL_LEVELS else "categorical"
    else:
        data["type"] = "unknown"

    return data


# Parallel lists naming every (table, column) pair of the database `db`
tables = [key for key in db for column in db[key]]
columns = [column for key in db for column in db[key]]

# One metrics dict per column
summary = [
    summarize_column(table, column, db[table][column])
    for table, column in zip(tables, columns)
]

# Compile the summary list into a DataFrame for a neat presentation
df_summary = pd.DataFrame(summary)

# Split the summary by type and drop the metric columns that are entirely empty for
# that type (how="all" keeps a metric that is missing for only some of the rows)
df_summaries = {
    name: group.dropna(axis=1, how="all").drop(columns="type")
    for name, group in df_summary.groupby("type")
}

In [103]:
# Display the complete per-column summary (one row per table/column pair).
df_summary

Unnamed: 0,table,column,count,unique_count,duplicated_count,type,min,max,mean,mode,mode_count
0,customers,customer_id,99441,99441,0.0,key,,,,,
1,customers,customer_unique_id,99441,96096,3345.0,key,,,,,
2,customers,customer_zip_code_prefix,99441,14994,,numeric,1003.0,99990.0,35137.474583,,
3,customers,customer_city,99441,4119,,nominal,,,,sao paulo,15540.0
4,customers,customer_state,99441,27,,nominal,,,,SP,41746.0
5,geolocation,geolocation_zip_code_prefix,1000163,19015,,numeric,1001.0,99990.0,36574.166466,,
6,geolocation,geolocation_lat,1000163,717360,,numeric,-36.605374,45.065933,-21.176153,,
7,geolocation,geolocation_lng,1000163,717613,,numeric,-101.466766,121.105394,-46.390541,,
8,geolocation,geolocation_city,1000163,8011,,nominal,,,,sao paulo,135800.0
9,geolocation,geolocation_state,1000163,27,,nominal,,,,SP,404268.0


## Inspect Key Columns

In [102]:
# Show the columns classified as "key", ordered by column name so that identically
# named key columns from different tables end up next to each other.
df_summaries["key"].sort_values(by="column")

Unnamed: 0,table,column,count,unique_count,duplicated_count
0,customers,customer_id,99441,99441,0.0
11,orders,customer_id,99441,99441,0.0
1,customers,customer_unique_id,99441,96096,3345.0
10,orders,order_id,99441,99441,0.0
18,order_items,order_id,112650,98666,13984.0
25,order_payments,order_id,103886,99440,4446.0
31,order_reviews,order_id,99224,98673,551.0
19,order_items,order_item_id,112650,21,112629.0
50,product_category_name_translation,product_category_name,71,71,0.0
51,product_category_name_translation,product_category_name_english,71,71,0.0


In [97]:
# Pull every row of the customers table that carries one specific customer_unique_id
db["customers"].loc[
    lambda customers: customers["customer_unique_id"] == "8d50f5eadf50201ccdcedfb9e2ac8455"
]

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
14186,1bd3585471932167ab72a84955ebefea,8d50f5eadf50201ccdcedfb9e2ac8455,4045,sao paulo,SP
15321,a8fabc805e9a10a3c93ae5bff642b86b,8d50f5eadf50201ccdcedfb9e2ac8455,4045,sao paulo,SP
16654,897b7f72042714efaa64ac306ba0cafc,8d50f5eadf50201ccdcedfb9e2ac8455,4045,sao paulo,SP
36122,b2b13de0770e06de50080fea77c459e6,8d50f5eadf50201ccdcedfb9e2ac8455,4045,sao paulo,SP
38073,42dbc1ad9d560637c9c4c1533746f86d,8d50f5eadf50201ccdcedfb9e2ac8455,4045,sao paulo,SP
40141,dfb941d6f7b02f57a44c3b7c3fefb44b,8d50f5eadf50201ccdcedfb9e2ac8455,4045,sao paulo,SP
48614,65f9db9dd07a4e79b625effa4c868fcb,8d50f5eadf50201ccdcedfb9e2ac8455,4045,sao paulo,SP
52574,1c62b48fb34ee043310dcb233caabd2e,8d50f5eadf50201ccdcedfb9e2ac8455,4045,sao paulo,SP
58707,a682769c4bc10fc6ef2101337a6c83c9,8d50f5eadf50201ccdcedfb9e2ac8455,4045,sao paulo,SP
67996,6289b75219d757a56c0cce8d9e427900,8d50f5eadf50201ccdcedfb9e2ac8455,4045,sao paulo,SP


## Inspect Nominal Columns

In [51]:
# Show the columns classified as "nominal" (object dtype with more than 10 distinct values).
# NOTE(review): the saved output lists *_id columns, which the classification cell labels
# "key" — the output appears to come from an earlier run; re-run the cell to refresh it.
df_summaries["nominal"]

Unnamed: 0,table,column,count,unique_count,mode,mode_count
1,customers,customer_unique_id,99441,96096,8d50f5eadf50201ccdcedfb9e2ac8455,17.0
3,customers,customer_city,99441,4119,sao paulo,15540.0
4,customers,customer_state,99441,27,SP,41746.0
8,geolocation,geolocation_city,1000163,8011,sao paulo,135800.0
9,geolocation,geolocation_state,1000163,27,SP,404268.0
18,order_items,order_id,112650,98666,8272b63d03f5f79c56e9e4120aec44ef,21.0
20,order_items,product_id,112650,32951,aca2eb7d00ea1a7b8ebd4e68314663af,527.0
21,order_items,seller_id,112650,3095,6560211a19b47992c3666cc44a7e94c0,2033.0
25,order_payments,order_id,103886,99440,fa65dad1b0e818e3ccc5cb0e39231352,29.0
30,order_reviews,review_id,99224,98410,08528f70f579f0c830189efc523d2182,3.0


## Inspect Categorical Columns

In [52]:
# Show the columns classified as "categorical" (object dtype with 10 or fewer distinct values).
df_summaries["categorical"]

Unnamed: 0,table,column,count,unique_count,mode,mode_count
12,orders,order_status,99441,8,delivered,96478.0
27,order_payments,payment_type,103886,5,credit_card,76795.0


## Inspect Numeric Columns

In [53]:
# Show the columns classified as "numeric", with their min, max and mean.
# NOTE(review): the saved output includes order_item_id, which the classification cell
# labels "key" because of its "_id" suffix — the output may be stale; re-run to refresh.
df_summaries["numeric"]

Unnamed: 0,table,column,count,unique_count,min,max,mean
2,customers,customer_zip_code_prefix,99441,14994,1003.0,99990.0,35137.474583
5,geolocation,geolocation_zip_code_prefix,1000163,19015,1001.0,99990.0,36574.166466
6,geolocation,geolocation_lat,1000163,717360,-36.605374,45.065933,-21.176153
7,geolocation,geolocation_lng,1000163,717613,-101.466766,121.105394,-46.390541
19,order_items,order_item_id,112650,21,1.0,21.0,1.197834
23,order_items,price,112650,5968,0.85,6735.0,120.653739
24,order_items,freight_value,112650,6999,0.0,409.68,19.99032
26,order_payments,payment_sequential,103886,29,1.0,29.0,1.092679
28,order_payments,payment_installments,103886,24,0.0,24.0,2.853349
29,order_payments,payment_value,103886,29077,0.0,13664.08,154.10038


## Inspect Temporal Columns

In [54]:
# Show the columns classified as "temporal", with earliest date, latest date and their difference.
df_summaries["temporal"]

Unnamed: 0,table,column,count,unique_count,min_date,max_date,range
13,orders,order_purchase_timestamp,99441,98875,2016-09-04 21:15:19,2018-10-17 17:30:18,772 days 20:14:59
14,orders,order_approved_at,99441,90733,2016-09-15 12:16:38,2018-09-03 17:40:06,718 days 05:23:28
15,orders,order_delivered_carrier_date,99441,81018,2016-10-08 10:34:01,2018-09-11 19:48:28,703 days 09:14:27
16,orders,order_delivered_customer_date,99441,95664,2016-10-11 13:46:32,2018-10-17 13:22:46,735 days 23:36:14
17,orders,order_estimated_delivery_date,99441,459,2016-09-30 00:00:00,2018-11-12 00:00:00,773 days 00:00:00
22,order_items,shipping_limit_date,112650,93318,2016-09-19 00:15:34,2020-04-09 22:35:08,1298 days 22:19:34
35,order_reviews,review_creation_date,99224,636,2016-10-02 00:00:00,2018-08-31 00:00:00,698 days 00:00:00
36,order_reviews,review_answer_timestamp,99224,98248,2016-10-07 18:32:28,2018-10-29 12:27:35,751 days 17:55:07


## Analysis of Potential Foreign Key Relationships

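Matrix of match ratios between all pairs of columns (number of shared unique values divided by the number of unique values in either column)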

In [None]:
from tqdm.notebook import tqdm

def compute_match_ratio(from_column, to_column):
    """Return the share of distinct non-missing values that two columns have in common.

    The ratio is (values present in both columns) / (values present in either
    column). When neither column holds any value the result is 0.
    """
    left_values = set(from_column.dropna())
    right_values = set(to_column.dropna())

    either = left_values | right_values
    if len(either) == 0:
        return 0

    both = left_values & right_values
    return len(both) / len(either)

# Every (table, column) pair; the pairs label both axes of the ratio matrix
db_columns = [(table, column) for table in db.keys() for column in db[table].columns]

# Build each column's set of distinct non-missing values once up front, instead of
# rebuilding both sets for every one of the len(db_columns)**2 column pairs
unique_values = {
    (table, column): set(db[table][column].dropna())
    for table, column in db_columns
}

data = {}

# A single progress bar that is reused for every pass of the inner loop
inner_pbar = tqdm(total=len(db_columns), desc='Inner Loop', unit='column', leave=False)

# Outer loop with its own progress bar
for tab_a, col_a in tqdm(db_columns, desc='Outer Loop', unit='column'):

    # Rewind the inner bar and relabel it with the column currently being compared
    inner_pbar.reset()
    inner_pbar.set_description(f"{tab_a}/{col_a}")

    set_a = unique_values[(tab_a, col_a)]
    ratios = []
    for tab_b, col_b in db_columns:
        set_b = unique_values[(tab_b, col_b)]

        # Same ratio as compute_match_ratio: shared values / values in either column
        total_entries = len(set_a | set_b)
        ratios.append(len(set_a & set_b) / total_entries if total_entries != 0 else 0)

        # Update the inner progress bar
        inner_pbar.update(1)

    data[(tab_a, col_a)] = ratios

# leave=False only removes the bar once it has been closed
inner_pbar.close()

df = pd.DataFrame(data, index=db_columns)
df


Export matching_ratio matrix to Excel

In [None]:
# Write the match-ratio matrix held in `df` to the Excel file overlap_ratio_file.
# NOTE(review): assumes the report folder already exists and that `df` still refers to
# the matrix built in the previous cell — confirm when running cells out of order.
df.to_excel(overlap_ratio_file)

## Analysis of Missing Value Relationships Across Tables

Quantifying inter-table key relationships and overlaps

In [62]:
# One record per (table, "*_id" column, other table that has a column of the same name)
mapping_records = []

for source_table, source_frame in db.items():
    for column in source_frame.columns:
        # Only "*_id" columns are considered potential foreign keys
        if not column.endswith("_id"):
            continue

        # Any other table exposing the same column name is a candidate counterpart
        for target_table, target_frame in db.items():
            if target_table == source_table or column not in target_frame.columns:
                continue
            mapping_records.append({
                'from_table': source_table,
                'from_column': column,
                'to_table': target_table,
                'to_column': column,
            })

# Construct the mapping dataframe
df_mapping = pd.DataFrame(
    mapping_records,
    columns=['from_table', 'from_column', 'to_table', 'to_column'],
)

# NOTE: the mapping logic above is already implemented in Database.column_mapping.

def is_subset(row):
    """Tell whether every distinct non-missing 'from' value also occurs in the 'to' column.

    `row` supplies the table and column names ("from_table", "from_column",
    "to_table", "to_column"); the values are looked up in the global `db`.
    """
    source_values = set(db[row["from_table"]][row["from_column"]].dropna())
    target_values = set(db[row["to_table"]][row["to_column"]].dropna())
    return source_values <= target_values

# Evaluate is_subset once per mapping row and store the flag as a new column
df_mapping["is_subset"] = df_mapping.apply(is_subset, axis="columns")

def subset_ratio(row):
    """Return the fraction of distinct non-missing 'from' values that also occur in 'to'.

    `row` supplies the table and column names; the values are looked up in the
    global `db`. The result is 0 when the 'from' column holds no values.
    """
    source_values = set(db[row["from_table"]][row["from_column"]].dropna())
    target_values = set(db[row["to_table"]][row["to_column"]].dropna())

    if len(source_values) == 0:
        return 0
    return len(source_values & target_values) / len(source_values)

# Evaluate subset_ratio once per mapping row and store the result as a new column
df_mapping["subset_ratio"] = df_mapping.apply(subset_ratio, axis="columns")

def shared_value_ratio(row):
    """Return (values found in both columns) / (values found in either column).

    Only distinct non-missing values count. `row` supplies the table and column
    names; the values are looked up in the global `db`. The result is 0 when
    neither column holds any value.
    """
    source_values = set(db[row["from_table"]][row["from_column"]].dropna())
    target_values = set(db[row["to_table"]][row["to_column"]].dropna())

    either = source_values | target_values
    if len(either) == 0:
        return 0

    both = source_values & target_values
    return len(both) / len(either)

# Evaluate shared_value_ratio once per mapping row and store the result as a new column
df_mapping["shared_value_ratio"] = df_mapping.apply(shared_value_ratio, axis="columns")

def missing_count(row):
    """Count the distinct non-missing 'from' values that do not occur in the 'to' column.

    `row` supplies the table and column names; the values are looked up in the
    global `db`.
    """
    source_values = set(db[row["from_table"]][row["from_column"]].dropna())
    target_values = set(db[row["to_table"]][row["to_column"]].dropna())

    unmatched = source_values - target_values
    return len(unmatched)

# Evaluate missing_count once per mapping row and store the result as a new column
df_mapping["missing_count"] = df_mapping.apply(missing_count, axis="columns")

# Show the mappings with the fewest unmatched values first
df_mapping.sort_values("missing_count")



Unnamed: 0,from_table,from_column,to_table,to_column,is_subset,subset_ratio,shared_value_ratio,missing_count
0,customers,customer_id,orders,customer_id,True,1.0,1.0,0
13,order_reviews,order_id,orders,order_id,True,1.0,0.992277,0
10,order_payments,order_id,orders,order_id,True,1.0,0.99999,0
9,order_items,seller_id,sellers,seller_id,True,1.0,1.0,0
16,products,product_id,order_items,product_id,True,1.0,1.0,0
8,order_items,product_id,products,product_id,True,1.0,1.0,0
4,orders,customer_id,customers,customer_id,True,1.0,1.0,0
5,order_items,order_id,orders,order_id,True,1.0,0.992206,0
17,sellers,seller_id,order_items,seller_id,True,1.0,1.0,0
6,order_items,order_id,order_payments,order_id,False,0.99999,0.992196,1
