<h1>Data Cleaning</h1>

# Imports

In [1]:
import sys
from pathlib import Path
from datetime import datetime

import pandas as pd

sys.path.insert(0, r"C:\Users\vynde\PycharmProjects\dataanalysis")
from databridger import Database

# Issue Tracking

In [2]:
# TODO: export class to module and import it

class IssueTracker:
    VALID_FIELDS = ["description", "resolution", "severity", "potential_cause", "relevant_data", "notes"]
    FORMAT_LINE_BREAK = {
        'description': lambda x: str(x).replace("\n", "<br>"), 
        'notes': lambda x: str(x).replace("\n", "<br>"), 
        'relevant_data': lambda x: str(x).replace("\n", "<br>")}

    def __init__(self):
        self.df = pd.DataFrame(columns=["issue_id", "version", "status", *self.VALID_FIELDS])
        self.issue_count = 0
    
    def __repr__(self):
        return self.df.__repr__()

    def _repr_html_(self):
        return self.df._repr_html_()

    def show(self):
        return self.df.style.format(self.FORMAT_LINE_BREAK)
    
    def show_latest_versions(self):
        return self.df.groupby("issue_id").last().style.format(self.FORMAT_LINE_BREAK)
            
    def add_issue(self, description, severity, potential_cause=None, relevant_data=None, notes=None):
        issue = {
            "issue_id": self.issue_count + 1,
            "version": 1, 
            "status": "Open",
            "description": description,
            "resolution": None,
            "severity": severity,
            "potential_cause": potential_cause,
            "relevant_data": relevant_data,
            "timestamp": datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            "notes": notes
        }
        self.df = pd.concat([self.df, pd.DataFrame([issue])], ignore_index=True)
        
        self.issue_count += 1

    def update_issue(self, /, status=None, description=None, severity=None, potential_cause=None, relevant_data=None, notes=None, issue_id=None, resolution=None):

        if issue_id is None:
            issue = self.df.iloc[-1]
        else:
            issue = self.df.loc[self.df["issue_id"]==issue_id, :].iloc[-1]

        new_issue = issue.copy()

        new_issue["version"] = issue["version"] + 1

        if (new_issue["status"] == "resolved") and (issue["issue_id"] == new_issue["issue_id"]):
            raise Exception("Issue already resolved.")
        
        # Apply the updates
        if status is not None:
            new_issue["status"] = status
        if description is not None:
            new_issue["description"] = description
        if severity is not None:
            new_issue["severity"] = severity
        if potential_cause is not None:
            new_issue["potential_cause"] = potential_cause
        if relevant_data is not None:
            new_issue["relevant_data"] = relevant_data
        if resolution is not None:
            new_issue["resolution"] = resolution
        
        # overwrite notes always
        new_issue["notes"] = notes
        
        self.df = pd.concat([self.df, pd.DataFrame([new_issue])])

    def resolve_issue(self, resolution, issue_id=None):
        if not issue_id:
            issue_id = self.issue_count

        self.update_issue(status="resolved", resolution=resolution, issue_id=issue_id)

    def export_to_csv(self, filename="issues.csv"):
        self.df.to_csv(filename, index=False)

# create instance
issue_tracker = IssueTracker()

# Resources

In [3]:
# folders
raw_data_folder = Path('..') / 'data' / 'raw'
report_folder = Path('..') / 'reports'

# output files
database_summary_file = report_folder / "database_summary.xlsx"
overlap_ratio_file = report_folder / "column_overlap_ratios.xlsx"

---

# Load Data

New method for loading file based data base

In [4]:
#db_raw = Database("csv", raw_data_folder)
#db_raw.table_mapping

Inspect file names

In [5]:
for csv_file in raw_data_folder.glob('*.csv'):
    print(csv_file)

..\data\raw\olist_customers_dataset.csv
..\data\raw\olist_geolocation_dataset.csv
..\data\raw\olist_orders_dataset.csv
..\data\raw\olist_order_items_dataset.csv
..\data\raw\olist_order_payments_dataset.csv
..\data\raw\olist_order_reviews_dataset.csv
..\data\raw\olist_products_dataset.csv
..\data\raw\olist_sellers_dataset.csv
..\data\raw\product_category_name_translation.csv


Load all data sets into a database (dictionary of dataframes)

In [6]:
db = dict()  # database
for csv_file in raw_data_folder.glob('*.csv'):
    dataset_name = str(csv_file.stem).replace("olist_", "").replace("_dataset", "")
    db[dataset_name] = pd.read_csv(csv_file)

db.keys()

dict_keys(['customers', 'geolocation', 'orders', 'order_items', 'order_payments', 'order_reviews', 'products', 'sellers', 'product_category_name_translation'])

Convert to datetime matching this format yyyy-mm-dd HH:MM:SS

In [7]:
for key in db:
    for column in db[key].columns:
        if db[key][column].dtype == 'object':
            if all(db[key][column].str.match(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}")):
                db[key][column] = pd.to_datetime(db[key][column])
                print(f"converted:   {key:15} / {column:30} from object to {db[key][column].dtype}")

# also tried another version with try except trying to convert every object-type column to datetime
# showed a very strange behavior and messed up all types in the dataframe


converted:   orders          / order_purchase_timestamp       from object to datetime64[ns]
converted:   orders          / order_approved_at              from object to datetime64[ns]
converted:   orders          / order_delivered_carrier_date   from object to datetime64[ns]
converted:   orders          / order_delivered_customer_date  from object to datetime64[ns]
converted:   orders          / order_estimated_delivery_date  from object to datetime64[ns]
converted:   order_items     / shipping_limit_date            from object to datetime64[ns]
converted:   order_reviews   / review_creation_date           from object to datetime64[ns]
converted:   order_reviews   / review_answer_timestamp        from object to datetime64[ns]


> **Summary**: 
>- Loaded 9 CSV files into a dictionary, effectively creating a database.
>- Identified 8 columns as datetime columns

---

# Inspect Data

In [8]:
for name, df in db.items():
    print(f"{name}: {df.shape}")

customers: (99441, 5)
geolocation: (1000163, 5)
orders: (99441, 8)
order_items: (112650, 7)
order_payments: (103886, 5)
order_reviews: (99224, 7)
products: (32951, 9)
sellers: (3095, 4)
product_category_name_translation: (71, 2)


## Database Column Characterization & Classification

In [9]:
tables = [key for key in db for column in db[key]]
columns = [column for key in db for column in db[key]]
summary = []

for table, column in zip(tables, columns):
    series = db[table][column]
    data = {"table": table, "column": column}
    
    # Common metrics for all types
    data["count"] = len(series)
    unique_count = series.nunique()
    data["unique_count"] = unique_count

    # Determine if the column is a key
    if (unique_count == len(series)) \
    or (column.endswith("_id")):
        data["duplicated_count"] = len(series) - series.drop_duplicates().size
        data["type"] = "key"
    # Determine if the column is temporal
    elif series.dtype in ['datetime64[ns]', 'datetime64[ns, tz]']:
        data["min_date"] = series.min()
        data["max_date"] = series.max()
        data["range"] = series.max() - series.min()
        data["type"] = "temporal"
    # Determine if the column is numeric
    elif series.dtype in ['int64', 'float64']:
        data["min"] = series.min()
        data["max"] = series.max()
        data["mean"] = series.mean()
        data["type"] = "numeric"
    # Determine if the column is nominal (e.g. more than 10 unique text values)
    elif series.dtype == 'object' and unique_count > 10:
        mode_data = series.mode()
        data["mode"] = mode_data[0] if not mode_data.empty else None
        data["mode_count"] = (series == data["mode"]).sum()
        data["type"] = "nominal"
    # Determine if the column is categorical (e.g. 10 or fewer unique text values)
    elif series.dtype == 'object' and unique_count <= 10:
        mode_data = series.mode()
        data["mode"] = mode_data[0] if not mode_data.empty else None
        data["mode_count"] = (series == data["mode"]).sum()
        data["type"] = "categorical"
    else:
        data["type"] = "unknown"
    
    # Append the computed metrics for the column to the summary list
    summary.append(data)

# Compile the summary list into a DataFrame for a neat presentation
df_summary = pd.DataFrame(summary)

# Split the dataframes by type and drop nan columns
df_summaries = {name: df.dropna(axis=1).drop(columns="type") for name ,df in df_summary.groupby("type")}

In [10]:
df_summary

Unnamed: 0,table,column,count,unique_count,duplicated_count,type,min,max,mean,mode,mode_count,min_date,max_date,range
0,customers,customer_id,99441,99441,0.0,key,,,,,,NaT,NaT,NaT
1,customers,customer_unique_id,99441,96096,3345.0,key,,,,,,NaT,NaT,NaT
2,customers,customer_zip_code_prefix,99441,14994,,numeric,1003.0,99990.0,35137.474583,,,NaT,NaT,NaT
3,customers,customer_city,99441,4119,,nominal,,,,sao paulo,15540.0,NaT,NaT,NaT
4,customers,customer_state,99441,27,,nominal,,,,SP,41746.0,NaT,NaT,NaT
5,geolocation,geolocation_zip_code_prefix,1000163,19015,,numeric,1001.0,99990.0,36574.166466,,,NaT,NaT,NaT
6,geolocation,geolocation_lat,1000163,717360,,numeric,-36.605374,45.065933,-21.176153,,,NaT,NaT,NaT
7,geolocation,geolocation_lng,1000163,717613,,numeric,-101.466766,121.105394,-46.390541,,,NaT,NaT,NaT
8,geolocation,geolocation_city,1000163,8011,,nominal,,,,sao paulo,135800.0,NaT,NaT,NaT
9,geolocation,geolocation_state,1000163,27,,nominal,,,,SP,404268.0,NaT,NaT,NaT


## Inspect Key Columns

In [11]:
df_summaries["key"].sort_values(by="column")

Unnamed: 0,table,column,count,unique_count,duplicated_count
0,customers,customer_id,99441,99441,0.0
11,orders,customer_id,99441,99441,0.0
1,customers,customer_unique_id,99441,96096,3345.0
10,orders,order_id,99441,99441,0.0
18,order_items,order_id,112650,98666,13984.0
25,order_payments,order_id,103886,99440,4446.0
31,order_reviews,order_id,99224,98673,551.0
19,order_items,order_item_id,112650,21,112629.0
50,product_category_name_translation,product_category_name,71,71,0.0
51,product_category_name_translation,product_category_name_english,71,71,0.0


Inspect why customer_id is unique in customers and orders table

In [12]:
issue_tracker.add_issue(
    description="unique key in multiple tables",
    severity="minor",
    potential_cause=None,
    relevant_data="customers -> customer_id;\n orders -> customer_id",
    notes="""one-to-one relationship; 
    tables could be merged; 
    each customer has done exactly one order; 
    structure intenional?; 
    check if cutomer locations if they are also different for all customers"""
)

Inspect duplicated customer_unique_ids

In [13]:
issue_tracker.add_issue(
    description="duplicated key values",
    severity="minor",
    potential_cause="",
    relevant_data="customers -> customer_unique_id"
)

In [14]:
counts = db["customers"]["customer_unique_id"].value_counts()
counts[counts > 1]

customer_unique_id
8d50f5eadf50201ccdcedfb9e2ac8455    17
3e43e6105506432c953e165fb2acf44c     9
1b6c7548a2a1f9037c1fd3ddfed95f33     7
ca77025e7201e3b30c44b472ff346268     7
6469f99c1f9dfae7733b25662e7f1782     7
                                    ..
370cd3b09ea745fe047fd11db7430441     2
d7c6bdf20c4ec9c9cda815dde58e778a     2
a366793a4999f8cc135855c4dd347421     2
156283b6ca35ef976a0265903145fd5e     2
d83257348027bd8c59a228cc034de5e3     2
Name: count, Length: 2997, dtype: int64

In [15]:
issue_tracker.update_issue(notes=f"there are {len(counts)} keys with more than 1 entries")

Inspect one occurrence

In [16]:
db["customers"][db["customers"]["customer_unique_id"]==counts.index[0]]

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
14186,1bd3585471932167ab72a84955ebefea,8d50f5eadf50201ccdcedfb9e2ac8455,4045,sao paulo,SP
15321,a8fabc805e9a10a3c93ae5bff642b86b,8d50f5eadf50201ccdcedfb9e2ac8455,4045,sao paulo,SP
16654,897b7f72042714efaa64ac306ba0cafc,8d50f5eadf50201ccdcedfb9e2ac8455,4045,sao paulo,SP
36122,b2b13de0770e06de50080fea77c459e6,8d50f5eadf50201ccdcedfb9e2ac8455,4045,sao paulo,SP
38073,42dbc1ad9d560637c9c4c1533746f86d,8d50f5eadf50201ccdcedfb9e2ac8455,4045,sao paulo,SP
40141,dfb941d6f7b02f57a44c3b7c3fefb44b,8d50f5eadf50201ccdcedfb9e2ac8455,4045,sao paulo,SP
48614,65f9db9dd07a4e79b625effa4c868fcb,8d50f5eadf50201ccdcedfb9e2ac8455,4045,sao paulo,SP
52574,1c62b48fb34ee043310dcb233caabd2e,8d50f5eadf50201ccdcedfb9e2ac8455,4045,sao paulo,SP
58707,a682769c4bc10fc6ef2101337a6c83c9,8d50f5eadf50201ccdcedfb9e2ac8455,4045,sao paulo,SP
67996,6289b75219d757a56c0cce8d9e427900,8d50f5eadf50201ccdcedfb9e2ac8455,4045,sao paulo,SP


In [17]:
issue_tracker.update_issue(
    potential_cause="specifies customers with same zip code, city and state",
    notes="inspect all ids")

In [18]:
#equal_rows = []
#for id in counts.index:
#    subset_id = db["customers"][db["customers"]["customer_unique_id"]==id]
#    rows_with_unique_location = subset_id.drop_duplicates(subset=db["customers"].columns[-3:])
#    has_same_location_data = (1 == len(rows_with_unique_location))
#    equal_rows.append(equal_rows)
#
# optimize speed

for loc, i in zip(["zip code level", "city level", "state level"],[-3,-2,-1]):
    def has_single_unique_location(subset):
        unique_location_rows = subset.drop_duplicates(subset=subset.columns[i:])
        return len(unique_location_rows) == 1

    repeated_customer_ids = counts[counts > 1]
    customers_with_repeated_ids = db["customers"][db["customers"]["customer_unique_id"].isin(repeated_customer_ids.index)]
    is_location_unique_for_id = customers_with_repeated_ids.groupby("customer_unique_id").apply(has_single_unique_location)
    ids_with_different_locations = is_location_unique_for_id[~is_location_unique_for_id].index.tolist()

    print(f"Number of ids with different location data on {loc}: {len(ids_with_different_locations)}")

Number of ids with different location data on zip code level: 252
Number of ids with different location data on city level: 122
Number of ids with different location data on state level: 39


In [19]:
issue_tracker.update_issue(notes="252 unique ids with different location data;\nCould not identify purpose of column;\n Drop column: customer_unique_id")

column `order_id` is unique in orders table and duplicated in `order_items`, `order_payments`, `order_reviews`. check if there are missing keys

Inspect duplicated values of order_id

In [20]:
issue_tracker.add_issue(
    description="duplicated key values",
    severity="minor",
    potential_cause="foreign key",
    relevant_data="order_items -> oder_id;\norder_payments -> oder_id;\norder_reviews -> oder_id;\n",
    notes="order_ids in other tables must be a subset of order_ids in orders table;\notherwise there are missing values")

In [21]:
tables_to_verify_subset = ["order_items", "order_payments", "order_reviews"]

for current_table in tables_to_verify_subset:
    is_order_id_subset = set(db[current_table]["order_id"]).issubset(set(db["orders"]["order_id"]))
    print(f"Order IDs in {current_table} are a subset of 'orders' table: {is_order_id_subset}")


Order IDs in order_items are a subset of 'orders' table: True
Order IDs in order_payments are a subset of 'orders' table: True
Order IDs in order_reviews are a subset of 'orders' table: True


In [22]:
issue_tracker.update_issue(notes="is subset verified")
issue_tracker.resolve_issue(resolution="Verified that foreign keys are a subset of the primary key in orders table")

Inspect duplicated values of order_item_id

In [23]:
issue_tracker.add_issue(
    description="duplicated key values", 
    potential_cause="not a key column",
    severity="minor")

In [24]:
db["order_items"]["order_item_id"].value_counts()

order_item_id
1     98666
2      9803
3      2287
4       965
5       460
6       256
7        58
8        36
9        28
10       25
11       17
12       13
13        8
14        7
15        5
16        3
17        3
18        3
19        3
20        3
21        1
Name: count, dtype: int64

In [25]:
issue_tracker.update_issue(potential_cause="signifies the item_id within an order", notes="check for duplicated within orders")

In [26]:
def check_unique_order_items(group):
    return group["order_item_id"].is_unique

unique_order_items_check = db["order_items"].groupby("order_id").apply(check_unique_order_items)

print(f"All order_item_ids are unique for each order_id: {all(unique_order_items_check)}")


All order_item_ids are unique for each order_id: True


In [27]:
issue_tracker.resolve_issue(resolution="signifies the item_id within an order")

Inspect product_ids in order_items

In [28]:
issue_tracker.add_issue(
    description="duplicated key values", 
    severity="minor",
    potential_cause="foreign key", 
    relevant_data="order_items -> product_id", 
    notes="check if product_ids on order_items are a subset of product_ids in product")

In [29]:
tables_to_verify_subset = ["order_items"]

for current_table in tables_to_verify_subset:
    is_order_id_subset = set(db[current_table]["product_id"]).issubset(set(db["products"]["product_id"]))
    print(f"Product IDs in {current_table} are a subset of 'products' table: {is_order_id_subset}")


Product IDs in order_items are a subset of 'products' table: True


In [30]:
issue_tracker.resolve_issue(resolution="Verified that foreign keys are a subset of primary key")

Inspect duplicated counts of review_id

In [31]:
issue_tracker.add_issue(description="duplicated key values", severity="moderate", potential_cause="duplicated values", notes="inspect values with duplicated counts")

In [32]:
counts = db["order_reviews"]["review_id"].value_counts()
counts[counts > 1]

review_id
7b606b0d57b078384f0b58eac1d41d78    3
dbdf1ea31790c8ecfcc6750525661a9b    3
32415bbf6e341d5d517080a796f79b5c    3
0c76e7a547a531e7bf9f0b99cba071c1    3
4219a80ab469e3fc9901437b73da3f75    3
                                   ..
10398bad17ab17c451c39a0b2c47464c    2
ab0abebc25981f01b696983460927f44    2
b86b60b19d7ff8f19b2e7998ab14f5ca    2
b4dffda6cbf1a5a615b2a8f146bdbddd    2
26c7339968774f98c7b496b924e97913    2
Name: count, Length: 789, dtype: int64

In [33]:
issue_tracker.update_issue(notes=f"there are {len(counts)} keys with more than 1 entries")

In [34]:
db["order_reviews"][db["order_reviews"]["review_id"]==counts.index[0]]

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
7500,7b606b0d57b078384f0b58eac1d41d78,f3028a8f41ea1ee2b461420913663f97,5,,,2017-02-15,2017-02-21 23:30:22
59859,7b606b0d57b078384f0b58eac1d41d78,2deb17060fc1ce18a85eba953ddcdeaf,5,,,2017-02-15,2017-02-21 23:30:22
61069,7b606b0d57b078384f0b58eac1d41d78,2f8f31eb2f7b6572836d662a6625c8e4,5,,,2017-02-15,2017-02-21 23:30:22


In [35]:
issue_tracker.update_issue(notes="duplicated row, also same timestamp, but order_id is different\ncheck if this is the case for all")

In [36]:
def has_duplicate_rows(group):
    distinct_rows = group.drop_duplicates(subset=group.columns[-5:])
    return len(distinct_rows) > 1

repeated_review_ids = counts[counts > 1]
filtered_reviews = db["order_reviews"][db["order_reviews"]["review_id"].isin(repeated_review_ids.index)]
duplicate_check_per_review = filtered_reviews.groupby("review_id").apply(has_duplicate_rows)
review_ids_with_duplicates = duplicate_check_per_review[duplicate_check_per_review].index.tolist()

print(f"Number of duplicated review_ids where the last 5 columns are different: {len(review_ids_with_duplicates)}")

Number of duplicated review_ids where the last 5 columns are different: 0


In [37]:
issue_tracker.update_issue(notes="duplicated review_ids are real duplicates. can be dropped. first determine why order_ids are still different.")

In [38]:
issue_tracker.show_latest_versions()

Unnamed: 0_level_0,version,status,description,resolution,severity,potential_cause,relevant_data,notes,timestamp
issue_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,1,Open,unique key in multiple tables,,minor,,customers -> customer_id;  orders -> customer_id,one-to-one relationship; tables could be merged; each customer has done exactly one order; structure intenional?; check if cutomer locations if they are also different for all customers,2023-08-12 14:20:58
2,4,Open,duplicated key values,,minor,"specifies customers with same zip code, city and state",customers -> customer_unique_id,252 unique ids with different location data; Could not identify purpose of column;  Drop column: customer_unique_id,2023-08-12 14:20:58
3,3,resolved,duplicated key values,Verified that foreign keys are a subset of the primary key in orders table,minor,foreign key,order_items -> oder_id; order_payments -> oder_id; order_reviews -> oder_id;,is subset verified,2023-08-12 14:21:00
4,3,resolved,duplicated key values,signifies the item_id within an order,minor,signifies the item_id within an order,,check for duplicated within orders,2023-08-12 14:21:00
5,2,resolved,duplicated key values,Verified that foreign keys are a subset of primary key,minor,foreign key,order_items -> product_id,check if product_ids on order_items are a subset of product_ids in product,2023-08-12 14:21:03
6,4,Open,duplicated key values,,moderate,duplicated values,,duplicated review_ids are real duplicates. can be dropped. first determine why order_ids are still different.,2023-08-12 14:21:03


## Inspect Nominal Columns

In [None]:
df_summaries["nominal"]

## Inspect Categorical Columns

In [None]:
df_summaries["categorical"]

## Inspect Numeric Columns

In [None]:
df_summaries["numeric"]

## Inspect Temporal Columns

In [None]:
df_summaries["temporal"]

## Analysis of Potential Foreign Key Relationships

Matrix of matching_ratios of all columns

In [None]:
from tqdm.notebook import tqdm

def compute_match_ratio(from_column, to_column):
    from_set = set(from_column.dropna())
    to_set = set(to_column.dropna())

    # Number of matching entries
    matching_entries = len(from_set.intersection(to_set))

    # Total unique entries in both columns
    total_entries = len(from_set.union(to_set))

    return matching_entries / total_entries if total_entries != 0 else 0

# Iterate over each table and column
db_columns = [(table, column) for table in db.keys() for column in db[table].columns]

data = {}

# Creating a single progress bar for inner loops
inner_pbar = tqdm(total=len(db_columns), desc='Inner Loop', unit='column', leave=False)

# Outer loop with its own progress bar
for tab_a, col_a in tqdm(db_columns, desc='Outer Loop', unit='column'):
    
    # Reset the inner progress bar after each iteration of the outer loop
    inner_pbar.n = 0
    inner_pbar.desc = f"{tab_a}/{col_a}"
    inner_pbar.last_print_n = 0
    inner_pbar.refresh()
    
    for tab_b, col_b in db_columns:
        key = (tab_a, col_a)
        if key not in data:
            data[key] = []
        data[key].append(compute_match_ratio(db[tab_a][col_a], db[tab_b][col_b]))
        
        # Update the inner progress bar
        inner_pbar.update(1)

df = pd.DataFrame(data, index=db_columns)
df


Export matching_ratio matrix to Excel

In [None]:
df.to_excel(overlap_ratio_file)

## Analysis of Missing Value Relationships Across Tables

Quantifying inter-table key relationships and overlaps

In [None]:
# Initialize the lists to store the table/key mappings
from_tables = []
from_columns = []
to_tables = []
to_columns = []

# Iterate over each table and column
for key in db:
    for column in db[key].columns:
        # If the column ends with "_id", it's potentially a foreign key
        if column.endswith("_id"):
            # Search for potential primary keys in other tables
            for potential_key in db:
                if column in db[potential_key].columns and key != potential_key:
                    from_tables.append(key)
                    from_columns.append(column)
                    to_tables.append(potential_key)
                    to_columns.append(column)

# Construct the mapping dataframe
df_mapping = pd.DataFrame({
    'from_table': from_tables,
    'from_column': from_columns,
    'to_table': to_tables,
    'to_column': to_columns
})

###
### the logic above is already implemented in Database.column_mapping
##

def is_subset(row):
    """Check if one column's unique values are a subset of the other column's unique values."""
    from_set = set(db[row["from_table"]][row["from_column"]].dropna())
    to_set = set(db[row["to_table"]][row["to_column"]].dropna())
    return from_set.issubset(to_set)

df_mapping["is_subset"] = df_mapping.apply(is_subset, axis=1)

def subset_ratio(row):
    """Compute the ratio of unique values from 'from_column' found in 'to_column'."""
    from_set = set(db[row["from_table"]][row["from_column"]].dropna())
    to_set = set(db[row["to_table"]][row["to_column"]].dropna())
    return len(from_set.intersection(to_set)) / len(from_set) if len(from_set) != 0 else 0

df_mapping["subset_ratio"] = df_mapping.apply(subset_ratio, axis=1)

def shared_value_ratio(row):
    """Compute the ratio of shared unique values between two columns."""
    from_set = set(db[row["from_table"]][row["from_column"]].dropna())
    to_set = set(db[row["to_table"]][row["to_column"]].dropna())

    # Number of matching entries
    matching_entries = len(from_set.intersection(to_set))

    # Total unique entries in both columns
    total_entries = len(from_set.union(to_set))

    return matching_entries / total_entries if total_entries != 0 else 0

df_mapping["shared_value_ratio"] = df_mapping.apply(shared_value_ratio, axis=1)

def missing_count(row):
    from_set = set(db[row["from_table"]][row["from_column"]].dropna())
    to_set = set(db[row["to_table"]][row["to_column"]].dropna())
    
    # Missing count
    return len(from_set.difference(to_set))

df_mapping["missing_count"] = df_mapping.apply(missing_count, axis=1)

df_mapping.sort_values(by="missing_count", ascending=True)

