In [2]:
# NOTE This notebook demonstrates that cleaning the data is not a trivial task.
# Even if the operations to perform are trivial (since almost everything is handled by pandas),
# there is certainly a thinking process before using those easy, ready-to-use pandas functions.

import os

import numpy as np
import pandas as pd
import sqlalchemy

# Connection string of the `piscineds` PostgreSQL database.
# It can be overridden with the PISCINEDS_DATABASE_URL environment variable so that real
# credentials never have to be written in the notebook; the fallback is the local development DSN.
DATABASE_URL = os.environ.get(
    "PISCINEDS_DATABASE_URL",
    "postgresql://trobin:mysecretpassword@localhost:5432/piscineds",
)

engine = sqlalchemy.create_engine(DATABASE_URL)
with engine.connect() as connection:
    # Load the whole `items` table into a DataFrame
    df_items = pd.read_sql_table('items', connection)

In [None]:
# First of all, make sure the `product_id` column holds no `None` or `NaN` values.
# If the two shapes printed below are identical, no row had to be dropped.
print(df_items.shape)
# Rebind the result rather than using `inplace=True`: the cell then produces a fresh frame
# instead of silently mutating the one loaded from the database.
df_items = df_items.dropna(subset=['product_id'])
print(df_items.shape)

In [None]:
# If any `product_id` field were `None` or `NaN`, the matching rows would be sorted last
# (missing values go to the end by default), so they would show up in the tail printed below.
df_items = df_items.sort_values('product_id')
print(df_items.head(2))
print(df_items.tail(2))

# All of them would also be printed out this way
df_null = df_items[df_items['product_id'].isnull()]
print(df_null)

In [None]:
# To show that all of this works, consider a small dataframe that does contain 'not available' values
df = pd.DataFrame({
    'product_id': [1, None, np.nan],
    'category_id': [1, 42, 21],
    'brand': ['foo', None, 'bar']
})

# Both `None` and `np.nan` are reported as null, so two rows are printed here
df_null = df[df['product_id'].isnull()]
print(df_null)

# Before / after: only the row with an actual `product_id` survives.
# The result is rebound instead of using `inplace=True`, which keeps the operation explicit.
print(df)
df = df.dropna(subset=['product_id'])
print(df)

In [None]:
# Secondly, we don't want to keep several records sharing the same `product_id` value.
# `keep=False` flags every member of a duplicated group, so all of them would be printed here.
is_duplicated = df_items.duplicated('product_id', keep=False)
df_duplicates = df_items[is_duplicated]
print(df_duplicates)

In [None]:
# At this point, nothing more has to be done to clean the data.
# Missing information in other columns is detrimental but not critical.
# However, if there were duplicate values in the `product_id` column, we would have to perform further operations.

In [24]:
# Just to show this would work if there were such duplicates, here is a fictional demonstration
df_ = pd.DataFrame({
    'id': [1, 1, 1, 2],
    'foo': ['foo', None, None, 'foo'],
    'bar': [None, 'bar', None, 'bar'],
    'baz': [None, None, 'baz', 'baz']
})
print(df_.to_string(index=False), "\n")

# Here, we only keep the first occurrence.
# It might not be the best occurrence to keep, since it may lack data in some columns
# that the removed duplicates did have.
df_dup = df_.drop_duplicates(subset='id')
print(df_dup.to_string(index=False), "\n")

# When keeping a single unmodified row per `id`, we can prioritize non-missing data of some columns over others.
# Here 'baz' takes precedence, then 'bar', then 'foo'.
# The sort is done on `isna()` (False sorts before True) so rows are ranked by data availability only;
# sorting on the raw values would also let the alphabetical order of the values decide which duplicate wins.
df_ = df_.sort_values(by=['baz', 'bar', 'foo'], key=lambda column: column.isna())
print(df_.to_string(index=False), "\n")

# Hence the remaining duplicate is the one whose 'baz' column is not empty
df_dup = df_.drop_duplicates(subset='id')
print(df_dup.to_string(index=False), "\n")

 id  foo  bar  baz
  1  foo None None
  1 None  bar None
  1 None None  baz
  2  foo  bar  baz 

 id foo  bar  baz
  1 foo None None
  2 foo  bar  baz 

 id  foo  bar  baz
  2  foo  bar  baz
  1 None None  baz
  1 None  bar None
  1  foo None None 

 id  foo  bar baz
  2  foo  bar baz
  1 None None baz 



In [33]:
# In this case the best solution is to edit rows: the duplicates get merged into a single one.
# The same sample data as before is used.
duplicated_rows = {
    'id': [1, 1, 1, 2],
    'foo': ['foo', None, None, 'foo'],
    'bar': [None, 'bar', None, 'bar'],
    'baz': [None, None, 'baz', 'baz'],
}
df = pd.DataFrame(duplicated_rows)
print(df.to_string(index=False), "\n")

# Merge the three rows sharing the same `id`: within each group, `first()` picks, column by column,
# the first available (non-null) value, so every None is filled from another duplicate when possible.
rows_by_id = df.groupby('id', as_index=False)
df = rows_by_id.first()
print(df.to_string(index=False), "\n")

 id  foo  bar  baz
  1  foo None None
  1 None  bar None
  1 None None  baz
  2  foo  bar  baz 

 id foo bar baz
  1 foo bar baz
  2 foo bar baz 

