# 4.5 DATA CONSISTENCY CHECKS 

## CONTENTS:
1. Import libraries and data. 
2. Check and clean data.
3. Export data.

### 1. Import libraries

In [None]:
import pandas as pd
import numpy as np
import os

In [12]:
# define path
path = r'C:\Users\susan\Documents\data analytics\Instacart Basket Analysis\02 Data'

In [14]:
# import products (original)
df_prods = pd.read_csv(os.path.join(path, 'Original Data', 'products.csv'), index_col=False)

In [16]:
# import departments wrangled
df_ords = pd.read_csv(os.path.join(path, 'Prepared Data', 'orders_wrangled.csv'), index_col=False)

In [18]:
df_ords.describe()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,11.0,11.0,11.0,11.0,11.0,10.0
mean,1923450.0,1.0,6.0,2.636364,10.090909,19.0
std,1071950.0,0.0,3.316625,1.286291,3.477198,9.030811
min,431534.0,1.0,1.0,1.0,7.0,0.0
25%,869017.0,1.0,3.5,1.5,7.5,14.25
50%,2295261.0,1.0,6.0,3.0,8.0,19.5
75%,2544846.0,1.0,8.5,4.0,13.0,26.25
max,3367565.0,1.0,11.0,4.0,16.0,30.0


In [20]:
# Create a dataframe
df_test = pd.DataFrame()

In [22]:
# Create a mixed type column
df_test['mix'] = ['a', 'b', 1, True]

In [24]:
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [32]:
# Check for mixed types
for col in df_test.columns:
    expected_type = type(df_test[col].iloc[0])
    weird = df_test[col].apply(lambda x: type(x) != expected_type)
    if weird.any():
         print(col)

mix


In [34]:
# Convert column data tapa from numeric to string
df_test['mix'] = df_test['mix'].astype('str')

In [36]:
# Find missing values in df_prods 
df_prods.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [48]:
# View missing values from above
df_nan = df_prods[df_prods['product_name'].isnull() == True]


In [50]:
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [52]:
# Run df_prods.shape to check rows (to compare after ops)
df_prods.shape

(49693, 5)

In [54]:
# Create a new data frame with no missing values (false)
df_prods_clean = df_prods[df_prods['product_name'].isnull() == False]

In [56]:
# Check if nr of rows have decreased with .shape command
df_prods_clean.shape

(49677, 5)

In [58]:
# Create a subset to df_prods_clean with records that are duplicated
df_dups = df_prods_clean[df_prods_clean.duplicated()]

In [60]:
# Display the subset
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [64]:
# Count rows in the clean df
df_prods_clean.shape

(49677, 5)

In [66]:
# Create a dataset with no duplicates (unique rows of df_prods_clean)
df_prods_clean_no_dups = df_prods_clean.drop_duplicates()

In [68]:
df_prods_clean_no_dups.shape

(49672, 5)

In [70]:
# EXERCISE 4.5

### 2. Check and clean data

In [74]:
# (2) Check data
df_prods.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49693.0,49693.0,49693.0,49693.0
mean,24844.345139,67.770249,11.728433,9.994136
std,14343.717401,38.316774,5.850282,453.519686
min,1.0,1.0,1.0,1.0
25%,12423.0,35.0,7.0,4.1
50%,24845.0,69.0,13.0,7.1
75%,37265.0,100.0,17.0,11.2
max,49688.0,134.0,21.0,99999.0


In [None]:
# Inconsistencies in df_ords (above the exercise section)
# Order day of week cannot have a max of 4 since there are more days.
# Inconsistencies in df_ords
# It is strange that the minimum value is the same over all columns

In [111]:
# (3) Check for mixed types in df_prods
for col in df_prods.columns:
    non_null_values = df_prods[col].dropna()
    if not non_null_values.empty:
        expected_type = type(non_null_values.iloc[0])
        weird = df_prods[col].dropna().apply(lambda x: type(x) != expected_type)
        if weird.any():
            print(col)
            

product_id
aisle_id
department_id
prices


In [113]:
# (3) Check for mixed types in df_ords
for col in df_ords.columns:
    non_null_values = df_ords[col].dropna()
    if not non_null_values.empty:
        expected_type = type(non_null_values.iloc[0])
        weird = df_ords[col].dropna().apply(lambda x: type(x) != expected_type)
        if weird.any():
            print(col)
            

order_id
user_id
order_number
orders_day_of_week
order_hour_of_day
days_since_prior_order


In [135]:
# (4) Convert column data types from numeric to string df_prods
cols_to_convert = ['product_id', 'aisle_id', 'department_id', 'prices']
df_prods[cols_to_convert] = df_prods[cols_to_convert].astype('string')

In [141]:
# (4) Convert column data types from numeric to string df_ords
cols_to_convert2 = ['order_id', 'user_id', 'order_number', 'orders_day_of_week', 'order_hour_of_day', 'days_since_prior_order']
df_ords[cols_to_convert2] = df_ords[cols_to_convert2].astype('string')

In [143]:
# (5) Find missing values in df_ords (prods was done above)
df_ords.isnull().sum()

order_id                  0
user_id                   0
validation                0
order_number              0
orders_day_of_week        0
order_hour_of_day         0
days_since_prior_order    1
dtype: int64

In [None]:
# (5) There is one missing value in days_since_prior_order.
# If the last order was done today then no days have passed. How is the 0 registered? 
# The missing value can actually also be a real missing entry (data inconsistency). 


In [147]:
# (6) View missing values from above
df_nan = df_ords[df_ords['days_since_prior_order'].isnull() == True]


In [149]:
# (6) Display the missing value in df_ords
df_nan

Unnamed: 0,order_id,user_id,validation,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,


In [171]:
# Check the info.  
df_ords.describe()

Unnamed: 0,order_id,user_id,validation,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,11,11,11,11,11,11,10.0
unique,11,1,2,11,4,7,9.0
top,2539329,1,prior,1,4,8,14.0
freq,1,11,10,1,4,3,2.0


In [173]:
# Check nr of records in df_ords
df_ords.shape

(11, 7)

In [175]:
df_ords.head(20)

Unnamed: 0,order_id,user_id,validation,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0
5,3367565,1,prior,6,2,7,19.0
6,550135,1,prior,7,1,9,20.0
7,3108588,1,prior,8,1,14,14.0
8,2295261,1,prior,9,1,16,0.0
9,2550362,1,prior,10,4,8,30.0


In [179]:
# Check for duplicates
df_dups_ords = df_ords [df_ords.duplicated()]

In [181]:
df_dups_ords


Unnamed: 0,order_id,user_id,validation,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order


In [None]:
# I cannot find any duplicates in my df_ords data. 
# I will only address the df_prods duplicates.

In [205]:
from pathlib import Path
path = Path("C:/Users/susan/Documents/data analytics/Instacart Basket Analysis/02 Data/Prepared Data")

In [208]:
print(path)

C:\Users\susan\Documents\data analytics\Instacart Basket Analysis\02 Data\Prepared Data


### 3. Export data.

In [211]:
# (9) export the df_ords to orders_wrangled
df_ords.to_csv(path/'orders_checked.csv', index=False)

In [213]:
# (9) export the df_prods_clean_no_dups to orders_wrangled
df_prods_clean_no_dups.to_csv(path/'products_checked.csv', index=False)