# 4.5 Consistency Check

### This script contains the following points:
#### 1. Import Resources
#### 2. Consistency Checks
#### 2.1. For products.csv
#### 2.2. For orders.csv

### 1. Import Resources

In [7]:
import pandas as pd
import numpy as np
import os

In [8]:
# Store the main project's folder path as a string
path = r'C:\Users\zhoux\Documents\CF-Data Analyst Program\DA-Immersion\10-2023 Instacart Basket Analysis'

In [14]:
# Import the “orders.csv” and “products.csv” data set
df_prods = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'products.csv'), index_col = False)
df_ords = pd.read_csv(os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv'), index_col = False)

### 2. Consistency Checks

#### 2.1 For products.csv

##### Check for missing values

In [15]:
# Find missing values in df_prods
# .isnull() returns a bool, is either 1 or 0. 
# Adds the 1s and 0s together to see the total count of nulls.
df_prods.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [16]:
# Create a subset containing only these 16 values
df_nan = df_prods[df_prods['product_name'].isnull() == True]
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [17]:
df_prods.shape

(49693, 5)

In [18]:
df_prods_clean = df_prods[df_prods['product_name'].isnull() == False]
df_prods_clean.shape

(49677, 5)

##### Check for duplicates

In [19]:
# Create a new subset that contains only duplicated rows
df_dups = df_prods_clean[df_prods_clean.duplicated()]
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [20]:
df_prods_clean.shape

(49677, 5)

In [23]:
df_prods_clean_no_dups = df_prods_clean.drop_duplicates()
df_prods_clean_no_dups.shape

(49672, 5)

##### Check for mixed-type data

In [24]:
# Check for mixed data types

for col in df_prods.columns.tolist():
    # weird is a bool that is either 1 or 0. 
    # If anomaly is indeed detected then weird = 1, print the column name.
    weird = (df_prods[[col]].map(type) != df_prods[[col]].iloc[0].apply(type)).any(axis = 1)
    if len(df_prods[weird]) > 0:
        print(col)

product_name


In [25]:
# turn all values to string
df_prods['product_name'] = df_prods['product_name'].astype('str')

##### Export as .csv

In [27]:
df_prods_clean_no_dups.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'products_checked.csv'), index = False)

#### 2.2 For Orders.csv

##### An overview

In [28]:
df_ords.describe()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710541.0,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,855270.5,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710541.0,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421082.0,3421083.0,206209.0,100.0,6.0,23.0,30.0


The min and max values look just fine.
However, this extra column "Unnamed: 0" seemed to be completely off.
A quick look at the head and tail of this dataframe below suggests this column "Unnamed: 0" is a duplicate of the index python generated.

In [19]:
df_ords.head()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,0,2539329,1,prior,1,2,8,
1,1,2398795,1,prior,2,3,7,15.0
2,2,473747,1,prior,3,3,12,21.0
3,3,2254736,1,prior,4,4,7,29.0
4,4,431534,1,prior,5,4,15,28.0


In [20]:
df_ords.tail()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
3421078,3421078,2266710,206209,prior,10,5,18,29.0
3421079,3421079,1854736,206209,prior,11,4,10,30.0
3421080,3421080,626363,206209,prior,12,1,12,18.0
3421081,3421081,2977660,206209,prior,13,1,12,7.0
3421082,3421082,272231,206209,train,14,6,14,30.0


In [21]:
# Delete this column "Unnamed: 0"
df_ords.drop('Unnamed: 0', axis=1, inplace=True)
df_ords.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


##### Check and fix mixed-type data

In [22]:
# Check for mixed data types
# mixed_dtype is a bool that is either 1 or 0. 
# If anomaly is indeed detected then mixed_dtype = 1, print the column name.
for col in df_ords.columns.tolist():
    mixed_dtype = (df_ords[[col]].applymap(type) != df_ords[[col]].iloc[0].apply(type))
    if len(df_ords[mixed_dtype]) > 0:
        print(col)

order_id
user_id
eval_set
order_number
orders_day_of_week
order_hour_of_day
days_since_prior_order


In [23]:
# The script found mixed data types in the above columns.
# Fix it by assigning the single and primary data type to these columns. 
df_ords['order_id'] = df_ords['order_id'].astype('str')
df_ords['user_id'] = df_ords['user_id'].astype('str')
df_ords['eval_set'] = df_ords['eval_set'].astype('str')
df_ords['order_number'] = df_ords['order_number'].astype('str')
df_ords['orders_day_of_week'] = df_ords['orders_day_of_week'].astype('int64')
df_ords['order_hour_of_day'] = df_ords['order_hour_of_day'].astype('int64')

# see Task 5 & 6 for reasons leaving the column "days_since_prior_order" as is.

##### Check for missing values

In [24]:
df_ords.isnull().sum()

order_id                       0
user_id                        0
eval_set                       0
order_number                   0
orders_day_of_week             0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

All columns except "days_since_prior_order" has missing values. 
Since this column stores the interval between this and the last order, 
a big amount of null values could mean the below possibilities:
1. The order was a customer's first order;
2. The recurring customer used a new user_id to place an order.

The more probably explanation would be point 1, as all customer starts as a new customer. 
Checking the count of unique user_id will help us understand the total number of customers and from there we can compare. 

##### Address the missing values

In [25]:
# As suggested in Task 5, we check the count of total customers.
df_ords['user_id'].nunique()

206209

The count of total customers matches the number of null values in "days_since_prior_order". This confirmed that the nulls were not an error, but are expected.
If we want make the column's data type numeric, as requested in Task 4, a possible way is to replace all nulls with 0.
However, the min value of this column is already 0, which means some customers placed 2+ orders on the same day. Replacing nulls with 0 could bring confusion and inaccuracy.
We need to check the number of these customers.

In [26]:
# Create a subset that only includes customers who made 2+ order a day.
df_ords_0days = df_ords.loc[df_ords['days_since_prior_order'] == 0]
df_ords_0days

Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
8,2295261,1,prior,9,1,16,0.0
43,2557754,4,prior,5,5,13,0.0
189,2148135,17,prior,30,6,15,0.0
296,965160,24,train,19,0,16,0.0
337,3162630,27,prior,24,2,15,0.0
...,...,...,...,...,...,...,...
3420980,4277,206206,prior,47,2,17,0.0
3420985,648225,206206,prior,52,0,19,0.0
3420991,2475308,206206,prior,58,0,15,0.0
3421001,1043943,206206,test,68,0,20,0.0


The aspect ratio of 67755:206209 is too high. If we replace the nulls with 0, it will mess up the data and the statistics. Also, the nulls served a purpose in pointing out information about the first order of every new customer. This could be very important because marketing will always have a focus on aquring new customers. As a result, we either need to find an alternative for null replacement or not do anything at all.

This conclusion bring us to leave the column "days_since_prior_order" with its mixed values as is. If any analysis should be conducted using this column, we should just create new dataframes to separate the nulls from the rest.

##### Check and address duplicates

In [27]:
df_ords_dups= df_ords[df_ords.duplicated()]
df_ords_dups

Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order


There are no more duplicated rows in the dataframe. Also, no duplicated columns remain because we removed it in task 2.

The duplicated column was already removed in Task 2.
In Achievement 4.4, we worked on the original dataframe "orders.csv". There wasn't this extra column "Unnamed: 0". So we could safely assume that this column came into place during the data importing at the beginning of this achievement, and we can go ahead delete this column.

##### Export as .csv

In [29]:
# Export .csv
df_ords.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'orders_checked.csv'), index = False)