# 4.5 - Data Consistency Checks

## This script contains the following points:
### 1. Importing Libraries
### 2. Importing Data
### 3. Mixed-Type Data
### 4. Missing Values
### 5. Duplicates
### 6. Task 4.5 
### 7. Exporting Dataframes

# Importing Libraries

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import os

# Importing Data

In [2]:
# Create path to main folder
# Root of the project; the CSV reads and exports in this notebook are built from it.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
path = r'C:\Users\nguye\OneDrive\02-2022 Instacart Basket Analysis'

In [3]:
# Load the wrangled orders table and the original products table
df_ords = pd.read_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv'),
    index_col=False,
)
df_prods = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'products.csv'),
    index_col=False,
)

# Mixed-Type Data

In [4]:
# Create an empty dataframe used to demonstrate the mixed-type check
df_test = pd.DataFrame()

In [5]:
# Create a mixed-type column: two strings, an integer and a boolean
df_test['mix'] = ['a', 'b', 1, True]

In [6]:
# Checks for mixed-type columns
# A column is reported when its values have more than one Python type.
# Series.map(type) replaces DataFrame.applymap, which is deprecated in
# pandas >= 2.1, and counting distinct types also works on an empty column.
for col in df_test.columns.tolist():
    value_types = df_test[col].map(type)
    if value_types.nunique() > 1:
        print(col)

mix


In [7]:
# Convert every value in the mixed-type column to a string
df_test['mix'] = df_test['mix'].astype(str)

# Missing Values

In [8]:
# Count the missing values in each column of df_prods
df_prods.isna().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [9]:
# Keep only the rows whose product_name is missing
df_nan = df_prods[df_prods['product_name'].isna()]

In [10]:
# Display the rows with a missing product_name
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [11]:
# Check the shape of df_prods before the rows with missing product names are removed
df_prods.shape

(49693, 5)

In [12]:
# Keep only the rows that have a product_name
df_prods_clean = df_prods[df_prods['product_name'].notna()]

In [13]:
# Check the shape after removing the rows with missing product names;
# the row count should have dropped by the number of missing names
df_prods_clean.shape

(49677, 5)

# Duplicates

In [14]:
# Collect the rows of df_prods_clean that fully repeat an earlier row
df_dups = df_prods_clean.loc[df_prods_clean.duplicated()]

In [15]:
# Display the duplicated product rows
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [16]:
# Check shape of df_prods_clean before the duplicates are dropped
df_prods_clean.shape

(49677, 5)

In [17]:
# Build a duplicate-free copy, retaining the first occurrence of each repeated row
df_prods_clean_no_dups = df_prods_clean.drop_duplicates(keep='first')

In [18]:
# Verify that duplicates were removed: the row count should be lower than
# the df_prods_clean row count by the number of duplicated rows found
df_prods_clean_no_dups.shape

(49672, 5)

# Task 4.5 - Data Consistency Checks

## Question 2

In [19]:
# Check descriptive statistics for df_ords dataframe
# (count, mean, std, min, quartiles and max per column) to spot implausible values
df_ords.describe()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710541.0,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,855270.5,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710541.0,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421082.0,3421083.0,206209.0,100.0,6.0,23.0,30.0


### One issue to note is the "Unnamed: 0" column. This column holds no meaningful data and will be removed; it is a leftover row index that was written to the file during one of the previous steps. All other columns show no inconsistencies.

In [20]:
# Drop the leftover index column 'Unnamed: 0'
df_ords = df_ords.drop('Unnamed: 0', axis=1)

## Question 3

In [21]:
# Check for mixed-typed data
# A column is reported when its values have more than one Python type.
# Series.map(type) replaces DataFrame.applymap, which is deprecated in
# pandas >= 2.1, and counting distinct types also works on an empty column.
for col in df_ords.columns.tolist():
    value_types = df_ords[col].map(type)
    if value_types.nunique() > 1:
        print(col)

### No mixed-type data was found in the dataframe

## Question 5

In [22]:
# Count the missing values in each column of df_ords
df_ords.isna().sum()

order_id                       0
user_id                        0
order_number                   0
orders_day_of_week             0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

In [23]:
# Keep only the orders whose days_since_prior_order is missing
df_ords_nan = df_ords[df_ords['days_since_prior_order'].isna()]

In [24]:
# Display the orders with a missing days_since_prior_order
df_ords_nan

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
11,2168274,2,1,2,11,
26,1374495,3,1,1,14,
39,3343014,4,1,6,11,
45,2717275,5,1,3,12,
...,...,...,...,...,...,...
3420930,969311,206205,1,4,12,
3420934,3189322,206206,1,3,18,
3421002,2166133,206207,1,6,19,
3421019,2227043,206208,1,1,15,


In [25]:
# Descriptive statistics of the rows with a missing days_since_prior_order,
# to see what these rows have in common
df_ords_nan.describe()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,206209.0,206209.0,206209.0,206209.0,206209.0,0.0
mean,1708462.0,103105.0,1.0,2.754118,13.626597,
std,988129.9,59527.555167,0.0,2.076205,4.223769,
min,20.0,1.0,1.0,0.0,0.0,
25%,850730.0,51553.0,1.0,1.0,11.0,
50%,1706246.0,103105.0,1.0,3.0,14.0,
75%,2564292.0,154657.0,1.0,5.0,17.0,
max,3421081.0,206209.0,1.0,6.0,23.0,


### The column "days_since_prior_order" contains 206,209 NaN values. Every one of these rows has order_number = 1, i.e. it is the user's first order, so there is no prior order from which "days_since_prior_order" could be calculated.

## Question 6

### These missing values are significant in that no logical value can be imputed in their place, as each one belongs to a user's first purchase. Once a user makes another purchase, that later order will carry a value, but the first order cannot, so in this case the missing values will be left as is.

## Question 7

In [26]:
# Find duplicates in df_ords
# Search the full orders dataframe, not df_ords_nan: that subset holds only the
# rows with a missing days_since_prior_order and would hide duplicates elsewhere.
df_ords_dups = df_ords[df_ords.duplicated()]

In [27]:
# Display any duplicated rows found (an empty table means none)
df_ords_dups

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order


### No duplicates were found in the dataframe

## Question 8

### As no duplicates were found in the dataframe, no action was necessary

# Exporting Dataframes

In [30]:
# Export the checked products and orders dataframes to the Prepared Data folder.
# index=False keeps the row index out of the CSV; writing it is what creates the
# meaningless 'Unnamed: 0' column when the file is read back in.
# NOTE(review): later steps that drop 'Unnamed: 0' after loading these files
# will no longer find that column - confirm and adjust them.
df_prods_clean_no_dups.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'products_checked.csv'), index=False)
df_ords.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'orders_checked.csv'), index=False)