# 4.5 Data Consistency Checks

### 01. Importing libraries

In [None]:
# Import libraries

import pandas as pd
import numpy as np
import os

In [None]:
# Load the original products table and the wrangled orders table from the project folder

path = r'C:\Users\walls\Documents\Coding\Data Analysis\CareerFoundry\Data Immersion A4\Instacart Basket Analysis 01-25'
prods_file = os.path.join(path, 'Data', 'Original Data', 'products.csv')
ords_file = os.path.join(path, 'Data', 'Prepared Data', 'orders_wrangled.csv')
df_prods = pd.read_csv(prods_file, index_col=False)
df_ords = pd.read_csv(ords_file, index_col=False)

In [None]:
# Preview the first rows of df_prods
df_prods.head()

In [None]:
# Show the (rows, columns) dimensions of df_prods
df_prods.shape

In [None]:
# Preview the first rows of df_ords
df_ords.head()

In [None]:
# Show the (rows, columns) dimensions of df_ords
df_ords.shape

##### Observations: 
df_ords has a new column, "Unnamed: 0"

### 02. Data Consistency Checks

#### Mixed Data

In [None]:
# Create an empty df to hold a mixed-type test column

df_test = pd.DataFrame()

In [None]:
# Create a mixed-type column: two strings, an integer and a boolean

df_test['mix'] = ['a', 'b', 1, True]

In [None]:
# Preview df_test
df_test.head()

In [None]:
# Check for mixed types: print every column whose cells differ in type from the first row's cell

for col in df_test.columns:
    cell_types = df_test[[col]].map(type)
    first_row_types = cell_types.iloc[0]
    weird = (cell_types != first_row_types).any(axis=1)
    if weird.any():
        print(col)

In [None]:
# Convert every value in the 'mix' column to str so the column holds a single type

df_test['mix'] = df_test['mix'].astype('str')

In [None]:
# Show the dtype of the 'mix' column
df_test['mix'].dtype

#### Missing Values

In [None]:
# Check for missing values: count the nulls in each column of df_prods

df_prods.isnull().sum()

##### Observations: 
column product_name has 16 missing values

In [None]:
# Create subset of the rows whose product_name is missing
# (isnull() already returns a boolean mask, so it is used directly instead of comparing it to True)
df_nan = df_prods[df_prods['product_name'].isnull()]

In [None]:
# Display the rows with a missing product_name
df_nan

In [None]:
# Check stats: summary statistics of df_prods (describe() covers the numeric columns by default)

df_prods.describe()

##### Observations:
1. product_name does not appear in the describe() output because it is a text (str) column, not a numeric one -- so its missing values do not affect these statistics (can ignore/void)
2. price column has a value of 99,999 for a product -- a mistake must be somewhere

In [None]:
# Dimensions of df_prods, including the rows with a missing product_name
df_prods.shape

In [None]:
# Keep only the rows that have a product_name
# (notnull() gives the same mask as "isnull() == False" without the boolean comparison)
df_prods_clean = df_prods[df_prods['product_name'].notnull()]

In [None]:
# Dimensions of df_prods_clean, i.e. without the rows that have a missing product_name
df_prods_clean.shape

#### Duplicates

In [None]:
# Find duplicates: collect the rows that fully repeat an earlier row
dup_mask = df_prods_clean.duplicated()
df_dups = df_prods_clean.loc[dup_mask]

In [None]:
# Display the duplicated rows
df_dups

In [None]:
# Double-check the shape of df_prods_clean while it still contains the duplicated rows

df_prods_clean.shape

In [None]:
# Build a de-duplicated df, keeping the first occurrence of each repeated row

df_prods_clean_no_dups = df_prods_clean.drop_duplicates(keep='first')

In [None]:
# Dimensions of the de-duplicated products df
df_prods_clean_no_dups.shape

## Task

In [None]:
# Summary statistics of df_ords, used to spot implausible values
df_ords.describe()

##### Observations: 
1. Checking the statistics on the df_ords shows no concerning values.

In [None]:
# Check for mixed dtypes: print every column whose cells differ in type from the first row's cell

for col in df_ords.columns:
    cell_types = df_ords[[col]].map(type)
    first_row_types = cell_types.iloc[0]
    weird = (cell_types != first_row_types).any(axis=1)
    if weird.any():
        print(col)

##### Observations:
1. No mixed columns were found.

In [None]:
# Check for missing values: count the nulls in each column of df_ords

df_ords.isnull().sum()

##### Observations: 
1.  206,209 missing values were found for 'days_since_prior_order' column
2.  206,209 is also the number of user_ids. 

# Find out the missing values based on user_id count
df_ords[df_ords['days_since_prior_order'].isnull()].groupby('user_id').count().describe()

In [None]:
# Display df_ords
df_ords

##### Observations:
1. It is likely these missing values come from the "0" variable, the first order of each user.
2. Deleting the missing values doesn't seem necessary because every user has exactly 1 missing value in the days_since_prior_order column

In [None]:
# Check for duplicates: show any row of df_ords that fully repeats an earlier row
df_ords[df_ords.duplicated()]

##### Observations:
1. No duplicates found

In [None]:
# Remove the "Unnamed: 0" column
# Store the result under a new name, df_ords_clean

df_ords_clean = df_ords.drop(columns=['Unnamed: 0'])

In [None]:
# Display df_ords_clean
df_ords_clean

In [None]:
# Preview the first rows of df_ords_clean
df_ords_clean.head()

#### Summary

1. Renamed orders_wrangled to df_ords
2. Cleaned df_prods -- now df_prods_clean_no_dups
3. Cleaned df_ords -- now df_ords_clean
4. Removed 16 missing values from product_name column
5. Removed 5 duplicates from df_prods 
6. df_ords_clean shape 3,421,083 rows and 6 columns
7. df_prods_clean_no_dups shape 49,672 rows and 5 columns

### 04. Exporting

In [None]:
# Export the cleaned dfs to the Prepared Data folder.
# index=False keeps the row index out of the files: writing it adds an extra
# unnamed first column, which comes back as "Unnamed: 0" when the CSV is
# read in again.

prepared_dir = os.path.join(path, 'Data', 'Prepared Data')

df_prods_clean_no_dups.to_csv(os.path.join(prepared_dir, 'products_checked.csv'), index=False)

df_ords_clean.to_csv(os.path.join(prepared_dir, 'orders_checked.csv'), index=False)