# Contents
### 1. Import Libraries
### 2. Data Consistency Checks
### 3. Mixed Type Data
### 4. Missing Values
### 5. Duplicates
### 6. Export DataFrame

# 1. Import Libraries

In [3]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [4]:
# Project root folder. Defaults to the original local location; set the
# INSTACART_PROJECT_DIR environment variable to run the notebook on another
# machine without editing this cell.
path = os.environ.get('INSTACART_PROJECT_DIR', r'/Users/wiltonngo/InstaCart Basket Analysis')

In [24]:
# Import products.csv from the 'Original Data' folder.
# index_col=False tells pandas not to use the first CSV column as the index.
df_prods = pd.read_csv(os.path.join(path, 'Data', 'Original Data', 'products.csv'), index_col=False)

In [25]:
# Load the wrangled orders data from the 'Prepared Data' folder.
# NOTE(review): later outputs show an 'Unnamed: 0' column in this frame —
# presumably an index saved by an earlier export; confirm whether it is needed.
orders_csv = os.path.join(path, 'Data', 'Prepared Data', 'orders_wrangled.csv')
df_ords = pd.read_csv(orders_csv)

# 2. Data Consistency Checks

In [26]:
# Summary statistics for the products columns, used to spot implausible values.
df_prods.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49693.0,49693.0,49693.0,49693.0
mean,24844.345139,67.770249,11.728433,9.994136
std,14343.717401,38.316774,5.850282,453.519686
min,1.0,1.0,1.0,1.0
25%,12423.0,35.0,7.0,4.1
50%,24845.0,69.0,13.0,7.1
75%,37265.0,100.0,17.0,11.2
max,49688.0,134.0,21.0,99999.0


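### The minimum (1.0) and maximum (99,999.0) values for `prices` could be errors — the maximum in particular is far above the 75th percentile (11.2).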

In [27]:
# Show the product row(s) carrying the suspicious maximum price.
df_prods.loc[df_prods['prices'].eq(99999)]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33666,33664,2 % Reduced Fat Milk,84,16,99999.0


In [28]:
# Summary statistics for the orders columns, used to check that each value range is plausible.
df_ords.describe()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_last_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710541.0,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,855270.5,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710541.0,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421082.0,3421083.0,206209.0,100.0,6.0,23.0,30.0


### Sanity-check the value ranges. The "order_day_of_week" column can't have a maximum larger than 6, as there are only 7 days in a week (coded 0–6). You wouldn't expect to see a negative minimum value for "days_since_last_order", as this would imply a negative number of days. Nor would you expect a maximum larger than 23 for "order_hour_of_day", as there are only 24 hours in a day (coded 0–23). The summary above is consistent with all three constraints.

## 3. Mixed Type Data

In [29]:
# Empty frame that the next cell fills with a small mixed-type example column.
df_test = pd.DataFrame()

In [30]:
# Example column that deliberately mixes strings, an integer and a boolean.
df_test = df_test.assign(mix=['a', 'b', 1, True])

In [31]:
# Check every df_ords column for mixed data types.
# A column is "mixed" when its values have more than one Python type.
# Series.map(type) is used instead of DataFrame.applymap, which is
# deprecated since pandas 2.1; Series.map works on old and new versions alike.
for col in df_ords.columns.tolist():
    value_types = df_ords[col].map(type)
    if value_types.nunique() > 1:
        print(col, ' mixed')
    else:
        print(col, ' consistent')

Unnamed: 0  consistent
order_id  consistent
user_id  consistent
eval_set  consistent
order_number  consistent
order_day_of_week  consistent
order_hour_of_day  consistent
days_since_last_order  consistent


In [32]:
# Check every df_prods column for mixed data types.
# A column is "mixed" when its values have more than one Python type; a text
# column that also holds missing values (NaN is a float) is reported as mixed.
# Series.map(type) is used instead of DataFrame.applymap, which is
# deprecated since pandas 2.1; Series.map works on old and new versions alike.
for col in df_prods.columns.tolist():
    value_types = df_prods[col].map(type)
    if value_types.nunique() > 1:
        print(col, ' mixed')
    else:
        print(col, ' consistent')

product_id  consistent
product_name  mixed
aisle_id  consistent
department_id  consistent
prices  consistent


# 4. Missing Values

In [33]:
# Count the missing values in each products column.
df_prods.isna().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [34]:
# Collect the products whose name is missing so they can be inspected.
df_nan = df_prods[df_prods['product_name'].isna()]
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


### To address the missing values, I would remove (filter out) the rows with missing data.

In [35]:
# Keep only the products that have a name.
df_prods_clean = df_prods[df_prods['product_name'].notna()]
df_prods_clean

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3
...,...,...,...,...,...
49688,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,5.3
49689,49685,En Croute Roast Hazelnut Cranberry,42,1,3.1
49690,49686,Artisan Baguette,112,3,7.8
49691,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,4.7


In [36]:
# Shape after dropping the rows with a missing product_name.
df_prods_clean.shape

(49677, 5)

In [37]:
# Shape of the original products frame, for comparison with the cleaned one.
df_prods.shape

(49693, 5)

In [38]:
# Count the missing values in each orders column.
df_ords.isna().sum()

Unnamed: 0                    0
order_id                      0
user_id                       0
eval_set                      0
order_number                  0
order_day_of_week             0
order_hour_of_day             0
days_since_last_order    206209
dtype: int64

In [40]:
# Collect the orders that have no days_since_last_order value.
df_ords_nan = df_ords[df_ords['days_since_last_order'].isna()]
df_ords_nan

Unnamed: 0.1,Unnamed: 0,order_id,user_id,eval_set,order_number,order_day_of_week,order_hour_of_day,days_since_last_order
0,0,2539329,1,prior,1,2,8,
11,11,2168274,2,prior,1,2,11,
26,26,1374495,3,prior,1,1,14,
39,39,3343014,4,prior,1,6,11,
45,45,2717275,5,prior,1,3,12,
...,...,...,...,...,...,...,...,...
3420930,3420930,969311,206205,prior,1,4,12,
3420934,3420934,3189322,206206,prior,1,3,18,
3421002,3421002,2166133,206207,prior,1,6,19,
3421019,3421019,2227043,206208,prior,1,1,15,


### The values are missing because each of these rows is a customer's first order (order_number = 1), so there is no previous purchase to count days from.

### Addressing missing Values for Orders

### I chose not to address the missing values because they belong to new customers, so they don't have data for that column.

## Addressing missing Values for Products

In [44]:
# Rebuild the cleaned products frame: every row of df_prods that has a product name.
df_prods_clean = df_prods.loc[df_prods['product_name'].notna()]
df_prods_clean

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3
...,...,...,...,...,...
49688,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,5.3
49689,49685,En Croute Roast Hazelnut Cranberry,42,1,3.1
49690,49686,Artisan Baguette,112,3,7.8
49691,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,4.7


### I chose to filter out the rows with missing values in `product_name`. Because the column holds strings, the gaps cannot be filled with a mean or median, so removing the rows is the best method.

# 5. Duplicates

In [45]:
# Rows that exactly repeat an earlier row (the first occurrence is not flagged).
df_dups = df_prods_clean.loc[df_prods_clean.duplicated(keep='first')]
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [46]:
# Drop the repeated rows, keeping the first occurrence of each.
df_prods_clean_no_dups = df_prods_clean.drop_duplicates(keep='first')
df_prods_clean_no_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3
...,...,...,...,...,...
49688,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,5.3
49689,49685,En Croute Roast Hazelnut Cranberry,42,1,3.1
49690,49686,Artisan Baguette,112,3,7.8
49691,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,4.7


In [47]:
# Shape after removing the duplicate rows.
df_prods_clean_no_dups.shape

(49672, 5)

In [1]:
# Preview the first rows of the orders data.
# NOTE(review): the recorded NameError suggests this cell was last run before
# df_ords was loaded (e.g. right after a kernel restart) — re-run the notebook
# from top to bottom to refresh the output.
df_ords.head()

NameError: name 'df_ords' is not defined

In [49]:
# Step 7. Check the df_ords data for fully duplicated rows:
df_ords_dups = df_ords.loc[df_ords.duplicated(keep='first')]
df_ords_dups

Unnamed: 0.1,Unnamed: 0,order_id,user_id,eval_set,order_number,order_day_of_week,order_hour_of_day,days_since_last_order


### No Duplicates Found

# 6. Export DataFrame

In [51]:
# Export the checked orders data to the 'Prepared Data' folder.
# NOTE(review): the index is written as an extra unnamed column; consider
# index=False if downstream notebooks should not receive it — confirm first.
df_ords.to_csv(path_or_buf=os.path.join(path, 'Data', 'Prepared Data', 'orders_checked.csv'))

In [None]:
# Export the cleaned, de-duplicated products data to the 'Prepared Data' folder.
# The path goes through the same 'Data' folder as every other path in this notebook.
df_prods_clean_no_dups.to_csv(os.path.join(path, 'Data', 'Prepared Data', 'products_checked.csv'))