## Script Contents
### Check for mixed types
### Check for missing values
### Look for duplicates

In [1]:
# Importing libraries and data
import pandas as pd
import numpy as np
import os

In [2]:
# Root folder of the project; every data path below is built from it with os.path.join.
# NOTE(review): absolute, machine-specific path — update it before running on another machine.
path = r'C:\Users\anon\Documents\CareerFoundry\Data Analytics Immersion\Instacart Basket Analysis'

In [3]:
# Load the raw orders data from the project's 'Original Data' folder
df_ords = pd.read_csv(
    os.path.join(path, 'Data', 'Original Data', 'orders.csv'),
    index_col=False,
)

In [4]:
# Load the raw products data from the project's 'Original Data' folder
df_prods = pd.read_csv(
    os.path.join(path, 'Data', 'Original Data', 'products.csv'),
    index_col=False,
)

In [5]:
# Create an empty dataframe to use as a small test case for the mixed-type check
df_test = pd.DataFrame()

In [6]:
# Create a mixed-type column: two strings, an integer and a boolean
df_test['mix'] = ['a', 'b', 1, True]

In [7]:
# Preview the test dataframe
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [8]:
# Check for mixed types: print every column whose values are not all of one Python type
for col in df_test.columns.tolist():
  # Series.map(type) gives the Python type of each cell; more than one distinct
  # type means the column is mixed. An empty column has no types and is skipped.
  value_types = df_test[col].map(type)
  if value_types.nunique() > 1:
    print (col)

mix


In [9]:
# Change all values in mix column to strings so the column holds a single type
# (this overwrites the column in df_test in place)
df_test['mix'] = df_test['mix'].astype('str')

In [10]:
# Count the missing values in each column of the 'products' data
df_prods.isna().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [11]:
# Assign the rows with a missing product name to a new dataframe
# (isnull() already returns a boolean mask, so no comparison with True is needed)
df_nan = df_prods[df_prods['product_name'].isnull()]

In [12]:
# Display the rows whose product name is missing
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [13]:
# Row and column count before removing rows with missing product names
df_prods.shape

(49693, 5)

In [14]:
# Assign the rows that have a product name to a new dataframe
# (notnull() is the direct complement of isnull(), so no comparison with False is needed)
df_prods_clean = df_prods[df_prods['product_name'].notnull()]

In [15]:
# Display the products data with the missing-name rows removed
df_prods_clean

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3
...,...,...,...,...,...
49688,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,5.3
49689,49685,En Croute Roast Hazelnut Cranberry,42,1,3.1
49690,49686,Artisan Baguette,112,3,7.8
49691,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,4.7


In [16]:
# Collect the rows that are exact copies of an earlier row (full duplicates)
df_dups = df_prods_clean.loc[df_prods_clean.duplicated()]

In [17]:
# Display the fully duplicated product rows
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [18]:
# Row and column count before dropping duplicates
df_prods_clean.shape

(49677, 5)

In [19]:
# Drop the full duplicates into a new dataframe; df_prods_clean itself is left unchanged
df_prods_clean_no_dups = df_prods_clean.drop_duplicates()

In [20]:
# Row and column count after dropping duplicates
df_prods_clean_no_dups.shape

(49672, 5)

In [21]:
# Export the cleaned, de-duplicated products data to the 'Prepared Data' folder.
# NOTE(review): to_csv writes the dataframe index as an extra unnamed first column by default —
# confirm downstream readers expect that column; otherwise pass index=False.
df_prods_clean_no_dups.to_csv(os.path.join(path, 'Data','Prepared Data', 'products_checked.csv'))

# 2) Run the df.describe() function on df_ords, share in a markdown cell whether anything about the data looks off or should be investigated further.

In [22]:
# Summary statistics for the numeric columns of the orders data
df_ords.describe()

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


### I am surprised by the max values for 'order_number' (100) and 'days_since_prior_order' (30). These values would lead me to ask the client whether their system caps records at 100 orders per customer, and whether it drops records of customers who do not place another order within 30 days. I assume the 'order_dow' column is zero-indexed, so that even though the max is 6, there are still 7 distinct values.

# 3 and 4) Check for mixed-type data in your df_ords dataframe. If you find mixed-type data, fix it. 

In [23]:
# Check for mixed types: print every column whose values are not all of one Python type
for col in df_ords.columns.tolist():
  # Series.map(type) gives the Python type of each cell; more than one distinct
  # type means the column is mixed. An empty column has no types and is skipped.
  value_types = df_ords[col].map(type)
  if value_types.nunique() > 1:
    print (col)

### There do not appear to be any mixed type columns in the df_ords dataframe

# 5 and 6) Run a check for missing values in your df_ords dataframe. Address the missing values using an appropriate method.

In [24]:
# Count the missing values in each column of the orders data
df_ords.isna().sum()

order_id                       0
user_id                        0
eval_set                       0
order_number                   0
order_dow                      0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

### I am curious as to whether the null values found in the 'days_since_prior_order' column indicate orders that have no prior orders. It appears that these null values correspond with an order number of 1 for the user id they are associated with.

### In this particular case, I would not make any edits to the dataframe. The only potential option would be to add a flag indicating that the order is a first order; however, that would most intuitively be done using a string, which could lead to issues with mixed types.

# 7 and 8) Run a check for duplicate values in your df_ords data. Address the duplicates using an appropriate method.

In [25]:
# Collect the order rows that are exact copies of an earlier row (full duplicates)
df_ords_dups = df_ords.loc[df_ords.duplicated()]

In [26]:
# Display the fully duplicated order rows (empty output means none were found)
df_ords_dups

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order


### There are no duplicates found. I will therefore not be making any changes.

In [27]:
# Export the checked orders data to the 'Prepared Data' folder.
# NOTE(review): to_csv writes the dataframe index as an extra unnamed first column by default —
# confirm downstream readers expect that column; otherwise pass index=False.
df_ords.to_csv(os.path.join(path, 'Data','Prepared Data', 'orders_checked.csv'))