# 01. Import libraries and data

In [52]:
# Import libraries
import pandas as pd
import numpy as np
import os
# Root folder of the project; every data file below is resolved relative to it
path = r'/home/scruffy/anaconda_projects/Instacart Basket Analysis/'

# Orders table, read from the Prepared Data folder
df_ords = pd.read_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv')
)

# Products and order-products tables, read from the Original Data folder
df_prods = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'products.csv')
)
df_ords_prods = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'orders_products_prior.csv')
)

In [None]:
# Rows of df_prods whose product_id repeats that of an earlier row
df_dups = df_prods.loc[df_prods.duplicated(subset='product_id')]
print(df_dups.shape)

# 02. Addressing mixed-type columns

In [8]:
# Create test dataframe
df_test = pd.DataFrame()

# Created mixed-type column
df_test['Mix'] = ['a', 'b', 1, True]

# Check for mixed types
for col in df_test.columns.tolist():
  weird = (df_test[[col]].map(type) != df_test[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_test[weird]) > 0:
    print (col)

# Correct data type for df_test
df_test['Mix'] = df_test['Mix'].astype('str')
print(df_test['Mix'].dtype)

Mix
object


# 03. Handling missing values

In [54]:
# Tally the null values in each column of df_ords_prods
# (the orders_products_prior data)
df_ords_prods.isna().sum()

order_id             0
product_id           0
add_to_cart_order    0
reordered            0
dtype: int64

In [11]:
# Keep only the products rows whose product_name is missing
df_prods_nan = df_prods[df_prods['product_name'].isna()]
print(df_prods_nan)

       product_id product_name  aisle_id  department_id  prices
33             34          NaN       121             14    12.2
68             69          NaN        26              7    11.8
115           116          NaN        93              3    10.8
261           262          NaN       110             13    12.1
525           525          NaN       109             11     1.2
1511         1511          NaN        84             16    14.3
1780         1780          NaN       126             11    12.3
2240         2240          NaN        52              1    14.2
2586         2586          NaN       104             13    12.4
3159         3159          NaN       126             11    13.1
3230         3230          NaN       120             16    14.4
3736         3736          NaN        41              8    14.8
4283         4283          NaN        77              7    14.4
4790         4790          NaN        91             16    14.5
38187       38183          NaN        39

In [12]:
# Keep only the products rows that do have a product_name
df_prods_clean = df_prods[df_prods['product_name'].notna()]

# 04. Handling duplicates

In [58]:
# Identify entries with identical values in every column
df_dups = df_ords_prods[df_ords_prods.duplicated()]

# Check how many fully duplicated rows were found (shape of the duplicates subset)
print(df_dups.shape)

# Drop duplicates
ords_prods_no_dups = df_ords_prods.drop_duplicates()

# Check shape of data after dropping duplicates
print(ords_prods_no_dups.shape)

# De-duplicate the cleaned products table as well. Later cells read
# df_prods_clean_no_dups, which was not assigned by any cell, so a fresh
# Restart & Run All stopped with a NameError at its first use.
df_prods_clean_no_dups = df_prods_clean.drop_duplicates()

(0, 4)
(32434489, 4)


# 05. Task Step 2

In [16]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of df_ords
print(df_ords.describe())

         Unnamed: 0      order_id       user_id  order_number     order_dow  \
count  3.421083e+06  3.421083e+06  3.421083e+06  3.421083e+06  3.421083e+06   
mean   1.710541e+06  1.710542e+06  1.029782e+05  1.715486e+01  2.776219e+00   
std    9.875817e+05  9.875817e+05  5.953372e+04  1.773316e+01  2.046829e+00   
min    0.000000e+00  1.000000e+00  1.000000e+00  1.000000e+00  0.000000e+00   
25%    8.552705e+05  8.552715e+05  5.139400e+04  5.000000e+00  1.000000e+00   
50%    1.710541e+06  1.710542e+06  1.026890e+05  1.100000e+01  3.000000e+00   
75%    2.565812e+06  2.565812e+06  1.543850e+05  2.300000e+01  5.000000e+00   
max    3.421082e+06  3.421083e+06  2.062090e+05  1.000000e+02  6.000000e+00   

       order_hour_of_day  days_since_prior_order  
count       3.421083e+06            3.214874e+06  
mean        1.345202e+01            1.111484e+01  
std         4.226088e+00            9.206737e+00  
min         0.000000e+00            0.000000e+00  
25%         1.000000e+01         

## Descriptive stats for this dataframe look good. There seem to be fewer observations in the days_since_prior_order column than the others, but this is likely fine as obviously not all orders will have a prior order. We will examine this further when looking at missing values. Minimum and maximum values are all reasonable, although the fact that the maximum value for order_number is exactly 100 could indicate that this column is capped, which might also be reasonable as the vast majority of values are well below 100.

## We can see that the file was exported with the index. We don't need this so I will drop it.

In [19]:
# Drop the redundant index column that was exported with the CSV.
# Rebinding df_ords instead of using inplace=True, together with errors='ignore',
# keeps this cell re-runnable: on a second run the column is already gone and the
# drop is a no-op rather than a KeyError.
df_ords = df_ords.drop(columns=['Unnamed: 0'], errors='ignore')
print(df_ords.head())

   order_id  user_id eval_set  order_number  order_dow  order_hour_of_day  \
0   2539329        1    prior             1          2                  8   
1   2398795        1    prior             2          3                  7   
2    473747        1    prior             3          3                 12   
3   2254736        1    prior             4          4                  7   
4    431534        1    prior             5          4                 15   

   days_since_prior_order  
0                     NaN  
1                    15.0  
2                    21.0  
3                    29.0  
4                    28.0  


# 06. Task Step 3

In [60]:
# Identify mixed-type columns in df_ords (loaded from orders_wrangled.csv).
# A column is reported when any of its values has a different Python type than
# the value in that column's first row. Nothing is printed when every column
# is homogeneous.
for col in df_ords.columns.tolist():
  weird = (df_ords[[col]].map(type) != df_ords[[col]].iloc[0].apply(type)).any(axis = 1)
  # weird.any() answers "is there at least one mismatch" without building a filtered copy
  if weird.any():
    print (col)

## Found no mixed-type columns in orders_wrangled.csv

# 07. Task Step 5

In [24]:
# Tally the null values in each column of df_ords
df_ords.isna().sum()

order_id                       0
user_id                        0
eval_set                       0
order_number                   0
order_dow                      0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

## Found 206209 missing values in the days_since_prior_order column. 206209 is also the largest user_id value, which suggests that each of these null values belongs to a user's first order. No other missing values were found.

# 08. Task Step 6

In [27]:
# Add a boolean flag that is True wherever days_since_prior_order is null
df_ords['no_prior_orders'] = df_ords['days_since_prior_order'].isna()
print(df_ords.head())

   order_id  user_id eval_set  order_number  order_dow  order_hour_of_day  \
0   2539329        1    prior             1          2                  8   
1   2398795        1    prior             2          3                  7   
2    473747        1    prior             3          3                 12   
3   2254736        1    prior             4          4                  7   
4    431534        1    prior             5          4                 15   

   days_since_prior_order  no_prior_orders  
0                     NaN             True  
1                    15.0            False  
2                    21.0            False  
3                    29.0            False  
4                    28.0            False  


## Created a flag column to identify rows with null values in days_since_prior_order because these rows should not be excluded from analysis just because the user has no prior orders. This allows us to avoid any potential errors that could result from null values while retaining all important data.

# 09. Task Step 7

In [30]:
# Show the shape of the subset of df_ords rows that exactly repeat an earlier row
print(df_ords.loc[df_ords.duplicated()].shape)

(0, 8)


## No duplicates found.

In [32]:
# Summary statistics for the numeric columns of df_prods_clean_no_dups
print(df_prods_clean_no_dups.describe())

         product_id      aisle_id  department_id        prices
count  49672.000000  49672.000000   49672.000000  49672.000000
mean   24850.349775     67.762442      11.728942      9.993282
std    14340.705287     38.315784       5.850779    453.615536
min        1.000000      1.000000       1.000000      1.000000
25%    12432.750000     35.000000       7.000000      4.100000
50%    24850.500000     69.000000      13.000000      7.100000
75%    37268.250000    100.000000      17.000000     11.100000
max    49688.000000    134.000000      21.000000  99999.000000


In [33]:
# Order the products from highest to lowest price and show the first 20 rows
df_prods_sorted = df_prods_clean_no_dups.sort_values('prices', ascending=False)
print(df_prods_sorted.head(20))

       product_id                      product_name  aisle_id  department_id  \
33666       33664             2 % Reduced Fat  Milk        84             16   
21554       21553  Lowfat 2% Milkfat Cottage Cheese       108             16   
19392       19391         Turkey Breast Tenderloins        49             12   
25580       25579     Naturally Smoked Trout Fillet        15             12   
40490       40486                   Chicken Tenders        49             12   
21468       21467            Wild Caught Raw Shrimp        15             12   
9020         9020  Boneless Skinless Chicken Thighs        35             12   
9896         9896    Uncured Applewood Smoked Bacon       106             12   
14207       14207                  Angus Roast Beef         7             12   
41097       41093          Sugar Free Dry Rub Bacon       106             12   
39050       39046           Smok Cured Turkey Bacon       106             12   
36577       36573                    Pep

In [34]:
# Replace every price above 100 with NaN, then re-check the summary statistics
df_prods_clean_no_dups.loc[df_prods_clean_no_dups['prices'].gt(100), 'prices'] = np.nan
print(df_prods_clean_no_dups.describe())

         product_id      aisle_id  department_id        prices
count  49672.000000  49672.000000   49672.000000  49670.000000
mean   24850.349775     67.762442      11.728942      7.680437
std    14340.705287     38.315784       5.850779      4.199381
min        1.000000      1.000000       1.000000      1.000000
25%    12432.750000     35.000000       7.000000      4.100000
50%    24850.500000     69.000000      13.000000      7.100000
75%    37268.250000    100.000000      17.000000     11.100000
max    49688.000000    134.000000      21.000000     25.000000


In [50]:
# Final (rows, columns) of the two dataframes referenced by the export cell
print(df_prods_clean_no_dups.shape)
print(df_ords.shape)

(49672, 5)
(3421083, 8)


# 10. Task Step 9

In [49]:
# Export data
# NOTE(review): both export calls below are commented out, so running this cell
# writes no files. Uncomment them to write orders_checked.csv and
# products_checked_corrected.csv to the Prepared Data folder.
#df_ords.to_csv(os.path.join(path, '02 Data','Prepared Data', 'orders_checked.csv'), index=False)
#df_prods_clean_no_dups.to_csv(os.path.join(path, '02 Data','Prepared Data', 'products_checked_corrected.csv'), index=False)