# Importing libraries

In [1]:
import pandas as pd
import numpy as np
import os

# Importing data

In [2]:
# Project root folder; every data import/export below is resolved against it.
# The location can be overridden per machine through the INSTACART_PROJECT_DIR
# environment variable; when it is not set, the original hard-coded folder is used.
path = os.environ.get('INSTACART_PROJECT_DIR', r'C:\Users\magia\06-2025 Instacart Basket Analysis')

In [3]:
# Load the raw products table from "02 Data/Original Data" under the project root.
# index_col=False keeps pandas from using the first CSV column as the row index.
df_prods = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'products.csv'),
    index_col=False,
)

In [4]:
# Load the previously wrangled orders table from "02 Data/Prepared Data" under the project root.
# index_col=False keeps pandas from using the first CSV column as the row index.
df_ords = pd.read_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv'),
    index_col=False,
)

# Finding and handling mixed data types

In [5]:
# Start from an empty frame (no rows, no columns) to build a small mixed-type example.
df_test = pd.DataFrame(data=None)

In [6]:
# Add one column that deliberately holds strings, an integer and a boolean.
mixed_values = ['a', 'b', 1, True]
df_test['mix'] = mixed_values

In [7]:
# Display the test frame to inspect the mixed-type column.
df_test

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [8]:
# Print the name of every column that contains at least one value whose Python
# type differs from the type of the column's first value.
for column_name in df_test.columns:
    first_value_type = type(df_test[column_name].iloc[0])
    differs_from_first = df_test[column_name].map(type) != first_value_type
    if differs_from_first.any():
        print(column_name)

mix


In [9]:
# Cast every value in the column to a string so the column holds a single type.
df_test['mix'] = df_test['mix'].astype(str)

In [10]:
# Show the column dtype after the cast (the result below is the object dtype, 'O').
df_test['mix'].dtype

dtype('O')

# Finding Missing Values

In [11]:
# Count the missing values in each column of the products table.
df_prods.isna().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [12]:
# The element-wise True/False null mask is too large to inspect by eye on a
# table of this size, so it does not help in locating the missing values.
df_prods.isna()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False
...,...,...,...,...,...
49688,False,False,False,False,False
49689,False,False,False,False,False
49690,False,False,False,False,False
49691,False,False,False,False,False


In [13]:
# Keep only the product rows whose product_name is missing.
df_nan = df_prods.loc[df_prods['product_name'].isna()]

In [14]:
# Display the product rows that have no product_name.
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [15]:
# (rows, columns) of the full products table, as a baseline before any rows are removed.
df_prods.shape

(49693, 5)

In [16]:
# Build a subset that keeps only the rows with a product_name present.
df_prods_clean = df_prods.dropna(subset=['product_name'])

In [17]:
# Shape of the subset without missing product names, for comparison with df_prods.shape.
df_prods_clean.shape

(49677, 5)

# Finding and handling duplicates

In [18]:
# Collect rows that are exact repeats of an earlier row (the first occurrence is not flagged).
df_dups = df_prods_clean.loc[df_prods_clean.duplicated(keep='first')]

In [19]:
# Display the duplicated product rows.
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [20]:
# (rows, columns) before duplicates are removed, as a baseline for comparison.
df_prods_clean.shape

(49677, 5)

In [21]:
# Keep the first occurrence of every row and discard its exact repeats.
df_prods_clean_no_dups = df_prods_clean.loc[~df_prods_clean.duplicated()]

In [22]:
# Shape after de-duplication, for comparison with df_prods_clean.shape.
df_prods_clean_no_dups.shape

(49672, 5)

In [23]:
# Write the cleaned, de-duplicated products table to "02 Data/Prepared Data".
# NOTE(review): the row index is written as an extra unnamed first column;
# consider index=False once downstream readers are confirmed not to rely on it.
df_prods_clean_no_dups.to_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'products_checked.csv')
)

# Exercise Task on orders data

In [24]:
# Summary statistics of the orders table, used to spot out-of-range values.
df_ords.describe()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_between_orders
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710541.0,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,855270.5,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710541.0,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421082.0,3421083.0,206209.0,100.0,6.0,23.0,30.0


The data looks to be in good shape: there are no negative values and no out-of-range entries (e.g. no day of week outside 0–6 and no hour of day outside 0–23).

The days_between_orders column is particularly interesting:

The mean is ~11 days, but the median is only 7 days (50% of values are ≤ 7), indicating a right-skewed distribution.

The standard deviation is ~9 days, so the spread is wide: one standard deviation around the mean covers roughly 2 to 20 days, while the middle 50% of values fall between 4 and 15 days.


In [25]:
# For every orders column, compare the Python type of each value with the type
# of the first row's value; print the column name when any value differs,
# otherwise print a "no mixed data type" line for that column.
for column_name in df_ords.columns:
    single_column = df_ords[[column_name]]
    first_row_types = single_column.iloc[0].apply(type)
    has_other_type = (single_column.map(type) != first_row_types).any(axis=1)
    print(column_name if has_other_type.any() else 'no mixed data type')

no mixed data type
no mixed data type
no mixed data type
no mixed data type
no mixed data type
no mixed data type
no mixed data type


# For each orders column that holds values of more than one Python type,
# print the column name and a preview of the values that differ from the
# type of the column's first value.
for col, values in df_ords.items():
    differs_from_first = values.map(type) != type(values.iloc[0])
    if not differs_from_first.any():
        continue
    print(f"{col} — mismatched rows:")
    print(values[differs_from_first].head())
    print()

Thoughts/Next steps:
`order_id` is only an identifier and should not be relevant for the analysis, so its data type does not need to be fixed.
The `Unnamed: 0` column is an old index left over from an earlier export, so it can be dropped.
The remaining columns will be explicitly cast to a consistent data type.

In [26]:
# Drop the leftover index column that came in with the CSV.
# Reassigning (instead of inplace=True) together with errors='ignore' makes the
# cell safe to re-run: a second run no longer raises KeyError because the
# column is already gone.
df_ords = df_ords.drop(columns=['Unnamed: 0'], errors='ignore')

In [27]:
# Store user_id explicitly as 64-bit integers.
df_ords['user_id'] = df_ords['user_id'].astype(np.int64)

In [28]:
# Confirm the dtype of user_id after the cast.
df_ords['user_id'].dtype

dtype('int64')

In [29]:
# Store order_number explicitly as 64-bit integers.
df_ords['order_number'] = df_ords['order_number'].astype(np.int64)

In [30]:
# Confirm the dtype of order_number after the cast.
df_ords['order_number'].dtype

dtype('int64')

In [31]:
# Store orders_day_of_week explicitly as 64-bit integers.
df_ords['orders_day_of_week'] = df_ords['orders_day_of_week'].astype(np.int64)

In [32]:
# Confirm the dtype of orders_day_of_week after the cast.
df_ords['orders_day_of_week'].dtype

dtype('int64')

In [33]:
# Store days_between_orders explicitly as 64-bit floats.
df_ords['days_between_orders'] = df_ords['days_between_orders'].astype(np.float64)

In [34]:
# Confirm the dtype of days_between_orders after the cast.
df_ords['days_between_orders'].dtype

dtype('float64')

In [35]:
# Re-check every orders column after the casts: a column counts as mixed when
# its values span more than one Python type.
for col in df_ords.columns:
    distinct_types = set(df_ords[col].map(type))
    suffix = ' have mix datatype' if len(distinct_types) > 1 else ' don\'t have mix datatype'
    print(col + suffix)

order_id don't have mix datatype
user_id don't have mix datatype
order_number don't have mix datatype
orders_day_of_week don't have mix datatype
order_hour_of_day don't have mix datatype
days_between_orders don't have mix datatype


# Handling Missing Data

In [36]:
# Collect the orders that have no days_between_orders value and preview the first few.
gap_is_missing = df_ords['days_between_orders'].isna()
df_nan = df_ords.loc[gap_is_missing]
df_nan.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_between_orders
0,2539329,1,1,2,8,
11,2168274,2,1,2,11,
26,1374495,3,1,1,14,
39,3343014,4,1,6,11,
45,2717275,5,1,3,12,


In [38]:
# Add a boolean "new_customer" column: True where days_between_orders is NaN,
# False otherwise.
# .assign() returns a new frame, so df_ords itself stays unchanged. The earlier
# plain assignment (df_ords_clean = df_ords) only created a second name for the
# same object, which silently added the column to df_ords as well.
df_ords_clean = df_ords.assign(new_customer=df_ords['days_between_orders'].isna())
df_ords_clean

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_between_orders,new_customer
0,2539329,1,1,2,8,,True
1,2398795,1,2,3,7,15.0,False
2,473747,1,3,3,12,21.0,False
3,2254736,1,4,4,7,29.0,False
4,431534,1,5,4,15,28.0,False
...,...,...,...,...,...,...,...
3421078,2266710,206209,10,5,18,29.0,False
3421079,1854736,206209,11,4,10,30.0,False
3421080,626363,206209,12,1,12,18.0,False
3421081,2977660,206209,13,1,12,7.0,False


# Checking and handling duplicates

In [39]:
# Rows that are exact repeats of an earlier row (the first occurrence is not flagged).
df_dups = df_ords[df_ords.duplicated()]
# Count the flagged rows themselves. Calling .duplicated() a second time on
# df_dups would only count repeats *within* the flagged rows, so a row that is
# duplicated exactly once would be reported as 0 duplicates.
len(df_dups)

0

Looks like we have no duplicates in our orders file.

In [40]:
# Write the checked orders table to "02 Data/Prepared Data".
# NOTE(review): the row index is written as an extra unnamed first column;
# consider index=False once downstream readers are confirmed not to rely on it.
df_ords_clean.to_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_clean.csv')
)