## Correcting an Incorrect Value in the Instacart Data

### Contents
#### Importing Libraries
#### Importing Data
#### Fixing Incorrect Value
#### Exporting Data

### Importing Data

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import os

### Importing Data

In [3]:
# Define pathway to relevant folder
path = r'/Users/sydneyjohnson/Documents/CF Data Analytics Course/07-2024 Instacart Basket Analysis'

In [5]:
# Import ords_prods_merged.pkl
df = pd.read_pickle(os.path.join(path, '02 Data','Prepared Data', 'combo_7_25.pkl'))

### Fixing Incorrect Value

In [7]:
# Identify Max value
max_price = df['prices'].max()
print(max_price)

14900.0


In [9]:
# Create subset of max value
filtered_df = df[df['prices'] == 14900]

print(filtered_df)

          order_id  user_id  order_number  orders_day_of_week  \
1576        912404       17            12                   2   
1638        603376       17            22                   6   
16522      3264360      135             2                   2   
16528       892534      135             3                   0   
53672       229704      342             8                   1   
...            ...      ...           ...                 ...   
32320511   3172853   205650            18                   1   
32347717   2504315   205818             3                   5   
32347727   1108388   205818             5                   4   
32380527   1916142   206049             1                   2   
32380551    379732   206049             4                   1   

          order_hour_placed  days_since_prior_order  product_id  \
1576                     14                     5.0       21553   
1638                     16                     4.0       21553   
16522             

In [11]:
filtered_df.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_placed,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,max_order,loyalty_flag,avg_price,spending_flag,med_days_prior,frequency_flag
1576,912404,17,12,2,14,5.0,21553,5,0,Lowfat 2% Milkfat Cottage Cheese,108,16,14900.0,both,40,Regular customer,108.648299,High Spender,5.0,Frequent customer
1638,603376,17,22,6,16,4.0,21553,3,1,Lowfat 2% Milkfat Cottage Cheese,108,16,14900.0,both,40,Regular customer,108.648299,High Spender,5.0,Frequent customer
16522,3264360,135,2,2,21,13.0,21553,6,0,Lowfat 2% Milkfat Cottage Cheese,108,16,14900.0,both,4,New customer,1154.792308,High Spender,12.0,Regular customer
16528,892534,135,3,0,8,12.0,21553,3,1,Lowfat 2% Milkfat Cottage Cheese,108,16,14900.0,both,4,New customer,1154.792308,High Spender,12.0,Regular customer
53672,229704,342,8,1,19,30.0,21553,9,0,Lowfat 2% Milkfat Cottage Cheese,108,16,14900.0,both,16,Regular customer,114.426619,High Spender,23.0,Non-frequent customer


In [13]:
filtered_df.tail()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_placed,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,max_order,loyalty_flag,avg_price,spending_flag,med_days_prior,frequency_flag
32320511,3172853,205650,18,1,9,7.0,21553,17,1,Lowfat 2% Milkfat Cottage Cheese,108,16,14900.0,both,25,Regular customer,351.141618,High Spender,6.0,Frequent customer
32347717,2504315,205818,3,5,15,3.0,21553,13,0,Lowfat 2% Milkfat Cottage Cheese,108,16,14900.0,both,25,Regular customer,170.07377,High Spender,13.0,Regular customer
32347727,1108388,205818,5,4,5,1.0,21553,5,1,Lowfat 2% Milkfat Cottage Cheese,108,16,14900.0,both,25,Regular customer,170.07377,High Spender,13.0,Regular customer
32380527,1916142,206049,1,2,17,,21553,2,0,Lowfat 2% Milkfat Cottage Cheese,108,16,14900.0,both,5,New customer,938.03125,High Spender,7.0,Frequent customer
32380551,379732,206049,4,1,14,5.0,21553,4,1,Lowfat 2% Milkfat Cottage Cheese,108,16,14900.0,both,5,New customer,938.03125,High Spender,7.0,Frequent customer


In [15]:
# Through research, I found that a gallon of Lowfat 2% Milkfat Cottage Cheese usually costs $4. I will use that value instead of the mean
# Replace 14900 with 4 in the 'prices' column
df['prices'].replace(14900.0, 4, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['prices'].replace(14900.0, 4, inplace=True)


In [19]:
df['prices'] = df['prices'].replace(14900, 4)

In [21]:
filtered_df.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_placed,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,max_order,loyalty_flag,avg_price,spending_flag,med_days_prior,frequency_flag
1576,912404,17,12,2,14,5.0,21553,5,0,Lowfat 2% Milkfat Cottage Cheese,108,16,14900.0,both,40,Regular customer,108.648299,High Spender,5.0,Frequent customer
1638,603376,17,22,6,16,4.0,21553,3,1,Lowfat 2% Milkfat Cottage Cheese,108,16,14900.0,both,40,Regular customer,108.648299,High Spender,5.0,Frequent customer
16522,3264360,135,2,2,21,13.0,21553,6,0,Lowfat 2% Milkfat Cottage Cheese,108,16,14900.0,both,4,New customer,1154.792308,High Spender,12.0,Regular customer
16528,892534,135,3,0,8,12.0,21553,3,1,Lowfat 2% Milkfat Cottage Cheese,108,16,14900.0,both,4,New customer,1154.792308,High Spender,12.0,Regular customer
53672,229704,342,8,1,19,30.0,21553,9,0,Lowfat 2% Milkfat Cottage Cheese,108,16,14900.0,both,16,Regular customer,114.426619,High Spender,23.0,Non-frequent customer


In [23]:
filtered_df = df[df['prices'] == 14900]
print(filtered_df)

Empty DataFrame
Columns: [order_id, user_id, order_number, orders_day_of_week, order_hour_placed, days_since_prior_order, product_id, add_to_cart_order, reordered, product_name, aisle_id, department_id, prices, _merge, max_order, loyalty_flag, avg_price, spending_flag, med_days_prior, frequency_flag]
Index: []


In [25]:
filtered_df.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_placed,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,max_order,loyalty_flag,avg_price,spending_flag,med_days_prior,frequency_flag


In [27]:

mean_price = df['prices'].mean()
print(mean_price)

7.790372752431982


In [29]:
# Identify Max value
max_price = df['prices'].max()
print(max_price)

25.0


### Exporting Data

In [31]:
df.to_pickle(os.path.join(path, '02 Data','Prepared Data', 'combo_7_25_v2.pkl'))