# 4.4 Data Wrangling & Subsetting

## This script contains the following points:
#### 1. Import Resources
#### 2. Data Wrangling
#### 2.1. For orders.csv
#### 2.2. For departments.csv
#### 2.3. Further data wrangling for orders.csv

### 1. Importing Resources

In [164]:
# Import libs
import pandas as pd
import numpy as np
import os

In [165]:
# Store the main project's folder path as a string
path = r'C:\Users\zhoux\Documents\CF-Data Analyst Program\DA-Immersion\10-2023 Instacart Basket Analysis'

In [166]:
# Load the raw "orders.csv" and "products.csv" data sets from the Original Data folder.
# index_col = False tells pandas not to use the first CSV column as the row index.
df_ords, df_prods = (
    pd.read_csv(os.path.join(path, '02 Data', 'Original Data', csv_name), index_col = False)
    for csv_name in ('orders.csv', 'products.csv')
)

### 2. Data wrangling

#### 2.1 For orders.csv

##### Drop columns

In [167]:
# Drop the eval_set column from the orders data.
# NOTE(review): the result is stored in df_ords_2; df_ords itself is left unchanged
# and still contains eval_set.
df_ords_2 = df_ords.drop(columns = ['eval_set'])

In [168]:
# Spot NaN values: dropna = False keeps NaN as its own entry in the frequency table.
df_ords['days_since_prior_order'].value_counts(dropna = False)

days_since_prior_order
30.0    369323
7.0     320608
6.0     240013
4.0     221696
3.0     217005
5.0     214503
NaN     206209
2.0     193206
8.0     181717
1.0     145247
9.0     118188
14.0    100230
10.0     95186
13.0     83214
11.0     80970
12.0     76146
0.0      67755
15.0     66579
16.0     46941
21.0     45470
17.0     39245
20.0     38527
18.0     35881
19.0     34384
22.0     32012
28.0     26777
23.0     23885
27.0     22013
24.0     20712
25.0     19234
29.0     19191
26.0     19016
Name: count, dtype: int64

##### Rename columns

In [169]:
# Give order_dow a more descriptive name; inplace = True modifies df_ords directly.
df_ords.rename(columns = {'order_dow' : 'orders_day_of_week'}, inplace = True)

In [170]:
df_ords.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


##### Change Data Types

In [171]:
# Cast order_id from a numeric type to string.
df_ords['order_id'] = df_ords['order_id'].astype('str')

In [172]:
df_ords['order_id'].dtype

dtype('O')

#### 2.2 For departments.csv

##### Transposing Data

In [173]:
# Import departments.csv.
# The raw file is laid out wide: a single row, with the department ids as column headers.
df_dep = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'departments.csv'), index_col = False)

In [174]:
df_dep.head()

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


In [175]:
# Transpose the wide table so that each department becomes a row instead of a column.
df_dep_t = df_dep.transpose()
df_dep_t

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [176]:
# Preview only: the result is not assigned, so df_dep_t keeps its current index.
df_dep_t.reset_index()

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


In [177]:
# Take the first row as the new header
new_header = df_dep_t.iloc[0]
new_header

0    department
Name: department_id, dtype: object

In [178]:
# Keep every row after the first one; the first row only holds the header text.
df_dep_t_new = df_dep_t.iloc[1:]
df_dep_t_new

Unnamed: 0,0
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


In [179]:
# Set the column names to new_header
df_dep_t_new.columns = new_header
df_dep_t_new

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


##### Create a Data Dictionary and use it to work with products.csv (in exercise)

In [180]:
# Build a lookup keyed by department id; each value is a {'department': name} dict.
# The keys are strings (e.g. '19'), so look them up with quoted ids.
data_dict = df_dep_t_new.to_dict(orient = 'index')
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [181]:
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [182]:
print(data_dict.get('19'))

{'department': 'snacks'}


##### Subsetting

In [183]:
# Keep only the products whose department_id is 19 (the snacks department).
df_snacks = df_prods[df_prods['department_id'].eq(19)]
df_snacks.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5


In [184]:
df_prods['department_id'] == 19

0         True
1        False
2        False
3        False
4        False
         ...  
49688    False
49689    False
49690    False
49691    False
49692    False
Name: department_id, Length: 49693, dtype: bool

In [185]:
df_prods[df_prods['department_id'] == 19]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5
...,...,...,...,...,...
49666,49662,Bacon Cheddar Pretzel Pieces,107,19,3.6
49669,49665,Super Dark Coconut Ash & Banana Chocolate Bar,45,19,6.9
49670,49666,Ginger Snaps Snacking Cookies,61,19,5.2
49675,49671,Milk Chocolate Drops,45,19,3.0


In [186]:
# Same filter as df_snacks, this time selected through .loc.
df_snack_2 = df_prods.loc[df_prods['department_id'] == 19]
# isin() matches several department ids at once: 17 (household), 18 (babies), 19 (snacks).
df_snack_3 = df_prods.loc[df_prods['department_id'].isin([17, 18, 19])]
df_snack_3.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
13,14,Fresh Scent Dishwasher Cleaner,74,17,6.5
14,15,Overnight Diapers Size 6,56,18,11.2
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9


#### 2.3 Further data wrangling for orders.csv

##### Find a variable that isn't suited to being numeric

In [187]:
df_ords.head() # Look at the existing columns to find a variable that isn't suited to being numeric.

Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [188]:
# user_id is the variable picked here as not suited to being numeric; cast it to string.
df_ords['user_id'] = df_ords['user_id'].astype('str')

##### Find a variable in df_ords with an unintuitive name

In [189]:
# Rename user_id to customer_id in a new dataframe; inplace = False leaves df_ords unchanged.
df_ords_user2customer = df_ords.rename(columns = {'user_id' : 'customer_id'}, inplace = False)
df_ords_user2customer

Unnamed: 0,order_id,customer_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0
...,...,...,...,...,...,...,...
3421078,2266710,206209,prior,10,5,18,29.0
3421079,1854736,206209,prior,11,4,10,30.0
3421080,626363,206209,prior,12,1,12,18.0
3421081,2977660,206209,prior,13,1,12,7.0


##### Find the busiest hour for placing orders

In [190]:
# counts the frequency of each hour of the day.
df_ords['order_hour_of_day'].value_counts(dropna = False)
# 10am is the busiest hour for placing orders.

order_hour_of_day
10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: count, dtype: int64

In [199]:
# Counts the frequency of each day of the week.
df_ords['orders_day_of_week'].value_counts(dropna = False)
# Day 0 is the busiest day for placing orders (read as Monday in the original note).
# NOTE(review): confirm which weekday the value 0 stands for in the data dictionary.

orders_day_of_week
0    600905
1    587478
2    467260
5    453368
6    448761
3    436972
4    426339
Name: count, dtype: int64

##### Determine the meaning behind a value of 4 in the "department_id" column within the df_prods dataframe using a data dictionary

In [191]:
# A value 4 in the "department_id" means the produce department.
data_dict.get('4')

{'department': 'produce'}

##### The sales team wants to know more about breakfast item sales. Create a subset containing only relevant information

In [192]:
# The breakfast department's corresponding value is 14 in the "department_id".
df_breakfast = df_prods.loc[df_prods['department_id'] == 14] # Creates a subset just for the breakfast dep.
df_breakfast.head() # gives a snapshot of the new subset.

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6


##### The sales team would also like to see details about products that customers might use to throw dinner parties. Create a subset containing all items from these departments: alcohol, deli, beverages, and meat/seafood

In [193]:
# Department ids used below: 5 = alcohol, 7 = beverages, 12 = meat seafood, 20 = deli.
df_dinner_parties = df_prods.loc[df_prods['department_id'].isin([5, 7, 12, 20])] #includes alcohol, deli, beverages, and meat / seafood.
df_dinner_parties.head() # gives a snapshot of the new subset.

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
10,11,Peach Mango Juice,31,7,2.8
16,17,Rendered Duck Fat,35,12,17.1


##### Check the total count in df_dinner_parties

In [194]:
# Counts the rows of the last dataframe I created.
df_dinner_parties.shape # It has 7650 rows.

(7650, 5)

##### Something is off with the customer with a "user_id" of “1.” Extract all the information about this user

In [195]:
# Extracts all the information about the customer with a "user_id" of "1".
# order_number is first changed from numeric to string so that it is excluded
# from the stats in Task 10.
# NOTE: this cast changes df_ords itself, not just the subset created below.
df_ords['order_number'] = df_ords['order_number'].astype('str') 
# user_id holds strings at this point, so compare against '1' rather than 1.
df_uid_1= df_ords[df_ords['user_id'] == '1']
df_uid_1

Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0
5,3367565,1,prior,6,2,7,19.0
6,550135,1,prior,7,1,9,20.0
7,3108588,1,prior,8,1,14,14.0
8,2295261,1,prior,9,1,16,0.0
9,2550362,1,prior,10,4,8,30.0


##### Provide basic stats about this user’s behavior

In [196]:
# calculates the basic stats of the user with user_id == 1
df_uid_1.describe()

Unnamed: 0,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,11.0,11.0,10.0
mean,2.636364,10.090909,19.0
std,1.286291,3.477198,9.030811
min,1.0,7.0,0.0
25%,1.5,7.5,14.25
50%,3.0,8.0,19.5
75%,4.0,13.0,26.25
max,4.0,16.0,30.0


##### Export df_ords as .csv

In [200]:
# Write the wrangled orders to the Prepared Data folder.
# to_csv is called without index = False, so the row index is written as the first column.
df_ords.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv'))
df_ords.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


##### Export df_dep_t_new as .csv

In [198]:
# Write the transposed departments table to the Prepared Data folder.
# The row index holds the department ids, so it is exported together with the department column.
df_dep_t_new.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'departments_wrangled.csv'))