# 2. Data Wrangling

#### Import libraries and dataframe
#### Dropping Columns
#### Renaming Columns
#### Transposing Dataframes
#### Creating Data Dictionaries
#### Creating Subset Data
#### Performing Value Counts

Importing Libraries and Dataframes

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Root folder of the project; every read and write below is built relative to it.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
path = r'C:\Users\Neena Tilton\Dropbox\Projects\01_2020_InstacartBasket'

In [3]:
# Load the raw orders data from the project's OriginalData folder.
df_orders = pd.read_csv(
    os.path.join(path, '02_Data', 'OriginalData', 'orders.csv'),
    index_col=False,
)

In [4]:
# Load the raw products data from the project's OriginalData folder.
df_products = pd.read_csv(
    os.path.join(path, '02_Data', 'OriginalData', 'products.csv'),
    index_col=False,
)

Dropping unnecessary columns from dataframe:

In [6]:
# Remove the 'eval_set' column and keep the result under the same name.
df_orders = df_orders.drop(['eval_set'], axis=1)

In [7]:
# Preview the first rows to confirm that 'eval_set' is gone.
df_orders.head()

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


Renaming column 'order_dow' to something more descriptive: 

In [9]:
# Give 'order_dow' a self-explanatory name.
df_orders = df_orders.rename(columns={'order_dow': 'orders_day_of_week'})

Transposing the 'departments' dataframe:

In [12]:
# Load the raw 'departments.csv' data from the project's OriginalData folder.
df_dptmt = pd.read_csv(
    os.path.join(path, '02_Data', 'OriginalData', 'departments.csv'),
    index_col=False,
)

In [13]:
# Preview the departments data as loaded: it is in wide format (one column per
# department_id), which is why it gets transposed to long format below.
df_dptmt.head()

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


In [14]:
# Preview the transposed frame; the result is not assigned, so df_dptmt is unchanged.
df_dptmt.T

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [15]:
# Store the transposed dataframe under a new name; df_dptmt itself is not overwritten.
df_dptmt_t = df_dptmt.T

In [16]:
# Inspect the stored transpose.
df_dptmt_t

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [17]:
# Preview what reset_index() produces. The result is not assigned, so
# df_dptmt_t keeps its current index for the steps that follow.
df_dptmt_t.reset_index()

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


Removing the first row and replacing with the proper header:

In [18]:
# Pull out the first row of df_dptmt_t; it holds the text that should become the header.
new_header = df_dptmt_t.iloc[0, :]

In [19]:
# Inspect the extracted header row.
print(new_header)

0    department
Name: department_id, dtype: object


In [20]:
# Keep every row after the first one (the first row is the header text, not data).
df_dptmt_t_new = df_dptmt_t.iloc[1:]

In [21]:
# Check that the header row is no longer part of the data.
df_dptmt_t_new

Unnamed: 0,0
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


In [22]:
# Use the extracted first row as the column labels of the trimmed frame.
df_dptmt_t_new.columns = new_header

In [23]:
# Verify that the column now carries the proper header.
df_dptmt_t_new

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


Creating a data dictionary for department ID and department names: 

In [24]:
# Build a lookup keyed by the frame's index labels; each value maps column name -> cell value.
data_dict = df_dptmt_t_new.to_dict(orient='index')

In [25]:
# Display the resulting dictionary.
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [26]:
# Preview the products data, including its department_id column.
df_products.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [27]:
# Look up department 19 in the data dictionary.
# The dictionary keys are strings, hence '19' rather than 19.
print(data_dict.get('19'))

{'department': 'snacks'}


Creating a subset showing products only belonging to department ID 19:

In [28]:
# Keep only the product rows whose department_id is 19.
df_snacks = df_products.loc[df_products['department_id'] == 19]

In [29]:
# Display the department 19 subset.
df_snacks

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5
...,...,...,...,...,...
49666,49662,Bacon Cheddar Pretzel Pieces,107,19,3.6
49669,49665,Super Dark Coconut Ash & Banana Chocolate Bar,45,19,6.9
49670,49666,Ginger Snaps Snacking Cookies,61,19,5.2
49675,49671,Milk Chocolate Drops,45,19,3.0


Performing a value count of total orders made for each hour of day:  

In [36]:
# The hour column is loaded as 'order_hour_of_day'; give it the name used in this analysis.
# rename() ignores labels that are absent, so re-running this cell is harmless.
df_orders = df_orders.rename(columns = {'order_hour_of_day' : 'order_time_of_day'})

# Use value_counts() to see how many orders were placed in each hour of the day.
df_orders['order_time_of_day'].value_counts(dropna = False)

10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: order_time_of_day, dtype: int64

The busiest hour of the day for orders is 10am. In general, the busiest time frame for orders is 10am-3pm. 

Creating a subset of only breakfast items by showing only products belonging to department ID 14: 

In [40]:
# Keep only the product rows whose department_id is 14 (the 'breakfast' department).
df_products_breakfast = df_products.loc[df_products['department_id'] == 14]

In [41]:
# Preview the breakfast subset.
df_products_breakfast.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6


Creating a subset of items related to dinner parties by showing products from alcohol, deli, beverages, and meat/seafood departments:

In [42]:
# Keep only the product rows that belong to the relevant departments.
df_dinnerparty = df_products.loc[df_products['department_id'].isin([5, 20, 7, 12])]

In [43]:
# Run value_counts() to confirm that only the requested department_id values
# are present in the new dataframe.
df_dinnerparty['department_id'].value_counts(dropna = False)

7     4365
20    1322
5     1056
12     907
Name: department_id, dtype: int64

In [44]:
# Confirm that the department_id values kept above map to the department
# names needed for this subset dataframe.
for dept_key in ('7', '20', '5', '12'):
    print(data_dict.get(dept_key))

{'department': 'beverages'}
{'department': 'deli'}
{'department': 'alcohol'}
{'department': 'meat seafood'}


In [56]:
# Confirm that all other columns are still present for the observations in this new subset dataframe.
df_dinnerparty.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
10,11,Peach Mango Juice,31,7,2.8
16,17,Rendered Duck Fat,35,12,17.1


Exporting df_orders dataframe as "orders_wrangled.csv" in PreparedData folder.

In [73]:
# Write the wrangled orders data to the PreparedData folder.
# NOTE(review): no `index` argument is passed, so pandas' default applies —
# confirm that downstream readers expect the row index in the file.
df_orders.to_csv(
    os.path.join(path, '02_Data', 'PreparedData', 'orders_wrangled.csv'),
)

Exporting df_dptmt_t_new dataframe as "departments_wrangled.csv" in PreparedData folder. 

In [74]:
# Write the transposed departments data to the PreparedData folder.
df_dptmt_t_new.to_csv(
    os.path.join(path, '02_Data', 'PreparedData', 'departments_wrangled.csv'),
)