### Python Fundamentals for Data Analysts

# 4.4: Data Wrangling & Subsetting

### Content

1. Data wrangling
- Change Data Type
- Rename Column
- Transpose Dataframe

2. Extract Info
- data_dict.get('4')
- df_order['orders_day_of_week'].value_counts(dropna=False)

3. Subsetting data
- df_breakfast =  df_product[df_product['department_id']==14]
- df_party = df_product.loc[df_product['department_id'].isin([5,7,12,20])]
- df_user1 = df_order[df_order['user_id']== '1']

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Project root folder.
# Set the INSTACART_PATH environment variable to run this notebook against a
# copy of the project stored elsewhere; when it is not set, the original local
# folder below is used, so existing behaviour is unchanged.
path = os.environ.get('INSTACART_PATH', r'C:\Users\tsoew\OneDrive\Desktop\InstaCart Basket Analysis')

In [3]:
# Read the products table from the project's original-data folder
products_csv = os.path.join(path, 'Data', 'Original Data', 'products.csv')
df_product = pd.read_csv(products_csv, index_col=False)

In [4]:
# Read the departments table from the project's original-data folder
departments_csv = os.path.join(path, 'Data', 'Original Data', 'departments.csv')
df_dept = pd.read_csv(departments_csv, index_col=False)

In [5]:
# Read orders.csv, keeping only the columns listed below (eval_set is left out)
vars_list = [
    'order_id',
    'user_id',
    'order_number',
    'order_dow',
    'order_hour_of_day',
    'days_since_prior_order',
]
orders_csv = os.path.join(path, 'Data', 'Original Data', 'orders.csv')
df_order = pd.read_csv(orders_csv, usecols=vars_list)

### Perform Data Wrangling 

In [6]:
# Step 1: Change Data Type
# Convert both identifier columns to strings, one column at a time
for id_column in ('order_id', 'user_id'):
    df_order[id_column] = df_order[id_column].astype('str')

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) of the orders dataframe.
# order_id and user_id were cast to str above, so they do not appear in the summary.
df_order.describe()

Unnamed: 0,order_number,order_dow,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3214874.0
mean,17.15486,2.776219,13.45202,11.11484
std,17.73316,2.046829,4.226088,9.206737
min,1.0,0.0,0.0,0.0
25%,5.0,1.0,10.0,4.0
50%,11.0,3.0,13.0,7.0
75%,23.0,5.0,16.0,15.0
max,100.0,6.0,23.0,30.0


In [8]:
# Dataframe dimensions as a (rows, columns) tuple
df_order.shape

(3421083, 6)

**Note**: This means the dataframe has 3,421,083 rows and 6 columns.

In [9]:
# Step 2: Rename a column
# rename() returns a new dataframe; the result is bound back to df_order instead
# of using inplace=True, so the frame is not mutated behind the earlier outputs
# and the call stays chainable. Re-running this cell is harmless: once
# 'order_dow' is gone, the mapping simply matches nothing.
df_order = df_order.rename(columns={'order_dow': 'orders_day_of_week'})
df_order.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [10]:
# Step 3: Transpose the department dataframe (rows become columns and vice versa)
df_dept_t = df_dept.transpose()
df_dept_t

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [11]:
# Goal: replace the placeholder '0' column label of the transposed dataframe
# with a meaningful header, using the steps below.

# The first row of the transposed dataframe holds the label to use as the header
new_header = df_dept_t.iloc[0, :]
new_header

0    department
Name: department_id, dtype: object

In [12]:
# Keep every row after the first one (the first row held the header label)
df_dept_new = df_dept_t.iloc[1:]

# Use the saved first row as the column labels of the new dataframe
df_dept_new.columns = new_header

# Convert the dataframe to a dictionary keyed by its index labels:
# each index label maps to a {column name: value} dictionary for that row.
data_dict = df_dept_new.to_dict(orient='index')


In [13]:
# data_dict

### How to extract info from a dataframe

In [14]:
# Perform value counts to know the frequency of each unique data in the column
# df_order['order_hour_of_day'].value_counts(dropna=False)

In [15]:
# Extract information from the data dictionary.
# The key is the string '4', not the integer 4; .get() returns None for a
# missing key instead of raising KeyError.
data_dict.get('4')

{'department': 'produce'}

In [16]:
# Get frequency for this column
# df_order['days_since_prior_order'].value_counts(dropna=False)

In [17]:
# Count how many orders fall on each day of the week.
# dropna=False makes missing values show up as their own category.
df_order['orders_day_of_week'].value_counts(dropna=False)

orders_day_of_week
0    600905
1    587478
2    467260
5    453368
6    448761
3    436972
4    426339
Name: count, dtype: int64

In [18]:
# Get frequency from this column
# df_order['user_id'].value_counts(dropna=False)

In [19]:
# Preview the first rows of the products dataframe
df_product.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


### Subsetting data

In [20]:
# Subset the Breakfast department (department_id 14) with a boolean mask
is_breakfast = df_product['department_id'].eq(14)
df_breakfast = df_product[is_breakfast]
df_breakfast

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6
...,...,...,...,...,...
49330,49326,Cereal Variety Fun Pack,121,14,9.1
49395,49391,Light and Fluffy Buttermilk Pancake Mix,130,14,2.0
49547,49543,Chocolate Cheerios Cereal,121,14,10.8
49637,49633,Shake 'N Pour Buttermilk Pancake Mix,130,14,14.2


In [21]:
# Subset the Snacks department (department_id 19) using the isin method
snack_rows = df_product['department_id'].isin([19])
df_snacks = df_product.loc[snack_rows]
df_snacks

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5
...,...,...,...,...,...
49666,49662,Bacon Cheddar Pretzel Pieces,107,19,3.6
49669,49665,Super Dark Coconut Ash & Banana Chocolate Bar,45,19,6.9
49670,49666,Ginger Snaps Snacking Cookies,61,19,5.2
49675,49671,Milk Chocolate Drops,45,19,3.0


In [22]:
# Party-related items are taken from departments 5, 7, 12 and 20
party_department_ids = [5, 7, 12, 20]
party_rows = df_product['department_id'].isin(party_department_ids)
df_party = df_product.loc[party_rows]
df_party

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
10,11,Peach Mango Juice,31,7,2.8
16,17,Rendered Duck Fat,35,12,17.1
...,...,...,...,...,...
49676,49672,Cafe Mocha K-Cup Packs,26,7,6.5
49679,49675,Cinnamon Dolce Keurig Brewed K Cups,26,7,14.0
49680,49676,Ultra Red Energy Drink,64,7,14.5
49686,49682,California Limeade,98,7,4.3


In [23]:
# All orders placed by the customer whose user_id is '1'
# (user_id was converted to str earlier, hence the quoted value)
from_user1 = df_order['user_id'] == '1'
df_user1 = df_order[from_user1]
df_user1

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
5,3367565,1,6,2,7,19.0
6,550135,1,7,1,9,20.0
7,3108588,1,8,1,14,14.0
8,2295261,1,9,1,16,0.0
9,2550362,1,10,4,8,30.0


In [24]:
# Summary statistics of the order columns for the customer with user_id '1'
df_user1.describe()

Unnamed: 0,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,11.0,11.0,11.0,10.0
mean,6.0,2.636364,10.090909,19.0
std,3.316625,1.286291,3.477198,9.030811
min,1.0,1.0,7.0,0.0
25%,3.5,1.5,7.5,14.25
50%,6.0,3.0,8.0,19.5
75%,8.5,4.0,13.0,26.25
max,11.0,4.0,16.0,30.0


In [25]:
# Save the wrangled orders dataframe as orders_wrangled.csv, without the row index
orders_out = os.path.join(path, 'Data', 'Prepared Data', 'orders_wrangled.csv')
df_order.to_csv(orders_out, index=False)

In [27]:
# Check the reshaped department dataframe before exporting it
df_dept_new.head()

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol


In [30]:
# Save the reshaped department dataframe as departments_wrangled.csv.
# Unlike the orders export, index= is left at its default here.
# NOTE(review): presumably so the department ids held in the index are saved
# as well — confirm; the index column may end up without a header in the file.
departments_out = os.path.join(path, 'Data', 'Prepared Data', 'departments_wrangled.csv')
df_dept_new.to_csv(departments_out)

# End