# Table of content
## 01. Importing libraries and dfs (setup)
## 02. Data wrangling
## 03. Creating a data dictionary 
## 04. solving data wrangling tasks

## 01. Importing libraries and dfs

In [1]:
# import libraries
import pandas as pd
import numpy as np
import os

In [2]:
path = r"C:\Users\Anwender\Documents\07-2023 Instacart Basket Analysis\02 Data\Original Data"

In [3]:
# import data sets products.csv and orders.csv
df_prods = pd.read_csv(os.path.join(path, "products.csv"), index_col = False)
df_ords = pd.read_csv(os.path.join(path, "orders.csv"), index_col = False)

## 02. Data wrangling

In [4]:
# dropping "eval_set" column from orders.csv
df_ords.drop(columns = ["eval_set"])

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
...,...,...,...,...,...,...
3421078,2266710,206209,10,5,18,29.0
3421079,1854736,206209,11,4,10,30.0
3421080,626363,206209,12,1,12,18.0
3421081,2977660,206209,13,1,12,7.0


In [5]:
# checking column "days_since_prior_order" for values
df_ords["days_since_prior_order"].value_counts(dropna = False)

30.0    369323
7.0     320608
6.0     240013
4.0     221696
3.0     217005
5.0     214503
NaN     206209
2.0     193206
8.0     181717
1.0     145247
9.0     118188
14.0    100230
10.0     95186
13.0     83214
11.0     80970
12.0     76146
0.0      67755
15.0     66579
16.0     46941
21.0     45470
17.0     39245
20.0     38527
18.0     35881
19.0     34384
22.0     32012
28.0     26777
23.0     23885
27.0     22013
24.0     20712
25.0     19234
29.0     19191
26.0     19016
Name: days_since_prior_order, dtype: int64

In [6]:
# renaming column order_dow into orders_day_of_the_week
df_ords.rename(columns = {"order_dow" : "orders_day_of_week"}, inplace = True)

In [7]:
# checking output
df_ords.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [8]:
# changing datatype of order_id column to string
df_ords["order_id"] = df_ords["order_id"].astype("str")

In [9]:
# checking output
df_ords["order_id"].dtype

dtype('O')

## 03. Creating a data dictionary

In [10]:
# import data set departments.csv
df_dep = pd.read_csv(os.path.join(path, "departments.csv"), index_col = False)

In [11]:
df_dep.head()

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


In [12]:
# Transposing departments.csv
df_dep.T

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [13]:
# creating transposed version of departments.csv
df_dep_t = df_dep.T

In [14]:
# creating an index
df_dep_t.reset_index()

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


In [16]:
#taking row 0 as a header for departments table
new_header = df_dep_t.iloc[0]

In [17]:
new_header

0    department
Name: department_id, dtype: object

In [18]:
# copying every row 1 to the end from df_dep_t into a new copy df_dep_t_new
df_dep_t_new = df_dep_t[1:]

In [19]:
df_dep_t_new

Unnamed: 0,0
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


In [20]:
# assign new column name to created table
df_dep_t_new.columns = new_header

In [21]:
df_dep_t_new

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


In [22]:
# turning df_dep_t_new to a data dictionary:
data_dict = df_dep_t_new.to_dict("index")

In [23]:
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [24]:
print(data_dict.get("19"))

{'department': 'snacks'}


In [25]:
#creating subset 
df_snacks =  df_prods[df_prods["department_id"]==19]

In [26]:
#subset version 2 and 3
df_snacks_2 = df_prods.loc[df_prods["department_id"] == 19]
df_snacks_3 = df_prods.loc[df_prods["department_id"].isin([19])]

## 04. Data wrangling tasks

In [27]:
df_ords.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [28]:
df_ords.dtypes

order_id                   object
user_id                     int64
eval_set                   object
order_number                int64
orders_day_of_week          int64
order_hour_of_day           int64
days_since_prior_order    float64
dtype: object

In [29]:
#Step 2 changing datatype of user_id column to string
df_ords["user_id"] = df_ords["user_id"].astype("str")

In [33]:
df_ords.dtypes

order_id                   object
user_id                    object
eval_set                   object
order_number                int64
orders_day_of_week          int64
order_hour_of_day           int64
days_since_prior_order    float64
dtype: object

In [34]:
print(type("user_id"))

<class 'str'>


In [36]:
#Step 3 changing name of column order_number to amount_of_orders
df_ords.rename(columns = {'order_number' : 'amount_of_orders'}, inplace = True)

In [37]:
df_ords.head()

Unnamed: 0,order_id,user_id,eval_set,amount_of_orders,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [42]:
#Step 4 find busiest order hour /frequency of order_hour_of_day
df_ords["order_hour_of_day"].value_counts()

10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: order_hour_of_day, dtype: int64

#### The busiest hour for ordering is at 10am.

In [44]:
#Step 5 Determine the meaning behind a value of 4 in the "department_id" column within the df_prods dataframe using a data dictionary.
print(data_dict.get("4"))

{'department': 'produce'}


In [50]:
print(df_prods.loc[df_prods["department_id"]==4])

       product_id                                      product_name  aisle_id  \
30             31                                White Pearl Onions       123   
42             43                               Organic Clementines       123   
44             45                                 European Cucumber        83   
65             66                         European Style Spring Mix       123   
88             89                    Yogurt Fruit Dip Sliced Apples       123   
...           ...                                               ...       ...   
49582       49578                                Black Garlic Bulbs       123   
49623       49619                                        Opo Squash        83   
49639       49635  Baby Food Blueberry, Parsnip & Buckwheat Stage 2        83   
49661       49657                                 Cabernet Tomatoes        83   
49687       49683                                    Cucumber Kirby        83   

       department_id  price

#### The value 4 means produce.

In [55]:
#Step 6 Creating subset df_breakfast for breakfast item sales
print(data_dict.get("14"))

{'department': 'breakfast'}


In [56]:
df_breakfast = df_prods.loc[df_prods["department_id"]==14]

In [66]:
df_breakfast

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6
...,...,...,...,...,...
49330,49326,Cereal Variety Fun Pack,121,14,9.1
49395,49391,Light and Fluffy Buttermilk Pancake Mix,130,14,2.0
49547,49543,Chocolate Cheerios Cereal,121,14,10.8
49637,49633,Shake 'N Pour Buttermilk Pancake Mix,130,14,14.2


In [63]:
#Step 7 Creating subset containing alcohol, deli, beverages, and meat/seafood
df_adbms = df_prods.loc[df_prods["department_id"].isin([5,20,7,12])]

In [65]:
df_adbms

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
10,11,Peach Mango Juice,31,7,2.8
16,17,Rendered Duck Fat,35,12,17.1
...,...,...,...,...,...
49676,49672,Cafe Mocha K-Cup Packs,26,7,6.5
49679,49675,Cinnamon Dolce Keurig Brewed K Cups,26,7,14.0
49680,49676,Ultra Red Energy Drink,64,7,14.5
49686,49682,California Limeade,98,7,4.3


In [67]:
#Step 8 How many rows does the last dataframe you created have?
#Answer: 7650 rows.
df_adbms.shape

(7650, 5)

In [79]:
#Step 9 Creating a subset where user_id is 1
df_user_1 = df_ords[df_ords["user_id"]=="1"]

In [80]:
df_user_1

Unnamed: 0,order_id,user_id,eval_set,amount_of_orders,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0
5,3367565,1,prior,6,2,7,19.0
6,550135,1,prior,7,1,9,20.0
7,3108588,1,prior,8,1,14,14.0
8,2295261,1,prior,9,1,16,0.0
9,2550362,1,prior,10,4,8,30.0


In [85]:
#Step 10 basic info on df_user_1
df_user_1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11 entries, 0 to 10
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   order_id                11 non-null     object 
 1   user_id                 11 non-null     object 
 2   eval_set                11 non-null     object 
 3   amount_of_orders        11 non-null     int64  
 4   orders_day_of_week      11 non-null     int64  
 5   order_hour_of_day       11 non-null     int64  
 6   days_since_prior_order  10 non-null     float64
dtypes: float64(1), int64(3), object(3)
memory usage: 704.0+ bytes


In [86]:
#statistics on df_user_1
df_user_1.describe()

Unnamed: 0,amount_of_orders,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,11.0,11.0,11.0,10.0
mean,6.0,2.636364,10.090909,19.0
std,3.316625,1.286291,3.477198,9.030811
min,1.0,1.0,7.0,0.0
25%,3.5,1.5,7.5,14.25
50%,6.0,3.0,8.0,19.5
75%,8.5,4.0,13.0,26.25
max,11.0,4.0,16.0,30.0


In [91]:
#Step 11 exporting wrangled orders csv
path_2 = r"C:\Users\Anwender\Documents\07-2023 Instacart Basket Analysis\02 Data\Prepared Data"

In [93]:
df_ords.to_csv(os.path.join(path_2,"orders_wrangled.csv"))

In [94]:
#Step 12 exporting wrangled department csv
df_dep_t_new.to_csv(os.path.join(path_2,"departments_wrangled.csv"))