# Table of content
## 1. Setting up
## 2. Column derivations for price
## 3. For loop
## 4. Column derivation for order frequency
## 5. Solving tasks 4.7

# 1. Setting up

In [1]:
# Import pandas, numpy and os

import pandas as pd
import numpy as np
import os

In [2]:
# creating path for later import/export

path = r"C:\Users\Anwender\Documents\07-2023 Instacart Basket Analysis\02 Data"

In [3]:
# importing df merged

df = pd.read_pickle(os.path.join(path, "Prepared Data", "df_merged.pkl"))

# 2. Column derivations for price

In [23]:
# limitting df to one million rows

df_subset = df[:1000000]

In [4]:
# defining new column derivate "price_label"

def price_label(row):

  if row["prices"] <= 5:
    return "Low-range product"
  elif (row["prices"] > 5) and (row["prices"] <= 15):
    return "Mid-range product"
  elif row["prices"] > 15:
    return "High-range product"
  else: return "Not enough data"

In [24]:
# applying definition to df_subset:

df_subset["price_range"] = df_subset.apply(price_label, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset['price_range'] = df_subset.apply(price_label, axis=1)


In [25]:
# checking for increased number of columns

df_subset.shape

(1000000, 19)

In [26]:
# checking to see output of definition price_range

df_subset["price_range"].value_counts(dropna=False)

Mid-range product    756450
Low-range product    243550
Name: price_range, dtype: int64

In [14]:
# checking for max price

df_subset["prices"].max()

14.8

In [15]:
# setting conditions with the loc function as an alternative to the price_range definition:

df_subset.loc[df["prices"]>15, "price_range_loc"] = "High-range product"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset.loc[df["prices"]>15, "price_range_loc"] = "High-range product"


In [16]:
df_subset.loc[(df["prices"]<=15) & (df_subset["prices"]>5), "price_range_loc"] = "Mid-range product"

In [17]:
df_subset.loc[df["prices"]<=5, "price_range_loc"] = "Low-range product"

In [18]:
# checking output

df_subset["price_range_loc"].value_counts(dropna = False)

Mid-range product    756450
Low-range product    243550
Name: price_range_loc, dtype: int64

In [5]:
# applying loc function to whole df

df.loc[df["prices"]>15, "price_range_loc"] = "High-range product"

In [6]:
df.loc[(df["prices"] <= 15) & (df["prices"] > 5), "price_range_loc"] = "Mid-range product"

In [7]:
df.loc[df["prices"]<=5, "price_range_loc"] = "Low-range product"

In [8]:
df["price_range_loc"].value_counts(dropna = False)

Mid-range product     21860860
Low-range product     10126321
High-range product      417678
Name: price_range_loc, dtype: int64

# 3. For loop

In [9]:
# trying out for loops
for x in range (1980, 1996):
    print ("Millenials born in %d" % (x))

Millenials born in 1980
Millenials born in 1981
Millenials born in 1982
Millenials born in 1983
Millenials born in 1984
Millenials born in 1985
Millenials born in 1986
Millenials born in 1987
Millenials born in 1988
Millenials born in 1989
Millenials born in 1990
Millenials born in 1991
Millenials born in 1992
Millenials born in 1993
Millenials born in 1994
Millenials born in 1995


# 4. Column derivation for order frequency

In [10]:
# checking frequency of orders per weekday through "orders_day_of_week" column

df["orders_day_of_week"].value_counts(dropna = False)

0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: orders_day_of_week, dtype: int64

In [11]:
# defining busy days by amount of orders per day for a summary

result = []

for value in df["orders_day_of_week"]:
  if value == 0:
    result.append("Busiest day")
  elif value == 4:
    result.append("Least busy")
  else:
    result.append("Regularly busy")

In [12]:
# applying definition to df

df["busiest_day"] = result

In [13]:
# frequency check of busiest day

df["busiest_day"].value_counts(dropna = False)

Regularly busy    22416875
Busiest day        6204182
Least busy         3783802
Name: busiest_day, dtype: int64

# Solving tasks 4.7

### Step 2 changing busiest day to busiest days (plural) and slowest days

In [14]:
# defining busiest and slowest days with a loop for result_2

result_2 = []

for value in df["orders_day_of_week"]:
  if value == 0 or value == 1:
    result_2.append("Busiest days")
  elif value == 3 or value == 4:
    result_2.append("Least busy")
  else:
    result_2.append("Regularly busy")

In [15]:
# applying result_2 to the df

df["busiest_days"] = result_2

In [30]:
# checking for accuracy with a frequency table

df["busiest_days"].value_counts(dropna = False)

Regularly busy    12916111
Busiest days      11864412
Least busy         7624336
Name: busiest_days, dtype: int64

#### The summarized values add up to the frequency checks above.

### Step 4 Identifying busiest hours of the day

In [31]:
# listing the amounts of orders per hour with a frequency table

df["order_hour_of_day"].value_counts(dropna = False)

10    2761760
11    2736140
14    2689136
15    2662144
13    2660954
12    2618532
16    2535202
9     2454203
17    2087654
8     1718118
18    1636502
19    1258305
20     976156
7      891054
21     795637
22     634225
23     402316
6      290493
0      218769
1      115700
5       87961
2       69375
4       53242
3       51281
Name: order_hour_of_day, dtype: int64

In [17]:
# setting conditions with the if function to test on subset df:

def busiest_period_of_day(row):

  if (row["order_hour_of_day"] <= 5) and (row["order_hour_of_day"] >= 0):
    return "Fewest orders"
  elif (row["order_hour_of_day"] <= 15) and (row["order_hour_of_day"] >= 10):
    return "Most orders"
  else: return "Average orders"

In [33]:
# applying busy_hours to df_subset:

df_subset["busiest_period_of_day"] = df_subset.apply(busiest_period_of_day, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset['busiest_period_of_day'] = df_subset.apply(busiest_period_of_day, axis=1)


In [34]:
# checking shape after application

df_subset.shape

(1000000, 16)

In [35]:
# checking frequency counts

df_subset["busiest_period_of_day"].value_counts(dropna=False)

Most orders       500305
Average orders    482954
Fewest orders      16741
Name: busiest_period_of_day, dtype: int64

In [36]:
# checking output

df_subset.head()

Unnamed: 0,order_id,user_id,amount_of_orders,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,price_range_loc,busiest_period_of_day
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,both,Mid-range product,Average orders
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Average orders
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Most orders
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Average orders
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Most orders


In [18]:
# applying busy_hours to df

df["busiest_period_of_day"] = df.apply(busiest_period_of_day, axis=1)

In [19]:
# checking output after application

df.head()

Unnamed: 0,order_id,user_id,amount_of_orders,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,price_range_loc,busiest_day,busiest_days,busiest_period_of_day
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Regularly busy,Average orders
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Least busy,Average orders
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Least busy,Most orders
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Least busy,Least busy,Average orders
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Least busy,Least busy,Most orders


In [20]:
# checking shape 

df.shape

(32404859, 18)

In [21]:
# checking frequency counts

df["busiest_period_of_day"].value_counts(dropna=False)

Most orders       16128666
Average orders    15679865
Fewest orders       596328
Name: busiest_period_of_day, dtype: int64

In [28]:
# exporting file as df_derived.pkl

df.to_pickle(os.path.join(path, "Prepared Data", "df_derived.pkl"))