# 4.7 Deriving New Variables

### Points in this script:

##### 1. Imports
##### 2. Creating a Subset of Data
##### 3. Deriving a new Variable
##### 4. Using loc()
##### 5. For-Loop
##### 6. Changing labels for client
##### 7. Calculating busiest hour of day
##### 8. Exports

### 1. Imports

In [None]:
# Import libraries

import pandas as pd
import numpy as np
import os

In [None]:
# Load the pickled dataframe 'ords_prods_merged.pkl' from the prepared-data folder

path = r'C:\Users\walls\Documents\Coding\Data Analysis\CareerFoundry\Data Immersion A4\Instacart Basket Analysis 01-25'
prepared_data_dir = os.path.join(path, 'Data', 'Prepared Data')
df_op_merged = pd.read_pickle(os.path.join(prepared_data_dir, 'ords_prods_merged.pkl'))

In [None]:
# Preview the first rows of the loaded dataframe
df_op_merged.head()

In [None]:
# Row and column counts of the loaded dataframe
df_op_merged.shape

### 2. Create Subset

In [None]:
# Create a subset of the first 1,000,000 rows.
# .copy() makes the subset an independent dataframe, so adding a column to it
# later does not raise pandas' SettingWithCopyWarning or depend on whether the
# slice is a view or a copy.

df_opm_subset = df_op_merged[:1000000].copy()

In [None]:
# Row and column counts of the subset
df_opm_subset.shape

In [None]:
# Preview the first rows of the subset
df_opm_subset.head()

### 3. Deriving New Variable

In [None]:
# Create function called price_label that applies string label to every row

def price_label(row):
    """Return a price-range label for one dataframe row.

    Args:
        row: A mapping-like row (e.g. a pandas Series) with a 'prices' entry.

    Returns:
        'Low-range product' for prices <= 5, 'Mid-range product' for prices
        above 5 and up to 15, 'High-range product' for prices above 15, and
        'Not enough data' when no comparison holds (e.g. a NaN price).
    """
    price = row['prices']

    if price <= 5:
        return 'Low-range product'
    if price <= 15:
        return 'Mid-range product'
    if price > 15:
        # Same wording as the other two labels and as the loc-based labels.
        return 'High-range product'
    # Only reached when every comparison is False, e.g. a missing (NaN) price.
    return 'Not enough data'

In [None]:
# Label every row of the subset with price_label and store the labels in a new column

subset_price_labels = df_opm_subset.apply(price_label, axis='columns')
df_opm_subset['price_range'] = subset_price_labels

In [None]:
# Frequency of each price label in the subset; dropna = False also counts missing labels
df_opm_subset['price_range'].value_counts(dropna = False)

In [None]:
# Highest value in the 'prices' column of the subset (most expensive product)

df_opm_subset['prices'].max()

##### Observations: 
1. The max value seems too high. Further investigation needed.

### 4. Using loc()

In [None]:
# Label rows priced above 15 as high-range in the price_range_loc column
df_op_merged.loc[df_op_merged['prices'] > 15, 'price_range_loc'] = 'High-range product'

In [None]:
# Label rows priced above 5 and up to 15 as mid-range.
# Both conditions are built from df_op_merged so the boolean mask covers every
# row of the dataframe being assigned to, not only the rows of the subset.
df_op_merged.loc[(df_op_merged['prices'] > 5) & (df_op_merged['prices'] <= 15), 'price_range_loc'] = 'Mid-range product'

In [None]:
# Label rows priced at 5 or below as low-range in the price_range_loc column
df_op_merged.loc[df_op_merged['prices'] <= 5, 'price_range_loc'] = 'Low-range product'

In [None]:
# Frequency of each price label; dropna=False also counts rows that received
# no label (NaN), so gaps in the labelling are visible in this check.
df_op_merged['price_range_loc'].value_counts(dropna=False)

##### Observations:
1. Most customers seem to prefer budget-friendly, low-priced products.
2. Fewer customers buy higher-priced products.

### 5. For loop

In [None]:
# Count rows per value of orders_day_of_week (missing values included)

df_op_merged['orders_day_of_week'].value_counts(dropna = False)

##### Observations: 
1. Sunday and Monday have the highest number of orders.
2. Friday and Saturday have the lowest number of orders.

In [None]:
# Build one busyness label per row: day 0 is the busiest, day 4 the least busy,
# every other day is regular.

result = []

for day in df_op_merged["orders_day_of_week"]:
    if day == 0:
        label = "Busiest day"
    elif day == 4:
        label = "Least busy"
    else:
        label = "Regularly busy"
    result.append(label)

In [None]:
# Preview only the first labels: the list holds one entry per dataframe row,
# so echoing it whole would render an enormous cell output.
result[:10]

In [None]:
# Store the labels from result in a busiest_day column (one label per row, in row order)

df_op_merged['busiest_day'] = result

In [None]:
# Frequency of each busiest_day label (missing values included)
df_op_merged['busiest_day'].value_counts(dropna = False)

### 6. Client Changes for Labels

In [None]:
# Client-requested labels: days with value <= 1 share the "Busiest days" label,
# days 3 and 4 share the "Slowest days" label, every other day is regular.

result = []

for value in df_op_merged["orders_day_of_week"]:
    if value <= 1:
        result.append("Busiest days")
    # `value == 3 or 4` parses as `(value == 3) or 4`, which is always truthy,
    # so a membership test is used to match only days 3 and 4.
    elif value in (3, 4):
        result.append("Slowest days")
    else:
        result.append("Regularly busy")

# Write the new labels back; otherwise busiest_day keeps the earlier
# single-day labels and the frequency check that follows shows no change.
df_op_merged['busiest_day'] = result

In [None]:
# Frequency of each busiest_day label (missing values included)
df_op_merged['busiest_day'].value_counts(dropna = False)

#### Observations:
1. The count for "Regularly busy" decreased, while the counts for "Busiest days" and "Slowest days" increased.
2. Since each of these two labels now covers two days of the week instead of one, it makes sense that their counts are larger.

### 7. Busiest Hour of Day

In [None]:
# Count rows per value of order_hour_of_day (a frequency table, not a subset of rows)

busiest_hour_of_day = df_op_merged['order_hour_of_day'].value_counts()

In [None]:
# Summary statistics of the per-hour counts (describes the counts, not the hour values)

busiest_hour_of_day.describe()

In [None]:
# Display the number of rows for each order_hour_of_day value

df_op_merged['order_hour_of_day'].value_counts()

In [None]:
# Label hours 8 through 18 (both ends included) as the period with the most orders

daytime_hours = df_op_merged['order_hour_of_day'].between(8, 18)
df_op_merged.loc[daytime_hours, 'busiest_period_of_day'] = 'Most orders'

In [None]:
# Label hour 19 as the period with an average number of orders
df_op_merged.loc[df_op_merged['order_hour_of_day'] == 19, 'busiest_period_of_day'] = 'Average orders'

In [None]:
# Label hours 0 through 7 and 20 through 23 as the period with the fewest orders
early_hours = df_op_merged['order_hour_of_day'].between(0, 7)
late_hours = df_op_merged['order_hour_of_day'].between(20, 23)
df_op_merged.loc[early_hours | late_hours, 'busiest_period_of_day'] = 'Fewest orders'

In [None]:
# Check frequency of each period label; dropna=False also counts rows that
# received no label (NaN), matching the other frequency checks in this script.

df_op_merged['busiest_period_of_day'].value_counts(dropna=False)

In [None]:
# Row and column counts after adding the derived columns
df_op_merged.shape

In [None]:
# Preview the first rows, including the derived columns
df_op_merged.head()

##### Summary
1. 3 new columns added to df_op_merged -- price_range_loc, busiest_day, and busiest_period_of_day
2. df_op_merged shape (32404859, 17)

### 8. Exports

In [None]:
# Save the dataframe with its derived columns as a pickle in the prepared-data folder

derived_pickle_path = os.path.join(path, 'Data', 'Prepared Data', 'orders_products_merged_derived.pkl')
df_op_merged.to_pickle(derived_pickle_path)