# Importing Libraries and Dataframe

In [22]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [23]:
# Create path
path = r'C:\Users\Neena Tilton\Dropbox\Projects\01_2020_InstacartBasket'

In [24]:
# Import dataframe saved as pickel
ords_prods_merged = pd.read_pickle(os.path.join(path, '02_Data', 'PreparedData', 'orders_products_merged.pkl'))

In [25]:
# Create a subset to avoid potential issues dealing with user-defined functions
df = ords_prods_merged[:1000000]

In [26]:
df.shape

(1000000, 14)

In [27]:
df.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_time_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,both
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both


# Creating 'price_label' Column

## Using User-Defined Function

In [7]:
# User defined function

def price_label(row):
    if row['prices'] <= 5:
        return 'Low-range product'
    elif (row['prices'] > 5) and (row['prices'] <= 15):
        return 'Mid-range product'
    elif row['prices'] > 15:
        return 'High range'
    else: return 'Not enough data'

In [8]:
# Test the new function
df['price_range'] = df.apply(price_label, axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['price_range'] = df.apply(price_label, axis = 1)


In [9]:
df['price_range'].value_counts(dropna = False)

Mid-range product    756450
Low-range product    243550
Name: price_range, dtype: int64

In [10]:
df['prices'].max()

14.8

## Using loc() function

In [11]:
# Performing same task as user-defined function above, using loc() and new column named 'price_range_loc'
df.loc[df['prices'] > 15, 'price_range_loc'] = 'High Range Product'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [12]:
df.loc[(df['prices'] <= 15) & (df['prices'] > 5), 'price_range_loc'] = 'Mid Range Product'

In [13]:
df.loc[df['prices'] <= 5, 'price_range_loc'] = 'Low Range Product'

In [14]:
# performing value count to see if output is same as user-defined function as above. 
df['price_range_loc'].value_counts(dropna = False)

Mid Range Product    756450
Low Range Product    243550
Name: price_range_loc, dtype: int64

## Using loc() function on whole dataframe

In [16]:
# Running the loc() function on the whole dataframe (not just the first million rows)
ords_prods_merged.loc[ords_prods_merged['prices'] > 15, 'price_range_loc'] = 'High Range Product'

In [17]:
ords_prods_merged.loc[(ords_prods_merged['prices'] <= 15) & (ords_prods_merged['prices'] > 5), 'price_range_loc'] = 'Mid Range Product'

In [18]:
ords_prods_merged.loc[ords_prods_merged['prices'] <= 5, 'price_range_loc'] = 'Low Range Product'

In [19]:
ords_prods_merged['price_range_loc'].value_counts(dropna = False)

Mid Range Product     21860860
Low Range Product     10126321
High Range Product      417678
Name: price_range_loc, dtype: int64

# Creating 'busiest_day' Column

## Using For-Loops


In [20]:
for x in range(30,45):
    print("My age is %d" % (x))

My age is 30
My age is 31
My age is 32
My age is 33
My age is 34
My age is 35
My age is 36
My age is 37
My age is 38
My age is 39
My age is 40
My age is 41
My age is 42
My age is 43
My age is 44


### For-loops for Instacard Project

In [28]:
# Goal is to create a new column that summarizes how busy each day of the week is.
# First, we find out which day has the most orders by using value_count
ords_prods_merged['orders_day_of_week'].value_counts(dropna = False)

0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: orders_day_of_week, dtype: int64

In [29]:
result = []

for value in ords_prods_merged['orders_day_of_week']:
    
    if value == 0:
        result.append('Busiest day')
    elif value == 4:
        result.append('Least busy')
    else:
        result.append('Regular busy')

In [30]:
# Combine the new array named 'result' onto the dataframe
ords_prods_merged['busiest_day'] = result

In [31]:
# Check value count in the newly created column "busiest_day"
ords_prods_merged['busiest_day'].value_counts(dropna = False)

Regular busy    22416875
Busiest day      6204182
Least busy       3783802
Name: busiest_day, dtype: int64

# -------------
# 4.7 Task Exercise

## Step 2)
## Create a new column named "Busiest_days" showing two busiest and two slowest days. 

In [32]:
result_b = []

for value in ords_prods_merged['orders_day_of_week']:
    
    if value == 0 or value == 1:
        result_b.append('Busiest days')
    elif value == 4 or value == 3:
        result_b.append('Slowest days')
    else:
        result_b.append('Regular days')

In [33]:
ords_prods_merged['Busiest_days'] = result_b

## Step 3)
## Check the values of this new column for accuracy. 

In [34]:
ords_prods_merged['Busiest_days'].value_counts(dropna = False)

Regular days    12916111
Busiest days    11864412
Slowest days     7624336
Name: Busiest_days, dtype: int64

In [35]:
ords_prods_merged.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_time_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,busiest_day,Busiest_days
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,both,Regular busy,Regular days
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both,Regular busy,Slowest days
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both,Regular busy,Slowest days
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both,Least busy,Slowest days
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both,Least busy,Slowest days


## Step 4)
## Create a new column named "busiest_period_of_day"; identify busiest hours of the day, label periods of time (rather than by hour) as "Most orders," "Average orders," or "Fewest orders." 

In [37]:
# Identify busiest hours
ords_prods_merged['order_time_of_day'].value_counts(dropna = False)

10    2761760
11    2736140
14    2689136
15    2662144
13    2660954
12    2618532
16    2535202
9     2454203
17    2087654
8     1718118
18    1636502
19    1258305
20     976156
7      891054
21     795637
22     634225
23     402316
6      290493
0      218769
1      115700
5       87961
2       69375
4       53242
3       51281
Name: order_time_of_day, dtype: int64

In [None]:
# We need to categorize the hours into three time-frames: 
# "Most orders" time-frame is 9-16 (9am - 4pm) with 2.4 million + order per hour
# "Average orders" time-frame is 7-8 (7am - 8am) & 17-23 (5pm - 11pm) with orders between 300,000 ~ 2.4 million
# "Fewest orders" time-frame is 0-6 (12am - 6am) with less than 300,000 orders per hour

In [38]:
# Since we performing a task on whole dataframe, it is best to use loc() function
# loc() function to categorize 0-6 time-frame:
ords_prods_merged.loc[ords_prods_merged['order_time_of_day'] <= 6, 'busiest_period_of_day'] = 'fewest orders'

In [39]:
# loc() function to categorize 7-8 time-frame:
ords_prods_merged.loc[(ords_prods_merged['order_time_of_day'] > 6) & (ords_prods_merged['order_time_of_day'] < 9), 'busiest_period_of_day'] = 'average orders'

In [40]:
# loc() function to categorize 17-24 time-frame:
ords_prods_merged.loc[(ords_prods_merged['order_time_of_day'] > 16) & (ords_prods_merged['order_time_of_day'] < 24), 'busiest_period_of_day'] = 'average orders'

In [41]:
# loc() function to categorize 9-16 time-frame:
ords_prods_merged.loc[(ords_prods_merged['order_time_of_day'] > 8) & (ords_prods_merged['order_time_of_day'] < 17), 'busiest_period_of_day'] = 'most orders'

## Step 5)
## Print the frequency of this new column.

In [42]:
ords_prods_merged['busiest_period_of_day'].value_counts(dropna = False)

most orders       21118071
average orders    10399967
fewest orders       886821
Name: busiest_period_of_day, dtype: int64

In [43]:
# Checking head() to see what the column looks like. 
ords_prods_merged.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_time_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,busiest_day,Busiest_days,busiest_period_of_day
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,both,Regular busy,Regular days,average orders
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both,Regular busy,Slowest days,average orders
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both,Regular busy,Slowest days,most orders
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both,Least busy,Slowest days,average orders
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both,Least busy,Slowest days,most orders


## Step 6)
## Ensure your notebook is clean and well commented.

In [None]:
# done.

## Step 7) 
## Export your dataframe as a pickle file into the "PreparedData" folder. 

In [44]:
ords_prods_merged.to_pickle(os.path.join(path, '02_Data', 'PreparedData', 'orders_products_merged.pkl'))