In [1]:
# Import the packages
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats

# Analyze the Summary Statistics

In [2]:
# Load the data.
# The CSV location can be overridden with the MMA_MART_CSV environment variable so the
# notebook can run on another machine; the original local path is kept as the fallback.
file_path = os.environ.get(
    'MMA_MART_CSV',
    r'C:\Users\Teresa - School\Documents\Projects\Rotman_Datathon_23\clean_mma_mart.csv',
)

mma_mart = pd.read_csv(file_path)

# Preview the data (the printed frame is truncated to its first and last rows)
print(mma_mart)

        order_id  product_id                                   product_name  \
0              1       49302                               Bulgarian Yogurt   
1              1       11109  Organic 4% Milk Fat Whole Milk Cottage Cheese   
2              1       10246                          Organic Celery Hearts   
3              1       49683                                 Cucumber Kirby   
4              1       43633           Lightly Smoked Sardines in Olive Oil   
...          ...         ...                                            ...   
984965    100000       30169        Total 2% All Natural Plain Greek Yogurt   
984966    100000       38734                           Wheat Sandwich Thins   
984967    100000       36759         Unscented Long Lasting Stick Deodorant   
984968    100000       37107                                   Ground Cumin   
984969    100000       31506                         Extra Virgin Olive Oil   

        aisle_id                 aisle  department_

In [3]:
# Tally the distinct values of each identifier column in one call
unique_counts = mma_mart[['order_id', 'product_name', 'aisle_id', 'department_id']].nunique()

# Number of distinct orders
distinct_orders = int(unique_counts['order_id'])
print(f"Count of distinct orders: {distinct_orders}")

# Number of distinct products (identified by product name)
distinct_products = int(unique_counts['product_name'])
print(f"Count of distinct products: {distinct_products}")

# Number of distinct aisles
distinct_aisles = int(unique_counts['aisle_id'])
print(f"Count of distinct aisles: {distinct_aisles}")

# Number of distinct departments
distinct_dept = int(unique_counts['department_id'])
print(f"Count of distinct departments: {distinct_dept}")

Count of distinct orders: 97828
Count of distinct products: 34552
Count of distinct aisles: 133
Count of distinct departments: 20


In [4]:
# Group the rows by order once and reuse the grouping for every average below
rows_by_order = mma_mart.groupby('order_id')

# Average number of distinct products per order
avg_products_by_order = rows_by_order['product_name'].nunique().mean()
print("Average Number of Products by Order:", avg_products_by_order)

# Average number of distinct aisles per order
avg_aisles_by_order = rows_by_order['aisle_id'].nunique().mean()
print("Average Number of Aisles by Order:", avg_aisles_by_order)

# Average number of distinct departments per order
avg_dept_by_order = rows_by_order['department_id'].nunique().mean()
print("Average Number of Departments by Order:", avg_dept_by_order)

Average Number of Products by Order: 10.068385329353559
Average Number of Aisles by Order: 7.239910863965327
Average Number of Departments by Order: 4.718485505172343


# Find the Top Products

In [5]:
# Tally how many rows each product name appears in
# (value_counts returns the tally ordered from most to least frequent)
product_name_column = mma_mart['product_name']
product_counts = product_name_column.value_counts()

# Keep the first 1000 entries, i.e. the 1000 most purchased products
top_products_by_count = product_counts.iloc[:1000]
print(top_products_by_count)

Banana                                         14494
Bag of Organic Bananas                         11694
Organic Strawberries                            8081
Organic Baby Spinach                            7369
Organic Hass Avocado                            6411
                                               ...  
Multigrain Pita Chips                            166
Frosted Mini-Wheats Original Cereal              165
Crushed Tomatoes With Basil                      165
Naked Green Machine Boosted Juice Smoothie       165
Cannellini White Kidney Beans No Salt Added      165
Name: product_name, Length: 1000, dtype: int64


In [6]:
# Count the rows for every product (keeping its department and aisle labels
# alongside it) and order the result from most to least purchased
top_product_counts = (
    mma_mart
    .groupby(['product_id', 'product_name', 'department_id', 'department', 'aisle_id', 'aisle'])
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)
print(top_product_counts)

       product_id                                    product_name  \
17302       24852                                          Banana   
9083        13176                          Bag of Organic Bananas   
14668       21137                            Organic Strawberries   
15219       21903                            Organic Baby Spinach   
32843       47209                            Organic Hass Avocado   
...           ...                                             ...   
15302       22013                                     Witch Hazel   
15303       22014                           Udo's Oil 3-6-9 Blend   
4218         6070                    Sesame Ginger Teriyaki Sauce   
1239         1813                          Sodel Alm Mlk Wlnut 48   
6169         8890  Light Parmesan Peppercorn Ranch Salad Dressing   

       department_id     department  aisle_id                       aisle  \
17302              4        produce        24                fresh fruits   
9083             

In [9]:
# Select the rows whose department name contains 'frozen'
is_frozen = mma_mart['department'].str.contains('frozen')
frozen_foods = mma_mart[is_frozen]
print(frozen_foods)

        order_id  product_id                                   product_name  \
68             7       46802                               Pineapple Chunks   
100           11       30162         Teriyaki & Pineapple Chicken Meatballs   
114           12       38050  All Natural Boneless Skinless Chicken Breasts   
118           12       29471                        Combination Pizza Rolls   
135           14         162                 Organic Mini Homestyle Waffles   
...          ...         ...                                            ...   
984887     99993       43749                 Organic Ice Cream Vanilla Bean   
984891     99993        5537   Dairy Free Coconut Milk Frozen Dessert Minis   
984895     99993       26128                           Organic Mango Chunks   
984921     99996        9434                            Bag of Large Lemons   
984926     99996       32691                              Vanilla Ice Cream   

        aisle_id                    aisle  departme

In [10]:
# Count the rows for each frozen product (with its department and aisle labels),
# order the counts from most to least purchased, and print the result
frozen_group_sizes = frozen_foods.groupby(
    ['product_id', 'product_name', 'department_id', 'department', 'aisle_id', 'aisle']
).size()
frozen_product_counts = frozen_group_sizes.reset_index(name='count').sort_values(by='count', ascending=False)

print(frozen_product_counts)

      product_id                                       product_name  \
607         9076                                        Blueberries   
1323       20995                           Organic Broccoli Florets   
1542       24489                         Organic Whole Strawberries   
1147       17948                    Frozen Organic Wild Blueberries   
2945       46802                                   Pineapple Chunks   
...          ...                                                ...   
2303       36825                       Cookie Dough Grand Ice Cream   
1547       24558       Strawberry Snickerdoole Ice Cream Sandwiches   
484         7002      Chicken, Broccoli & Cheddar Frozen Sandwiches   
2717       43493                Boneless Pork Rib Shaped Potty Meal   
2606       41835  4 Pull-Apart Dipping Strips Four Cheese with M...   

      department_id department  aisle_id           aisle  count  
607               1     frozen       116  frozen produce   1759  
1323           

In [11]:
# Keep the first 100 rows of the frozen product counts (the top 100 frozen products sold)
top_frozen_products_sold = frozen_product_counts.iloc[:100]

# Write them to a CSV file without the index column
top_frozen_products_sold.to_csv(path_or_buf="top_frozen_products.csv", index=False, encoding='utf8')

In [12]:
# Keep every row whose department name does not contain 'frozen'
frozen_mask = mma_mart['department'].str.contains('frozen')
non_frozen_products = mma_mart[~frozen_mask]
print(non_frozen_products)

        order_id  product_id                                   product_name  \
0              1       49302                               Bulgarian Yogurt   
1              1       11109  Organic 4% Milk Fat Whole Milk Cottage Cheese   
2              1       10246                          Organic Celery Hearts   
3              1       49683                                 Cucumber Kirby   
4              1       43633           Lightly Smoked Sardines in Olive Oil   
...          ...         ...                                            ...   
984965    100000       30169        Total 2% All Natural Plain Greek Yogurt   
984966    100000       38734                           Wheat Sandwich Thins   
984967    100000       36759         Unscented Long Lasting Stick Deodorant   
984968    100000       37107                                   Ground Cumin   
984969    100000       31506                         Extra Virgin Olive Oil   

        aisle_id                 aisle  department_

In [13]:
# Departments whose products are treated as refrigerated
list_refrigerated_dept = ['dairy eggs', 'meat seafood', 'deli']

# Filter for refrigerated products.
# isin() matches whole department names exactly. A regex built with '|'.join(...) and
# str.contains() would instead match substrings (e.g. any department merely containing
# "deli"), would not escape regex metacharacters, and would produce NaN for missing
# departments, which cannot be used as a boolean mask.
refrigerated_products = mma_mart[mma_mart['department'].isin(list_refrigerated_dept)]

print(refrigerated_products)

        order_id  product_id  \
0              1       49302   
1              1       11109   
7              1       22035   
8              2       33120   
17             3       33754   
...          ...         ...   
984952     99999       10034   
984953     99999       44085   
984954     99999        9558   
984964    100000       30489   
984965    100000       30169   

                                             product_name  aisle_id  \
0                                        Bulgarian Yogurt       120   
1           Organic 4% Milk Fat Whole Milk Cottage Cheese       108   
7                             Organic Whole String Cheese        21   
8                                      Organic Egg Whites        86   
17      Total 2% with Strawberry Lowfat Greek Strained...       120   
...                                                   ...       ...   
984952                Reduced Fat Mozarella String Cheese        21   
984953                   Vanilla Light & Fit Gr

In [14]:
# Count the rows for each refrigerated product (with its department and aisle
# labels) and order the counts from most to least purchased
refrigerated_product_counts = (
    refrigerated_products
    .groupby(['product_id', 'product_name', 'department_id', 'department', 'aisle_id', 'aisle'])
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

# Preview the results
print(refrigerated_product_counts)

      product_id                                       product_name  \
2606       27845                                 Organic Whole Milk   
4604       49235                                Organic Half & Half   
2847       30489                                    Original Hummus   
2533       27086                                        Half & Half   
2054       22035                        Organic Whole String Cheese   
...          ...                                                ...   
2917       31167                 Natural Casing Old Fashion Wieners   
3903       41904          Kefir, Organic, Mango, Orange & Pineapple   
3904       41907                                    Cured Salt Pork   
802         8905  Salumeria Salamini Series Columbus Peppered Sa...   
0              9                  Light Strawberry Blueberry Yogurt   

      department_id    department  aisle_id                   aisle  count  
2606             16    dairy eggs        84                    milk   

In [15]:
# Keep the first 100 rows of the refrigerated product counts (the top 100 refrigerated products sold)
top_refrigerated_products_sold = refrigerated_product_counts.iloc[:100]

# Write them to a CSV file without the index column
top_refrigerated_products_sold.to_csv(path_or_buf="top_refrigerated_products_sold.csv", index=False, encoding='utf8')

In [16]:
# Subset data for non-frozen and non-refrigerated products:
# drop every non-frozen row whose department occurs among the refrigerated rows
refrigerated_departments = refrigerated_products['department'].unique()
in_refrigerated_dept = non_frozen_products['department'].isin(refrigerated_departments)
non_frozen_refrigerated_products = non_frozen_products[~in_refrigerated_dept]

# Preview the results
print(non_frozen_refrigerated_products)

        order_id  product_id                            product_name  \
2              1       10246                   Organic Celery Hearts   
3              1       49683                          Cucumber Kirby   
4              1       43633    Lightly Smoked Sardines in Olive Oil   
5              1       13176                  Bag of Organic Bananas   
6              1       47209                    Organic Hass Avocado   
...          ...         ...                                     ...   
984963    100000       19508                          Corn Tortillas   
984966    100000       38734                    Wheat Sandwich Thins   
984967    100000       36759  Unscented Long Lasting Stick Deodorant   
984968    100000       37107                            Ground Cumin   
984969    100000       31506                  Extra Virgin Olive Oil   

        aisle_id                 aisle  department_id     department  
2             83      fresh vegetables              4        pro

In [17]:
# Check the results: no frozen or refrigerated department should remain in the subset.
# Looking for 'deli' alone would miss the other excluded departments, so the pattern
# covers 'frozen' plus every refrigerated department. Substring matching is kept on
# purpose so the check is at least as broad as the filters. An empty frame is expected.
excluded_dept_pattern = '|'.join(['frozen', *list_refrigerated_dept])
leftover_excluded_rows = non_frozen_refrigerated_products[
    non_frozen_refrigerated_products['department'].str.contains(excluded_dept_pattern)
]
print(leftover_excluded_rows)

Empty DataFrame
Columns: [order_id, product_id, product_name, aisle_id, aisle, department_id, department]
Index: []


In [None]:
# TODO: this cell was left unfinished ("Create a ...") - describe and implement the next step

# Finding the Top Products