In [1]:
# Import the packages
import pandas as pd 
from tabulate import tabulate
from prettytable import PrettyTable

## Load and Check the Data

In [2]:
# load the data
file_path = r'C:\Users\Teresa - School\Documents\Projects\Rotman_Datathon_23\mma_mart.csv'

mma_mart = pd.read_csv(file_path)

# Preview the data
print(mma_mart)

        order_id  product_id                                   product_name  \
0              1       49302                               Bulgarian Yogurt   
1              1       11109  Organic 4% Milk Fat Whole Milk Cottage Cheese   
2              1       10246                          Organic Celery Hearts   
3              1       49683                                 Cucumber Kirby   
4              1       43633           Lightly Smoked Sardines in Olive Oil   
...          ...         ...                                            ...   
987254    100000       30169        Total 2% All Natural Plain Greek Yogurt   
987255    100000       38734                           Wheat Sandwich Thins   
987256    100000       36759         Unscented Long Lasting Stick Deodorant   
987257    100000       37107                                   Ground Cumin   
987258    100000       31506                         Extra Virgin Olive Oil   

        aisle_id                 aisle  department_

In [3]:
# Check for missing values

print(mma_mart.isnull().sum())

order_id         0
product_id       0
product_name     0
aisle_id         0
aisle            0
department_id    0
department       0
dtype: int64


In [4]:
# Check the data types

mma_mart.dtypes

order_id          int64
product_id        int64
product_name     object
aisle_id          int64
aisle            object
department_id     int64
department       object
dtype: object

# #  Count the Number of Values by Column

In [5]:
# Count the number of values by product_name
# Import the counter module 
from collections import Counter

product_name_values = mma_mart['product_name']
product_name_counts = Counter(product_name_values) 

# Count the number of values by aisle names
aisle_name_values = mma_mart['aisle']
aisle_name_counts = Counter(aisle_name_values)

# Count the number of values by department names 
department_name_values = mma_mart['department']
department_name_counts = Counter(department_name_values)

In [6]:
# Check the results for product names
print(product_name_counts)

Counter({'Banana': 14494, 'Bag of Organic Bananas': 11694, 'Organic Strawberries': 8081, 'Organic Baby Spinach': 7369, 'Organic Hass Avocado': 6411, 'Organic Avocado': 5313, 'Large Lemon': 4688, 'Strawberries': 4340, 'Limes': 4225, 'Organic Raspberries': 4164, 'Organic Whole Milk': 4089, 'Organic Yellow Onion': 3360, 'Organic Garlic': 3290, 'Organic Zucchini': 3203, 'Organic Blueberries': 3107, 'Cucumber Kirby': 2905, 'Organic Lemon': 2718, 'Organic Fuji Apple': 2692, 'Apple Honeycrisp Organic': 2579, 'Organic Grape Tomatoes': 2553, 'Seedless Red Grapes': 2520, 'Organic Cucumber': 2479, 'Honeycrisp Apple': 2467, 'Organic Half & Half': 2401, 'Organic Baby Carrots': 2348, 'Organic Gala Apples': 2307, 'Organic Large Extra Fancy Fuji Apple': 2272, 'Sparkling Water Grapefruit': 2270, 'Carrots': 2224, 'Yellow Onions': 2195, 'Organic Baby Arugula': 2180, 'Fresh Cauliflower': 2151, 'Organic Small Bunch Celery': 2145, 'Original Hummus': 2136, 'Organic Cilantro': 2100, 'Michigan Organic Kale': 2

In [7]:
# Check the results for aisle names 
print(aisle_name_counts)

Counter({'fresh fruits': 110888, 'fresh vegetables': 103891, 'packaged vegetables fruits': 54207, 'yogurt': 43857, 'packaged cheese': 29770, 'milk': 26942, 'water seltzer sparkling water': 25354, 'chips pretzels': 21958, 'soy lactosefree': 19566, 'bread': 17806, 'refrigerated': 17663, 'frozen produce': 16000, 'ice cream ice': 15227, 'crackers': 14020, 'eggs': 13874, 'energy granola bars': 13861, 'lunch meat': 12003, 'frozen meals': 11923, 'baby food formula': 11660, 'fresh herbs': 11339, 'cereal': 11328, 'soft drinks': 10990, 'fresh dips tapenades': 10814, 'soup broth bouillon': 10383, 'juice nectars': 10142, 'baking ingredients': 9757, 'cream': 9592, 'nuts seeds dried fruit': 9419, 'other creams cheeses': 9374, 'hot dogs bacon sausage': 9268, 'candy chocolate': 9200, 'canned jarred vegetables': 9079, 'spreads': 8770, 'packaged produce': 8450, 'canned meals beans': 8294, 'dry pasta': 8031, 'butter': 7744, 'tea': 7712, 'breakfast bakery': 7693, 'oils vinegars': 7542, 'paper goods': 7228

In [8]:
# Check the results for department names 
print(department_name_counts)

Counter({'produce': 288775, 'dairy eggs': 164468, 'snacks': 88093, 'beverages': 81923, 'frozen': 68202, 'pantry': 56681, 'bakery': 35806, 'canned goods': 32486, 'deli': 32008, 'dry goods pasta': 26096, 'household': 22445, 'meat seafood': 21588, 'breakfast': 21585, 'personal care': 13773, 'babies': 12872, 'international': 8248, 'alcohol': 4580, 'pets': 3157, 'missing': 2289, 'other': 1097, 'bulk': 1087})


## Export the Results

In [10]:
# Create dataframes for each list of values 
product_df = pd.DataFrame(list(product_name_counts.items()), columns=['Product Name', 'Product Name Count'])
aisle_df = pd.DataFrame(list(aisle_name_counts.items()), columns=['Aisle Name', 'Aisle Name Count'])
department_df = pd.DataFrame(list(department_name_counts.items()), columns=['Department Name', 'Department Count'])

# Preview the results 
print(aisle_df)

                     Aisle Name  Aisle Name Count
0                        yogurt             43857
1          other creams cheeses              9374
2              fresh vegetables            103891
3           canned meat seafood              2131
4                  fresh fruits            110888
..                          ...               ...
129                   first aid               299
130                 white wines               916
131                eye ear care               278
132                   skin care               315
133  specialty wines champagnes               347

[134 rows x 2 columns]


In [11]:
# Combine counts by putting them into a list
combined_dfs = [product_df, aisle_df, department_df]

# Preview results
print(combined_dfs)

[                                        Product Name  Product Name Count
0                                   Bulgarian Yogurt                   5
1      Organic 4% Milk Fat Whole Milk Cottage Cheese                 134
2                              Organic Celery Hearts                 775
3                                     Cucumber Kirby                2905
4               Lightly Smoked Sardines in Olive Oil                  14
...                                              ...                 ...
35065                           Special Cocoa Spread                   1
35066                     Small Batch 9 Year Bourbon                   1
35067                       Sliced Colby Jack Cheese                   1
35068                                Peppermint Bark                   1
35069              Healthy Trinity, 3 in 1, Capsules                   1

[35070 rows x 2 columns],                      Aisle Name  Aisle Name Count
0                        yogurt             43

In [12]:
# Turn list into dataframe
combined_df = pd.concat(combined_dfs)

print(combined_df)

                                     Product Name  Product Name Count  \
0                                Bulgarian Yogurt                 5.0   
1   Organic 4% Milk Fat Whole Milk Cottage Cheese               134.0   
2                           Organic Celery Hearts               775.0   
3                                  Cucumber Kirby              2905.0   
4            Lightly Smoked Sardines in Olive Oil                14.0   
..                                            ...                 ...   
16                                            NaN                 NaN   
17                                            NaN                 NaN   
18                                            NaN                 NaN   
19                                            NaN                 NaN   
20                                            NaN                 NaN   

   Aisle Name  Aisle Name Count Department Name  Department Count  
0         NaN               NaN             NaN        

In [13]:
# Write data to csv file 
combined_df.to_csv("combined_counts.csv", encoding='utf8', index=False)