In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import t
from scipy import stats
from apyori import apriori

In [2]:
pd.set_option('display.max_rows', 200)

In [3]:
df = pd.read_csv('mma_mart.csv')

### Seperate refrigerated beerages

In [4]:
df.loc[df['aisle'] == 'refrigerated', 'department'] = 'beverages_refrigerated'
df.loc[df['department'] == 'beverages', 'department'] = 'beverages_non_refrigerated'

### Remove 'missing' department and 'missing' aisle

In [5]:
df = df[df['department'] != 'missing']

### Mark products as refrigerated or frozen

In [6]:
df['frozen'] = df['department'] == 'frozen'
dept_refrigerated = ['dairy eggs', 'deli', 'meat seafood', 'beverages_refrigerated']
df['refrigerated'] = df['department'].isin(dept_refrigerated)

### Largest Remainder Method

In [7]:
def largest_remainder_method(decimal_series):
    total_percentage = 100
    rounded_percentages = [round(decimal * total_percentage) for decimal in decimal_series]

    # Calculate the total after rounding
    rounded_total = sum(rounded_percentages)

    # Calculate the remainder for each percentage
    remainders = [(i, rounded_percentage % 1) for i, rounded_percentage in enumerate(rounded_percentages)]

    # Sort the remainders in descending order
    remainders.sort(key=lambda x: x[1], reverse=True)

    # Distribute the remaining percentage to the largest remainders
    remaining_percentage = total_percentage - rounded_total
    for i in range(remaining_percentage):
        index = remainders[i % len(remainders)][0]
        rounded_percentages[index] += 1

    return rounded_percentages

### Create Refrigerated products dataframe

In [8]:
df_refrige = df[df['refrigerated'] == True]

In [9]:
refridge_dept_counts = df_refrige.groupby('department').agg(
    products_sold=pd.NamedAgg(column='product_name', aggfunc='nunique'),
    total_items_sold=pd.NamedAgg(column='product_name', aggfunc='size')
).reset_index()

refridge_dept_counts['store_util'] = refridge_dept_counts['products_sold'] / refridge_dept_counts['products_sold'].sum()
refridge_dept_counts['dept_dominance'] = refridge_dept_counts['total_items_sold'] / refridge_dept_counts['total_items_sold'].sum()
refridge_dept_counts['aisle_allocation'] = largest_remainder_method(((refridge_dept_counts['store_util']+refridge_dept_counts['dept_dominance'] ) / 2))
#refridge_dept_counts.loc[refridge_dept_counts['department'] == 'beverages', 'department'] = 'beverages_refrigerated'

In [10]:
refridge_dept_counts

Unnamed: 0,department,products_sold,total_items_sold,store_util,dept_dominance,aisle_allocation
0,beverages_refrigerated,544,17663,0.104797,0.07493,9
1,dairy eggs,2886,164468,0.555962,0.697705,63
2,deli,1069,32008,0.205933,0.135784,17
3,meat seafood,692,21588,0.133308,0.091581,11


In [11]:
# Create a dictionary from the two columns
dept_allocation = dict(zip(refridge_dept_counts['department'], refridge_dept_counts['aisle_allocation']))

### Create General products dataframe

In [12]:
df_general = df[(df['refrigerated'] == False) & (df['frozen'] == False)]

In [13]:
general_dept_counts = df_general.groupby('department').agg(
    products_sold=pd.NamedAgg(column='product_name', aggfunc='nunique'),
    total_items_sold=pd.NamedAgg(column='product_name', aggfunc='size')
).reset_index()

general_dept_counts['store_util'] = general_dept_counts['products_sold'] / general_dept_counts['products_sold'].sum()
general_dept_counts['dept_dominance'] = general_dept_counts['total_items_sold'] / general_dept_counts['total_items_sold'].sum()
general_dept_counts['aisle_allocation'] = largest_remainder_method((general_dept_counts['store_util']+general_dept_counts['dept_dominance'] ) / 2)
general_dept_counts['aisle_allocation'] = general_dept_counts['aisle_allocation'] * 8
#general_dept_counts.loc[general_dept_counts['department'] == 'beverages', 'department'] = 'beverages_non_refrigerated'

In [14]:
general_dept_counts

Unnamed: 0,department,products_sold,total_items_sold,store_util,dept_dominance,aisle_allocation
0,alcohol,606,4580,0.0231,0.006725,16
1,babies,799,12872,0.030457,0.0189,16
2,bakery,1204,35806,0.045895,0.052575,40
3,beverages_non_refrigerated,2709,64260,0.103263,0.094356,80
4,breakfast,905,21585,0.034497,0.031694,24
5,bulk,33,1087,0.001258,0.001596,0
6,canned goods,1557,32486,0.05935,0.047701,40
7,dry goods pasta,1366,26096,0.05207,0.038318,40
8,household,2028,22445,0.077304,0.032957,48
9,international,772,8248,0.029427,0.012111,16


In [15]:
# Create a new dictionary from the DataFrame
temp_dict = dict(zip(general_dept_counts['department'], general_dept_counts['aisle_allocation']))
# Add the new dictionary to the existing dictionary
dept_allocation.update(temp_dict)
dept_allocation['frozen'] = 100

In [16]:
dept_allocation

{'beverages_refrigerated': 9,
 'dairy eggs': 63,
 'deli': 17,
 'meat seafood': 11,
 'alcohol': 16,
 'babies': 16,
 'bakery': 40,
 'beverages_non_refrigerated': 80,
 'breakfast': 24,
 'bulk': 0,
 'canned goods': 40,
 'dry goods pasta': 40,
 'household': 48,
 'international': 16,
 'other': 8,
 'pantry': 88,
 'personal care': 64,
 'pets': 8,
 'produce': 192,
 'snacks': 120,
 'frozen': 100}

### Check correlation between store_util and dept_dominance


In [17]:
# Calculate the correlation coefficient between the two variables
corr_coef = department_counts['store_util'].corr(department_counts['dept_dominance'])

print(f'Correlation coefficient between store_util and dept_dominance: {corr_coef}')

NameError: name 'department_counts' is not defined

Weak correlation between the number of products within each department and the total number of sales between each department. This means having more products in your department does not lead to more department sales.

### Confidence Interval of total_quantity_sold for each department

In [19]:
department_stats = {}

for department, data in top_1000.groupby('department'):
    mean_quantity = data['total_quantity_sold'].mean()
    std_quantity = data['total_quantity_sold'].std()
    sem_quantity = stats.sem(data['total_quantity_sold'])
    confidence_interval = [round(val, 2) for val in stats.t.interval(0.95, len(data)-1, loc=mean_quantity, scale=sem_quantity)]
    
    department_stats[department] = {
        'mean_quantity': round(mean_quantity, 2),
        'std_quantity': round(std_quantity, 2),
        'confidence_interval': confidence_interval
    }

# Display the results
for department, info in department_stats.items():
    print(f"Department: {department}")
    print(f"Mean Quantity Sold: {info['mean_quantity']}")
    print(f"Standard Deviation: {info['std_quantity']}")
    print(f"Confidence Interval: {info['confidence_interval']}")
    print()

NameError: name 'top_1000' is not defined

### Top 1000 items sold

In [20]:
# Group by 'product_name' and sum rows for sales count
df_quantity_sold = df.groupby('product_name').size().reset_index(name='total_quantity_sold')

# Merge with df to carryover product details
df_quantity_sold = pd.merge(df_quantity_sold, df.drop_duplicates('product_name'), on='product_name')
df_quantity_sold = df_quantity_sold.sort_values(by='total_quantity_sold', ascending=False)
# Find top 1000 products
top_1000 = df_quantity_sold.head(1000)

In [21]:
top_1000[top_1000['department'] == 'beverages_refrigerated']

Unnamed: 0,product_name,total_quantity_sold,order_id,product_id,aisle_id,aisle,department_id,department,frozen,refrigerated
301,100% Raw Coconut Water,1142,190,3957,31,refrigerated,7,beverages_refrigerated,False,True
23231,Original Orange Juice,752,4,25146,31,refrigerated,7,beverages_refrigerated,False,True
21845,Organic Raw Kombucha Gingerade,657,98,30776,31,refrigerated,7,beverages_refrigerated,False,True
15737,Lemonade,608,13,41290,31,refrigerated,7,beverages_refrigerated,False,True
23220,Original No Pulp 100% Florida Orange Juice,577,321,31683,31,refrigerated,7,beverages_refrigerated,False,True
31816,Trilogy Kombucha Drink,507,301,45603,31,refrigerated,7,beverages_refrigerated,False,True
19472,Orange Juice,506,7,34050,31,refrigerated,7,beverages_refrigerated,False,True
25557,Pulp Free Orange Juice,310,72,39108,31,refrigerated,7,beverages_refrigerated,False,True
18790,No Pulp Calcium & Vitamin D Pure Orange Juice,290,459,6631,31,refrigerated,7,beverages_refrigerated,False,True
30731,Synergy Organic Kombucha Gingerberry,288,30,48559,31,refrigerated,7,beverages_refrigerated,False,True


### Find department breakdown of top_1000 products

In [22]:
top_1000_department_counts = top_1000.groupby('department').agg(
    products_sold=pd.NamedAgg(column='product_name', aggfunc='nunique'),
).reset_index()

In [23]:
top_1000_department_counts['dept_top_1000_dominance'] = top_1000_department_counts['products_sold'] / top_1000_department_counts['products_sold'].sum()

In [24]:
top_1000_department_counts

Unnamed: 0,department,products_sold,dept_top_1000_dominance
0,alcohol,4,0.004
1,babies,8,0.008
2,bakery,42,0.042
3,beverages_non_refrigerated,69,0.069
4,beverages_refrigerated,19,0.019
5,breakfast,16,0.016
6,bulk,2,0.002
7,canned goods,38,0.038
8,dairy eggs,229,0.229
9,deli,35,0.035


### Find refridgerated breakdown of top_1000 products

In [25]:
top_1000_refrigerated = top_1000[top_1000['refrigerated'] == True]

print(f'Number of refrigerated products in the top_1000: {top_1000_refrigerated.shape[0]}')

Number of refrigerated products in the top_1000: 307


In [26]:
# Dateframe of the top 100 refrigerated products
top_100_refrigerated = top_1000_refrigerated.head(100)

### Picking Top 1000

In [27]:
selected_rows = []

# Iterate through the dictionary and DataFrame to pick items
for department, num_items in dept_allocation.items():
    department_rows = df_quantity_sold[df_quantity_sold['department'] == department].head(num_items)
    selected_rows.append(department_rows)

# Concatenate the selected rows into a new DataFrame
picked_df = pd.concat(selected_rows)

In [28]:
picked_df.head(10)

Unnamed: 0,product_name,total_quantity_sold,order_id,product_id,aisle_id,aisle,department_id,department,frozen,refrigerated
301,100% Raw Coconut Water,1142,190,3957,31,refrigerated,7,beverages_refrigerated,False,True
23231,Original Orange Juice,752,4,25146,31,refrigerated,7,beverages_refrigerated,False,True
21845,Organic Raw Kombucha Gingerade,657,98,30776,31,refrigerated,7,beverages_refrigerated,False,True
15737,Lemonade,608,13,41290,31,refrigerated,7,beverages_refrigerated,False,True
23220,Original No Pulp 100% Florida Orange Juice,577,321,31683,31,refrigerated,7,beverages_refrigerated,False,True
31816,Trilogy Kombucha Drink,507,301,45603,31,refrigerated,7,beverages_refrigerated,False,True
19472,Orange Juice,506,7,34050,31,refrigerated,7,beverages_refrigerated,False,True
25557,Pulp Free Orange Juice,310,72,39108,31,refrigerated,7,beverages_refrigerated,False,True
18790,No Pulp Calcium & Vitamin D Pure Orange Juice,290,459,6631,31,refrigerated,7,beverages_refrigerated,False,True
22719,Organic Whole Milk,4089,14,27845,84,milk,16,dairy eggs,False,True


In [29]:
#picked_df.to_csv('items_1000.csv', index=False)

In [30]:
# Filter rows where 'column1' contains the word 'garlic'
filtered_df = picked_df[picked_df['product_name'].str.contains('apple', case=False)]

filtered_df

Unnamed: 0,product_name,total_quantity_sold,order_id,product_id,aisle_id,aisle,department_id,department,frozen,refrigerated
3806,Broccoli & Apple Stage 2 Baby Food,196,902,3020,92,baby food formula,18,babies,False,False
29686,Stage 1 Apples Sweet Potatoes Pumpkin & Bluebe...,184,1376,5491,92,baby food formula,18,babies,False,False
1837,Apple and Carrot Stage 2 Baby Food,128,163,47888,92,baby food formula,18,babies,False,False
19708,Organic Apple Juice,335,529,18811,98,juice nectars,7,beverages_non_refrigerated,False,False
19710,Organic Apple Juice Boxes,184,1521,13113,98,juice nectars,7,beverages_non_refrigerated,False,False
259,100% Pure Apple Juice,175,254,45840,98,juice nectars,7,beverages_non_refrigerated,False,False
83,100% Apple Juice,162,480,35199,98,juice nectars,7,beverages_non_refrigerated,False,False
24536,Pink Lady Apple Kombucha,159,657,4462,94,tea,7,beverages_non_refrigerated,False,False
1764,Apple Cinnamon Instant Oatmeal,173,123,37464,130,hot cereal pancake mixes,14,breakfast,False,False
19720,Organic AppleApple,491,105,26790,99,canned fruit applesauce,15,canned goods,False,False
