In [1]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [2]:
pip install apyori

Collecting apyori
  Downloading apyori-1.1.2.tar.gz (8.6 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: apyori
  Building wheel for apyori (setup.py) ... [?25l[?25hdone
  Created wheel for apyori: filename=apyori-1.1.2-py3-none-any.whl size=5953 sha256=d295ee85265d34f0e6344ca5bde9849a80d1620061eedd526744fad34d34d7ba
  Stored in directory: /root/.cache/pip/wheels/c4/1a/79/20f55c470a50bb3702a8cb7c94d8ada15573538c7f4baebe2d
Successfully built apyori
Installing collected packages: apyori
Successfully installed apyori-1.1.2


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from apyori import apriori
import networkx as nx

In [4]:
df = pd.read_csv("/content/groceries.csv", header=None)
df = df.drop(0, axis=1)
df = df.drop(0)
df

FileNotFoundError: [Errno 2] No such file or directory: '/content/groceries.csv'

In [None]:
# Check number of rows and columns
df.shape

In [None]:
df.size

In [None]:
# Item frequency analysis
item_counts = df.apply(pd.Series.value_counts).sum(axis=1).sort_values(ascending=False)
top_items = item_counts.head(10)
plt.bar(top_items.index, top_items.values)
plt.xlabel('Item')
plt.ylabel('Frequency')
plt.title('Top 10 Most Frequent Items')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Transaction analysis
transaction_lengths = df.notnull().sum(axis=1)
plt.hist(transaction_lengths,bins=sum(range(1, max(transaction_lengths) + 1)))
plt.xlabel('Transaction Length')
plt.ylabel('Frequency')
plt.title('Distribution of Transaction Lengths')
plt.show()

In [None]:
transactions = []
for i in range(9834):
  transactions.append([str(df.values[i,j]) for j in range(31)])
print(transactions)

In [None]:
rules = apriori(transactions = transactions, min_support = 0.003, min_confidence = 0.2, min_lift = 3, min_length = 2, max_length = 2)

In [None]:
# Display the results coming directly from the output of the apriori function
results = list(rules)
results[:3]

In [None]:
# Put the results well organised into a Pandas DataFrame
def inspect(results):
    lhs         = [tuple(result[2][0][0])[0] for result in results]
    rhs         = [tuple(result[2][0][1])[0] for result in results]
    supports    = [result[1] for result in results]
    confidences = [result[2][0][2] for result in results]
    lifts       = [result[2][0][3] for result in results]
    return list(zip(lhs, rhs, supports, confidences, lifts))

resultsinDataFrame = pd.DataFrame(inspect(results), columns = ['Product 1', 'Product 2', 'Support', 'Confidence', 'Lift'])

In [None]:
# Display the results non sorted
resultsinDataFrame

In [None]:
# Display the results sorted by descending lifts
resultsinDataFrame.nlargest(n = 13, columns = 'Lift')

In [None]:
# Plot support vs confidence
plt.scatter(resultsinDataFrame['Support'], resultsinDataFrame['Confidence'], alpha=0.5)
plt.xlabel('Support')
plt.ylabel('Confidence')
plt.title('Support vs Confidence')
plt.show()

In [None]:
# Plot support vs lift
plt.scatter(resultsinDataFrame['Support'], resultsinDataFrame['Lift'], alpha=0.5)
plt.xlabel('Support')
plt.ylabel('Lift')
plt.title('Support vs Lift')
plt.show()

In [None]:
# Plot lift vs confidence
plt.scatter(resultsinDataFrame['Lift'], resultsinDataFrame['Confidence'], alpha=0.5)
plt.xlabel('Lift')
plt.ylabel('Confidence')
plt.title('Lift vs Confidence')
plt.show()

In [None]:
# Lift chart
products_1 = resultsinDataFrame["Product 1"].tolist()
products_2 = resultsinDataFrame["Product 2"].tolist()
lifts = resultsinDataFrame["Lift"].tolist()

rules_labels = [f"{product1} -> {product2}" for product1, product2 in zip(products_1, products_2)]

plt.bar(rules_labels, lifts)
plt.xlabel("Rules (If-Then)")
plt.ylabel("Lift")
plt.title("Lift Chart for Frequent Itemset Rules (min_lift=3)")
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

In [None]:
# prompt: create dataframe from 2 columns
lc = pd.DataFrame({'Lift': lifts, 'Combinations': rules_labels})

In [None]:
# prompt: sort lc order by lift

lc.sort_values(by=['Lift'], inplace=True, ascending=False)
lc


In [None]:
# prompt: create bar graph using lc

import matplotlib.pyplot as plt

plt.figure(figsize=(12, 8))
plt.barh(lc['Combinations'], lc['Lift'], color='lightblue')
plt.xlabel('Lift', fontsize=16)
plt.ylabel('Rules', fontsize=16)
plt.title('Bar Graph of Lift Values for Frequent Itemset Rules', fontsize=16)
plt.tight_layout()
plt.show()


In [None]:
# Association rules network graph
G = nx.Graph()

for product in resultsinDataFrame['Product 1']:
    G.add_node(product)
for product in resultsinDataFrame['Product 2']:
    G.add_node(product)

for index, row in resultsinDataFrame.iterrows():
    G.add_edge(row['Product 1'], row['Product 2'], weight=row['Confidence'])
plt.figure(figsize=(15, 15))
pos = nx.spring_layout(G)
nx.draw(G, pos, with_labels=True, node_size=2000, node_color='skyblue', font_size=10)

edge_labels = nx.get_edge_attributes(G, 'weight')
nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)

plt.title('Association Rules Network Graph')
plt.show()

In [None]:
# prompt: how do i get the most sutaible product 2 if i enter product 1

def suitable_product_2(product_1):

  # Find all rules that have product 1 as the first product.
  rules_with_product_1 = resultsinDataFrame[resultsinDataFrame['Product 1'] == product_1]

  # Sort the rules by confidence in descending order.
  rules_with_product_1 = rules_with_product_1.sort_values(by='Confidence', ascending=False)

  # Return the product 2 of the first rule.
  return rules_with_product_1['Product 2'].iloc[0]

# Example usage
product_1 = input("Enter the name of the first product: ")
most_suitable_product_2 = suitable_product_2(product_1)

print(f"The most suitable product to be bought with {product_1} is: {most_suitable_product_2}")


In [None]:
# prompt: how do i get the most sutaible product 2 if i enter product 1

def suitable_product_2(product_1):

  # Find all rules that have product 1 as the first product.
  rules_with_product_1 = resultsinDataFrame[resultsinDataFrame['Product 1'] == product_1]

  # Sort the rules by confidence in descending order.
  rules_with_product_1 = rules_with_product_1.sort_values(by='Confidence', ascending=False)

  # Return the product 2 of the first rule.
  return rules_with_product_1[['Product 2','Lift']].iloc[0:len(rules_with_product_1)]

# Example usage
product_1 = input("Enter the name of the first product: ")
most_suitable_product_2 = suitable_product_2(product_1)

print(f"The most suitable product to be bought with {product_1} is/are: \n{most_suitable_product_2}")
