# Market Basket Analysis POC

### Installation of libraries

In [None]:
%pip install -q mlxtend networkx


In [None]:
import validmind as vm

# Initialise the validmind client for this notebook. The "..." values are
# placeholders as they appear in this file; presumably they must be replaced
# with real connection settings before running (avoid committing secrets).
vm.init(
    api_host="...",
    api_key="...",
    api_secret="...",
    model="...",
)

In [None]:
import pandas as pd

## Dataset Loader

In [None]:

# Load the Online Retail dataset from its CSV file into a pandas DataFrame
retail = pd.read_csv('./datasets/mba/Online Retail.csv')

def preprocess(retail):
    """Clean the raw retail frame and derive date/time columns.

    Rows containing any missing value are dropped, ``Description`` and
    ``Country`` are converted to categoricals, ``InvoiceDate`` is parsed to
    datetime, and ``InvoiceNo`` is coerced to a numeric column (values that
    cannot be parsed become NaN). Two columns are added: ``Date`` (the
    calendar date of ``InvoiceDate``) and ``TransTime`` (its time of day as
    an ``HH:MM:SS`` string).

    Args:
        retail: DataFrame with at least the columns ``InvoiceNo``,
            ``Description``, ``Country`` and ``InvoiceDate``.

    Returns:
        A new, cleaned DataFrame; the frame passed in is not modified.
    """
    # Work on an explicit copy so the column assignments below never write
    # into (or warn about) a view of the caller's frame.
    retail = retail.dropna().copy()

    retail['Description'] = retail['Description'].astype('category')
    retail['Country'] = retail['Country'].astype('category')

    # Parse the timestamp once and derive both the date and the time from it.
    invoice_ts = pd.to_datetime(retail['InvoiceDate'])
    retail['InvoiceDate'] = invoice_ts
    retail['Date'] = invoice_ts.dt.date
    retail['TransTime'] = invoice_ts.dt.strftime('%H:%M:%S')

    # Go through str first so mixed-type columns parse uniformly; anything
    # that is not a plain number is coerced to NaN instead of raising.
    retail['InvoiceNo'] = pd.to_numeric(retail['InvoiceNo'].astype(str), errors='coerce')
    return retail

# Clean the raw frame and add the derived Date / TransTime columns
retail = preprocess(retail)

# Display the first few rows of the DataFrame
print(retail.head())
# Display data types and non-null counts for each column.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
retail.info()

# Show summary statistics for numeric columns
print(retail.describe())

In [None]:
# Build one basket string per (InvoiceNo, Date) group by joining the
# descriptions of all its rows with ', '
basket_strings = (
    retail.groupby(['InvoiceNo', 'Date'])['Description']
    .apply(lambda descriptions: ', '.join(descriptions.astype(str)))
)

# Discard the grouping keys and expose the baskets as a single-column
# DataFrame named 'items'
transactionData = basket_strings.reset_index(drop=True).to_frame(name='items')

# Display the resulting DataFrame
print(transactionData)
# Write the baskets to CSV without the row index. quoting=0 is the csv
# module's minimal quoting: a field is quoted only when it needs to be
# (e.g. it contains the delimiter), so multi-item baskets are quoted.
transactionData.to_csv('./datasets/mba//market_basket_transactions.csv', index=False, quoting=0)

## Model

In [None]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder

# Read the transactions CSV. The file is written with a header row ('items'),
# so let pandas consume it as the header; reading with header=None would turn
# the header into a bogus one-item transaction.
data = pd.read_csv('./datasets/mba//market_basket_transactions.csv')

# Each row stores one basket as a single string of items joined with ', '.
# Split on the comma and strip surrounding whitespace so that the same product
# is not encoded as two different items ('X' vs ' X') depending on where it
# sits in the basket; empty fragments are dropped.
# NOTE(review): a description that itself contains a comma is still split into
# several items here - confirm whether the dataset has such descriptions.
transactions = [
    [item.strip() for item in basket.split(',') if item.strip()]
    for basket in data.iloc[:, 0].dropna().astype(str)
]

# Use TransactionEncoder from mlxtend to one-hot encode the list of
# transactions into a boolean item-by-transaction table
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
transaction_df = pd.DataFrame(te_ary, columns=te.columns_)


In [None]:
from mlxtend.frequent_patterns import apriori, association_rules

# Mine the itemsets whose support is at least 0.01, labelled by item name
frequent_itemsets = apriori(
    transaction_df,
    use_colnames=True,
    min_support=0.01,
)

# Derive association rules, keeping those with confidence of at least 0.5
rules = association_rules(
    frequent_itemsets,
    min_threshold=0.5,
    metric="confidence",
)

# Show the resulting rules table
print(rules)

In [None]:
from mlxtend.frequent_patterns import apriori
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Count how many transactions contain each item and keep the 20 largest counts
item_counts = transaction_df.sum(axis=0)
item_frequencies = item_counts.sort_values(ascending=False).iloc[:20]

# Horizontal bar chart: one bar per item, bar length = absolute frequency
plt.figure(figsize=(12, 8))
sns.barplot(
    x=item_frequencies.values,
    y=item_frequencies.index,
    palette="Pastel2",
)
plt.title("Absolute Item Frequency Plot for the Top 20 Items")
plt.xlabel("Frequency")
plt.ylabel("Items")
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Share of transactions that contain each item, restricted to the top 20
total_transactions = transaction_df.shape[0]
item_share = transaction_df.sum(axis=0) / total_transactions
item_frequencies = item_share.sort_values(ascending=False).iloc[:20]

# Horizontal bar chart: one bar per item, bar length = relative frequency
plt.figure(figsize=(12, 8))
sns.barplot(
    x=item_frequencies.values,
    y=item_frequencies.index,
    palette="pastel",
)
plt.title("Relative Item Frequency Plot for the Top 20 Items")
plt.xlabel("Relative Frequency")
plt.ylabel("Items")
plt.show()


In [None]:
from mlxtend.frequent_patterns import apriori, association_rules
import pandas as pd


# Re-mine the frequent itemsets, this time capped at three items per itemset
frequent_itemsets = apriori(
    transaction_df,
    min_support=0.01,
    max_len=3,
    use_colnames=True,
)

# Keep only the rules with confidence of at least 0.8
rules = association_rules(
    frequent_itemsets,
    min_threshold=0.8,
    metric="confidence",
)

# Show the key columns of the first ten rules
summary_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
print(rules[summary_columns].head(10))

In [None]:
def is_subset(df, rule_index):
    """Tell whether the rule at ``rule_index`` is contained in another rule.

    A rule counts as contained when some *other* row of ``df`` has
    antecedents that include all of this rule's antecedents and consequents
    that include all of this rule's consequents.

    Args:
        df: DataFrame with set-valued ``antecedents`` and ``consequents``
            columns.
        rule_index: Index label of the rule to check.

    Returns:
        True if at least one other row contains the rule, otherwise False.
    """
    own_lhs = df.loc[rule_index, 'antecedents']
    own_rhs = df.loc[rule_index, 'consequents']
    candidates = zip(df.index, df['antecedents'], df['consequents'])
    return any(
        label != rule_index and own_lhs.issubset(lhs) and own_rhs.issubset(rhs)
        for label, lhs, rhs in candidates
    )


# 'rules' is the DataFrame produced by association_rules in the cell above.
# Flag every rule that is contained in some other rule.
rules['is_subset'] = [is_subset(rules, rule_label) for rule_label in rules.index]
# Keep only the rules that were not flagged
filtered_rules = rules.loc[~rules['is_subset']]
print(f"Number of non-redundant rules: {len(filtered_rules)}")


In [None]:
# Bare expression as the last statement of the cell, so the notebook displays
# the non-redundant rules table
filtered_rules

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# 'filtered_rules' holds the non-redundant association rules built earlier.
# Keep the rules whose confidence is greater than 0.4.
subRules = filtered_rules.loc[filtered_rules['confidence'] > 0.4]

# Use seaborn's white-grid look for the figure
sns.set(style="whitegrid")

# Scatter plot of confidence against support; lift drives both the marker
# size and the marker colour
plt.figure(figsize=(10, 6))
scatter = sns.scatterplot(
    data=subRules,
    x='support',
    y='confidence',
    hue='lift',
    size='lift',
    sizes=(50, 200),
    palette='viridis',
    legend='brief',
)

# Title and axis labels
plt.title('Association Rules - Confidence vs. Support')
plt.xlabel('Support')
plt.ylabel('Confidence')

# Legend with a title
plt.legend(title='Lift')

# Render the figure
plt.show()

In [None]:
# The ten rules with the highest confidence
top10subRules = subRules.sort_values(by='confidence', ascending=False).head(10)

import networkx as nx
import matplotlib.pyplot as plt

# Create a directed graph
G = nx.DiGraph()

# Add one edge per rule, from its antecedents to its consequents, weighted by
# the rule's confidence. The items are sorted before joining: the rule sides
# are set-like, and set iteration order is not guaranteed to be the same from
# one interpreter run to the next, which would otherwise make multi-item node
# labels change between runs.
for _, rule in top10subRules.iterrows():
    antecedents = ', '.join(sorted(rule['antecedents']))
    consequents = ', '.join(sorted(rule['consequents']))
    G.add_edge(antecedents, consequents, weight=rule['confidence'])

# Position nodes using the spring layout
pos = nx.spring_layout(G, k=0.5, iterations=20)

plt.figure(figsize=(12, 8))

# Draw the graph
nx.draw(G, pos, with_labels=True, node_color='lightblue', node_size=2500, edge_color='gray', linewidths=1, font_size=10)

# Label each edge with the confidence of its rule, to two decimals
edge_labels = {(u, v): f"{d['weight']:.2f}" for u, v, d in G.edges(data=True)}
nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_color='red')

# Display the plot
plt.title('Top 10 Association Rules Graph')
plt.show()
