# Sample transactions_clean dataset

In [None]:
from google.cloud import bigquery
from google.oauth2 import service_account

## Build service-account credentials from the JSON key file.
## The key is referenced by an absolute path under /content.
credentials = service_account.Credentials.from_service_account_file(
    '/content/shu88-isom676-srvacct_srvacct.json'
)

## BigQuery client authenticated with those credentials; used by every query below
client = bigquery.Client(credentials=credentials)

In [None]:
# Draw a random sample of 50,000 distinct customers and pull every
# transaction those customers have in transactions_clean.
QUERY = """
SELECT *
FROM `machine_learning.transactions_clean`
WHERE cust_id IN (
  SELECT cust_id
  FROM (
    SELECT DISTINCT cust_id
    FROM `machine_learning.transactions_clean`
  ) AS UniqueCustomers
  ORDER BY RAND()
  LIMIT 50000
)
"""

# Submit the query (API request) ...
query_job = client.query(QUERY)
# ... and materialise the result set as a pandas DataFrame
sample_transactions = query_job.to_dataframe()


# General EDA & Distribution

In [None]:
# Build a table with, for every number of distinct subcategories purchased,
# how many customers bought exactly that many subcategories.
QUERY = """
-- Step 1: Aggregate the number of customers per subcategory count
SELECT
  subcategory_count,
  COUNT(cust_id) AS customer_count
FROM (
  -- Step 2: Count the number of unique subcategories each customer has purchased
  SELECT
    t.cust_id,
    COUNT(DISTINCT p.prod_subcategory) AS subcategory_count
  FROM
    `machine_learning.transactions_clean` AS t
  JOIN
    `machine_learning.products` AS p
  ON
    t.prod_id = p.prod_id
  GROUP BY
    t.cust_id
)
GROUP BY
  subcategory_count
ORDER BY
  subcategory_count;
"""

# Run the aggregation in BigQuery (API request) and load it into pandas
query_job = client.query(QUERY)
distribution = query_job.to_dataframe()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram (with KDE overlay) of the customer_count column
sns.set(style="whitegrid")

customer_counts = distribution['customer_count']
ax = sns.histplot(customer_counts, bins=15, kde=True)
ax.set_title('Distribution of Customer Count')

# Clip both axes so they start at 1 instead of 0
ax.set_xlim(left=1)
ax.set_ylim(bottom=1)

plt.show()

In [None]:
# Recompute the subcategory-count distribution on the sampled transactions.
# `p_table` is the raw products table; it is loaded here because no earlier
# cell defines it, and the sample lives in `sample_transactions`.
p_table = client.query("SELECT * FROM `machine_learning.products`").to_dataframe()

# Inner join on 'prod_id'; overlapping column names get _trans / _prod suffixes
merged_data = pd.merge(sample_transactions, p_table, on='prod_id', suffixes=('_trans', '_prod'))

# Step 1: number of distinct subcategories each customer has purchased
cust_subcategory_count = (
    merged_data.groupby('cust_id')['prod_subcategory']
    .nunique()
    .reset_index(name='subcategory_count')
)

# Step 2: number of customers for each subcategory count
customer_count_per_subcategory = (
    cust_subcategory_count.groupby('subcategory_count')['cust_id']
    .count()
    .reset_index(name='customer_count')
)

# Order by subcategory_count
customer_count_per_subcategory.sort_values(by='subcategory_count', inplace=True)

# Histogram (with KDE overlay) of the customer_count column
sns.set(style="whitegrid")
ax = sns.histplot(customer_count_per_subcategory['customer_count'], bins=15, kde=True)
plt.title('Distribution of Customer Count by Sub-category')

# Clip both axes so they start at 1 instead of 0
ax.set_xlim(left=1)
ax.set_ylim(bottom=1)

plt.show()

# Data Preprocessing

In [None]:
# Pull the complete products table
QUERY = """
SELECT *
FROM `machine_learning.products`
"""

# API request, then load the rows into a pandas DataFrame
query_job = client.query(QUERY)
products = query_job.to_dataframe()


In [None]:
# Product IDs to exclude from the analysis
drop_prod_ids = [20640707002, 20640707004, 20313716, 20318643]

# Drop the excluded products. `.copy()` makes `products` an independent frame,
# so the in-place `.loc` updates below do not write into a slice of the
# original table (which would trigger pandas' SettingWithCopyWarning).
products = products[~products['prod_id'].isin(drop_prod_ids)].copy()

# Rows whose unit of measure is KG or L. The masks are computed before any
# relabelling so they can be reused for the value conversion.
mask_kg = products['prod_count_uom'] == 'KG'
mask_l = products['prod_count_uom'] == 'L'

# Relabel KG -> G and L -> ml ...
products.loc[mask_kg, 'prod_count_uom'] = 'G'
products.loc[mask_l, 'prod_count_uom'] = 'ml'

# ... and scale the measured value by 1000 to match the new unit
products.loc[mask_kg, 'prod_uom_value'] *= 1000
products.loc[mask_l, 'prod_uom_value'] *= 1000


In [None]:
# Left-join the sampled transactions with the cleaned products table so every
# transaction keeps its row even when the product was dropped during cleaning.
# The sample is stored in `sample_transactions` (the name assigned by the
# sampling query).
merged_data = pd.merge(sample_transactions, products, on="prod_id", how="left")

# Customers with at least one purchase of a product whose brand code is 'GENM'
is_genm = merged_data['prod_mfc_brand_cd'] == 'GENM'
genm_customers = merged_data.loc[is_genm, 'cust_id'].unique()

In [None]:
# Customers who bought 'ACSE'-branded products in any of these subcategories
# are excluded from the target group.
acse_subcategories = ['Breakfast', 'Halloween', 'Cereal Rte', 'Coating Mixes', 'Crackers/Health Cake', 'Nutritional Portable']
acse_transactions = merged_data[(merged_data['prod_mfc_brand_cd'] == 'ACSE') & (merged_data['prod_subcategory'].isin(acse_subcategories))]
acse_customers = acse_transactions['cust_id'].unique()

# Target customers = GENM customers that are not ACSE customers.
# Membership is tested against a set: `x not in ndarray` rescans the whole
# array for every candidate, which is quadratic on large samples.
acse_customer_set = set(acse_customers)
target_customers = [customer for customer in genm_customers if customer not in acse_customer_set]

# Keep only the target customers' rows from the joined table
filtered_transactions = merged_data[merged_data['cust_id'].isin(target_customers)]

In [None]:
# Time-based train/test split of the target customers' transactions.
# The sample is stored in `sample_transactions` (the name assigned by the
# sampling query); make sure its date column is a real datetime first.
sample_transactions['trans_dt'] = pd.to_datetime(sample_transactions['trans_dt'])

# Train window: 2019-01-01 .. 2019-09-30, test window: 2019-10-01 .. 2019-12-31
start_date_train = pd.Timestamp('2019-01-01')
end_date_train = pd.Timestamp('2019-09-30')
start_date_test = pd.Timestamp('2019-10-01')
end_date_test = pd.Timestamp('2019-12-31')

# Filter transactions for target customers
target_transactions = sample_transactions[sample_transactions['cust_id'].isin(target_customers)]

# Split by date; `between` is inclusive on both ends, like the >= / <= pair
train = target_transactions[target_transactions['trans_dt'].between(start_date_train, end_date_train)]
test = target_transactions[target_transactions['trans_dt'].between(start_date_test, end_date_test)]

# Customers present in both windows. A set gives O(1) membership checks;
# testing `in` against an ndarray rescans it for every customer.
train_customers = train['cust_id'].unique()
test_customers = test['cust_id'].unique()
test_customer_set = set(test_customers)
common_customers = [customer for customer in train_customers if customer in test_customer_set]

# Keep only customers that appear in both train and test
train = train[train['cust_id'].isin(common_customers)]
test = test[test['cust_id'].isin(common_customers)]

In [None]:
# Attach product attributes to the training transactions (left join keeps
# every transaction even if its product is missing from `products`)
train_with_products = train.merge(products, on="prod_id", how="left")
df_train = pd.DataFrame(train_with_products)

# Same enrichment for the test transactions
test_with_products = test.merge(products, on="prod_id", how="left")
df_test = pd.DataFrame(test_with_products)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Derive month and day-of-week columns from the transaction date so the
# models can pick up seasonality; applied to train first, then test.
for frame in (df_train, df_test):
    frame['trans_dt'] = pd.to_datetime(frame['trans_dt'])
    frame['month'] = frame['trans_dt'].dt.month
    frame['dayofweek'] = frame['trans_dt'].dt.dayofweek

# Subcategories kept for modelling
top_subcategories = ['Cereal Rte', 'Nutritional Portable']

# Training rows are restricted to those subcategories; test rows are then
# restricted to customers that remain in the training set.
# NOTE(review): df_test is filtered by customer only, not by subcategory -
# confirm that this is intended.
in_top_subcategory = df_train['prod_subcategory'].isin(top_subcategories)
df_train = df_train[in_top_subcategory]
df_test = df_test[df_test['cust_id'].isin(df_train['cust_id'])]

# More EDA on Customers

In [None]:
# List the distinct product subcategories that contain 'GENM'-branded products.
# The string literal must start on the same line as `QUERY =`; a bare
# `QUERY =` followed by a newline is a SyntaxError.
QUERY = """
  SELECT distinct p.prod_subcategory
  FROM `machine_learning.products` p
  WHERE p.prod_mfc_brand_cd = 'GENM'
"""

# Execute the query
query_job = client.query(QUERY)  # API request

# Wait for the query to finish and load the result into a DataFrame
Genm_sub_cate = query_job.to_dataframe()

### EDA to Identify Full-Value Customers and Cherry Pickers

In [None]:
data = df_train.copy()

# Split the training transactions by subcategory
cereal_data = data[data['prod_subcategory'] == 'Cereal Rte']
nutritional_data = data[data['prod_subcategory'] == 'Nutritional Portable']

# Total spending per customer for each subcategory
cereal_spending = cereal_data.groupby('cust_id')['sales_amt'].sum()
nutritional_spending = nutritional_data.groupby('cust_id')['sales_amt'].sum()

# Variance of the per-customer totals, i.e. the spread across customers
cereal_variance = cereal_spending.var()
nutritional_variance = nutritional_spending.var()

# Display the variances
print('Variance in spending on Cereal Rte:', cereal_variance)
print('Variance in spending on Nutritional Portable:', nutritional_variance)

# One histogram per subcategory. The x axis carries the per-customer total
# spending and the y axis the number of customers in each bin, so the axis
# labels describe exactly that.
for spending, colour, label in (
    (cereal_spending, 'blue', 'Cereal Rte'),
    (nutritional_spending, 'green', 'Nutritional Portable'),
):
    sns.histplot(spending, kde=True, color=colour, label=label)
    plt.xlabel('Total Spending')
    plt.ylabel('Number of Customers')
    plt.title(f'Distribution of Customer Spending on {label}')
    plt.legend()
    plt.show()

### EDA to Identify Customers with Low-Purchasing Frequencies

In [None]:
# Number of purchase rows per customer for each subcategory
cereal_frequency = cereal_data.groupby('cust_id').size()
nutritional_frequency = nutritional_data.groupby('cust_id').size()

# Display the first few entries of the frequency data
print('Frequency data for Cereal Rte:')
print(cereal_frequency.head())
print('Frequency data for Nutritional Portable:')
print(nutritional_frequency.head())

# One histogram per subcategory: x = purchases per customer,
# y = number of customers in each bin.
for frequency, colour, label in (
    (cereal_frequency, 'blue', 'Cereal Rte'),
    (nutritional_frequency, 'green', 'Nutritional Portable'),
):
    sns.histplot(frequency, kde=True, color=colour, label=label)
    plt.xlabel('Purchase Frequency')
    plt.ylabel('Number of Customers')
    plt.title(f'Distribution of Purchase Frequency on {label}')
    plt.legend()
    plt.show()

### EDA to Identify Customers with Low Spending

In [None]:
# Total spending per customer for each subcategory
cereal_spending = cereal_data.groupby('cust_id')['sales_amt'].sum()
nutritional_spending = nutritional_data.groupby('cust_id')['sales_amt'].sum()

# Display the first few entries of the spending data
print('Total spending data for Cereal Rte:')
print(cereal_spending.head())
print('Total spending data for Nutritional Portable:')
print(nutritional_spending.head())

# One histogram per subcategory: x = total spending per customer,
# y = number of customers in each bin.
for spending, colour, label in (
    (cereal_spending, 'blue', 'Cereal Rte'),
    (nutritional_spending, 'green', 'Nutritional Portable'),
):
    sns.histplot(spending, kde=True, color=colour, label=label)
    plt.xlabel('Total Spending')
    plt.ylabel('Number of Customers')
    plt.title(f'Distribution of Total Spending on {label}')
    plt.legend()
    plt.show()

### EDA to Identify Customers with Low Average Spending

In [None]:
# Average spending per purchase row, per customer, for each subcategory.
# NOTE: the *_spending names are reused from the total-spending cell but hold
# means from here on.
cereal_spending = cereal_data.groupby('cust_id')['sales_amt'].mean()
nutritional_spending = nutritional_data.groupby('cust_id')['sales_amt'].mean()

# Display the first few entries of the average-spending data
print('Average spending data for Cereal Rte:')
print(cereal_spending.head())
print('Average spending data for Nutritional Portable:')
print(nutritional_spending.head())

# One histogram per subcategory: x = average spending per customer,
# y = number of customers in each bin.
for spending, colour, label in (
    (cereal_spending, 'blue', 'Cereal Rte'),
    (nutritional_spending, 'green', 'Nutritional Portable'),
):
    sns.histplot(spending, kde=True, color=colour, label=label)
    plt.xlabel('Average Spending')
    plt.ylabel('Number of Customers')
    plt.title(f'Distribution of Average Spending on {label}')
    plt.legend()
    plt.show()

# Feature Engineering

In [None]:
import pandas as pd

data = df_train.copy()

# Subcategories for which customer-level features are built
subcategories = ['Cereal Rte', 'Nutritional Portable']

# Per-subcategory Series indexed by cust_id
spending_sum_data = {}
frequency_data = {}
spending_variance_data = {}
average_spending_data = {}

# Total spending, purchase count, spending variance and average spending for
# each customer within each subcategory
for subcat in subcategories:
    subcat_data = data[data['prod_subcategory'] == subcat]
    grouped_data = subcat_data.groupby('cust_id')['sales_amt']
    spending_sum_data[subcat] = grouped_data.sum()
    frequency_data[subcat] = grouped_data.size()
    # A customer with a single purchase has undefined variance; treat it as 0
    spending_variance_data[subcat] = grouped_data.var().fillna(0)
    average_spending_data[subcat] = grouped_data.mean()

# 20th-percentile thresholds for total spending, frequency and variance
spending_sum_thresholds = {subcat: spending_sum_data[subcat].quantile(0.2) for subcat in subcategories}
frequency_thresholds = {subcat: frequency_data[subcat].quantile(0.2) for subcat in subcategories}
variance_thresholds = {subcat: spending_variance_data[subcat].quantile(0.2) for subcat in subcategories}

# Mean and standard deviation of the per-customer average spending
average_spending_mean = {subcat: average_spending_data[subcat].mean() for subcat in subcategories}
average_spending_std = {subcat: average_spending_data[subcat].std() for subcat in subcategories}

# Flag = 1 when the customer is above the 20th-percentile threshold in the
# subcategory, 0 otherwise (including customers with no purchase there).
# The qualifying customer ids are computed once per subcategory and matched
# with a vectorised `isin`, instead of a Python lambda per transaction row.
for subcat in subcategories:
    total_spend = spending_sum_data[subcat]
    purchase_count = frequency_data[subcat]
    high_spenders = total_spend[total_spend > spending_sum_thresholds[subcat]].index
    frequent_buyers = purchase_count[purchase_count > frequency_thresholds[subcat]].index
    data[f'{subcat}_high_spending_longtail'] = data['cust_id'].isin(high_spenders).astype('int64')
    data[f'{subcat}_high_frequency_longtail'] = data['cust_id'].isin(frequent_buyers).astype('int64')

# No_Full_value_customer = 0 only for customers whose spending variance is at
# or below the variance threshold AND whose average spending exceeds the mean
# by more than one standard deviation; every other row gets 1.
for subcat in subcategories:
    variance = spending_variance_data[subcat]
    average = average_spending_data[subcat]
    is_full_value = (variance <= variance_thresholds[subcat]) & (
        average > (average_spending_mean[subcat] + average_spending_std[subcat])
    )
    full_value_customers = is_full_value[is_full_value].index
    data[f'{subcat}_No_Full_value_customer'] = (~data['cust_id'].isin(full_value_customers)).astype('int64')

# Display the first few rows to verify the new features
print(data.head())

# Update the training dataset
df_train = data.copy()

# Naive Bayes Recommendation System Model

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Load the training dataset
NB_train = df_train.copy()

# Encode the subcategory as an integer. The encoder is instantiated here
# (LabelEncoder is imported in the preprocessing cell) and fitted on the
# categories of both train and test so that `transform` knows every value.
label_encoder = LabelEncoder()
all_categories = pd.concat([NB_train['prod_subcategory'], df_test['prod_subcategory']]).unique()
label_encoder.fit(all_categories)

NB_train['prod_subcategory_encoded'] = label_encoder.transform(NB_train['prod_subcategory'])
df_test['prod_subcategory_encoded'] = label_encoder.transform(df_test['prod_subcategory'])

# Train the model: features describe the transaction and customer, the label
# is the purchased product id.
# NOTE(review): 'cust_id' is fed to the model as a numeric feature - confirm
# that this is intended.
model = MultinomialNB()
features = ['cust_id', 'month','dayofweek','prod_subcategory_encoded', 'sales_amt', 'prod_uom_value', "Cereal Rte_high_spending_longtail", 
            "Cereal Rte_high_frequency_longtail", "Cereal Rte_No_Full_value_customer", "Nutritional Portable_high_spending_longtail", 
            "Nutritional Portable_high_frequency_longtail", "Nutritional Portable_No_Full_value_customer"]
X_train = NB_train[features]
y_train = NB_train['prod_id']
model.fit(X_train, y_train)

# Products with brand code 'KLGS' are the only recommendation candidates
kellogg_products = NB_train[NB_train['prod_mfc_brand_cd'] == "KLGS"]['prod_id'].unique()

# Class probabilities for every training row
train_probs = model.predict_proba(X_train)

# Columns of `train_probs` follow `model.classes_`; keep the KLGS ones only.
# A set keeps the membership test O(1) per class.
product_ids = model.classes_
kellogg_lookup = set(kellogg_products)
kellogg_indices = [i for i, prod_id in enumerate(product_ids) if prod_id in kellogg_lookup]

# Label the probability columns with the product ids in `classes_` order.
# `kellogg_products` is in order of first appearance, so using it as the
# column labels would attach probabilities to the wrong products.
kellogg_columns = product_ids[kellogg_indices]
kellogg_probs = pd.DataFrame(train_probs[:, kellogg_indices], index=NB_train.index, columns=kellogg_columns)

# Top-5 KLGS products for every training transaction
top_5_recommendations = kellogg_probs.apply(lambda x: x.nlargest(5).index.tolist(), axis=1)
NB_train['top_5_recommendations'] = top_5_recommendations

# One recommendation list per customer: average the KLGS probabilities over
# the customer's transactions and keep the five most likely products. This
# keeps `cust_id` unique, so joining the recommendations to other tables on
# `cust_id` does not multiply rows.
customer_probs = kellogg_probs.groupby(NB_train['cust_id']).mean()
customer_top_5 = customer_probs.apply(lambda probs: probs.nlargest(5).index.tolist(), axis=1)
NB_recommendations = customer_top_5.rename('top_5_recommendations').reset_index()


# Content-Based Recommendation System Model

In [None]:
# Start from a copy of the training data
CB_train = df_train.copy()

# Columns that are concatenated into one text "document" per transaction row
combined_features = ['cust_id', 'prod_subcategory', 'sales_amt', 'prod_uom_value', 
                     'Cereal Rte_high_spending_longtail', 'Cereal Rte_high_frequency_longtail', 
                     'Cereal Rte_No_Full_value_customer', 'Nutritional Portable_high_spending_longtail', 
                     'Nutritional Portable_high_frequency_longtail','Nutritional Portable_No_Full_value_customer']

# Blank out missing values, cast everything to text and join the columns of
# each row with single spaces. After this step CB_train is a Series of
# strings, one per training row.
CB_train = (
    CB_train[combined_features]
    .fillna('')
    .astype(str)
    .apply(' '.join, axis=1)
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict

# Step 1: Vectorize the combined feature text (one document per training row)
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(CB_train)

# Step 2: Cosine similarity between every pair of rows
# NOTE(review): rows are transactions, not unique products, so this matrix is
# (n_rows x n_rows) and dense - confirm that this fits in memory.
product_similarity = cosine_similarity(tfidf_matrix)

# Step 3: Recommend products for each customer based on previous purchases.
# Row position -> product id. The ids are read from df_train because CB_train
# is a Series of joined strings at this point and has no 'prod_id' column;
# both share the same row order.
prod_index_to_id = df_train['prod_id'].tolist()
# Product id -> row position (the last row of that product wins)
prod_id_to_index = {pid: idx for idx, pid in enumerate(prod_index_to_id)}

# Recommendations collected per customer
recommendations = defaultdict(list)

# The five most similar rows depend only on the product id, so compute them
# once per product instead of re-sorting for every transaction.
top_similar_by_prod = {}
for cust_id, prod_id in zip(df_train['cust_id'], df_train['prod_id']):
    if prod_id not in top_similar_by_prod:
        scores = product_similarity[prod_id_to_index[prod_id]]
        # Descending, stable order; position 0 is skipped as the row itself
        top_rows = np.argsort(-scores, kind='stable')[1:6]
        top_similar_by_prod[prod_id] = [prod_index_to_id[i] for i in top_rows]
    recommendations[cust_id].extend(top_similar_by_prod[prod_id])

# De-duplicate (keeping first-seen order) and keep at most 5 per customer
for cust_id in recommendations:
    recommendations[cust_id] = list(dict.fromkeys(recommendations[cust_id]))[:5]

# Step 4: Long-format output, one (customer, recommended product) pair per row
CB_recommendations = pd.DataFrame([(cust, pid) for cust, prods in recommendations.items() for pid in prods], 
                         columns=['cust_id', 'top_5_recommendations'])


# Collaborative Filtering Recommendation System Model

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Work on a copy of the training table
CF_train = df_train.copy()

# Min-max scale quantity, amount and weight into new *_normalized columns
scaler = MinMaxScaler()
raw_cols = ['sales_qty', 'sales_amt', 'sales_wgt']
normalized_cols = [f'{name}_normalized' for name in raw_cols]
CF_train[normalized_cols] = scaler.fit_transform(CF_train[raw_cols])

# Show raw and scaled columns side by side as a sanity check
CF_train[raw_cols + normalized_cols].head()

In [None]:
# Engineered customer-level flags that feed the interaction score
feature_cols = [
    'Cereal Rte_high_spending_longtail',
    'Cereal Rte_high_frequency_longtail',
    'Nutritional Portable_high_spending_longtail',
    'Nutritional Portable_high_frequency_longtail',
    'Cereal Rte_No_Full_value_customer',
    'Nutritional Portable_No_Full_value_customer'
]

# Min-max scale the flags in place, using a fresh scaler
scaler = MinMaxScaler()
scaled_flags = scaler.fit_transform(CF_train[feature_cols])
CF_train[feature_cols] = scaled_flags

In [None]:
# Calculate Item-User Matrix
from scipy.sparse import csr_matrix

# Assign consecutive integer codes to customers and products, in order of
# first appearance in CF_train
cust_map = {cust_id: code for code, cust_id in enumerate(CF_train['cust_id'].unique())}
prod_map = {prod_id: code for code, prod_id in enumerate(CF_train['prod_id'].unique())}

# Matrix coordinates for every transaction: row = customer code, col = product code
row = CF_train['cust_id'].map(cust_map)
col = CF_train['prod_id'].map(prod_map)
# Normalized quantities; not used by the matrix built below
data = CF_train['sales_qty_normalized'].values

# Relative weight of each component of the interaction score
weights = {
    'sales_qty': 1,
    'sales_amt': 1,
    'sales_wgt': 0,
    'feature_weight': 1  # Weight for the new binary features
}

# Weighted sum of the normalized sales columns and the summed feature flags
weighted_total = (
    weights['sales_qty'] * CF_train['sales_qty_normalized'] +
    weights['sales_amt'] * CF_train['sales_amt_normalized'] +
    weights['sales_wgt'] * CF_train['sales_wgt_normalized'] +
    weights['feature_weight'] * CF_train[feature_cols].sum(axis=1)
)
# Divide by the number of flag columns plus two; the weights themselves are
# not part of the denominator
combined_interaction_score = weighted_total / (len(feature_cols) + 2)

# Customer x product interaction matrix built from the combined score
interaction_matrix_sparse = csr_matrix(
    (combined_interaction_score.values, (row, col)),
    shape=(len(cust_map), len(prod_map))
)

In [None]:
# Item-item Similarities
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

# Flip customer x product into product x customer so that rows are items
item_user_matrix_sparse = interaction_matrix_sparse.transpose()

# Pairwise cosine similarity between item rows, kept as a sparse matrix
item_similarity_matrix = cosine_similarity(item_user_matrix_sparse, dense_output=False)

In [None]:
# Rows of CF_train whose brand code is 'KLGS'
klgs_items = CF_train.loc[CF_train['prod_mfc_brand_cd'] == 'KLGS']

# Distinct product ids among those rows
klgs_item_ids = set(klgs_items['prod_id'])

# Translate the ids into the column codes assigned by `prod_map`
klgs_item_indices = {prod_map[pid] for pid in klgs_item_ids if pid in prod_map}

In [None]:
def make_recommendations_sparse(user_index, interaction_matrix_sparse, item_similarity_matrix, top_n, klgs_item_indices, index_to_prod_id=None):
    """Recommend up to ``top_n`` candidate items for one user via item-item similarity.

    For every item the user has interacted with, the positive similarity
    scores of all other items are accumulated. Only items whose index is in
    ``klgs_item_indices`` and that the user has not interacted with yet are
    scored. The highest-scoring items are returned, best first.

    Args:
        user_index: Row index of the user in ``interaction_matrix_sparse``.
        interaction_matrix_sparse: Sparse user x item matrix; a non-zero entry
            marks an interaction.
        item_similarity_matrix: Sparse item x item similarity matrix.
        top_n: Maximum number of recommendations to return.
        klgs_item_indices: Collection of item indices eligible for
            recommendation.
        index_to_prod_id: Optional mapping from item index to product id.
            When omitted it is derived from the module-level ``prod_map``
            (product id -> item index).

    Returns:
        List of product ids ordered by descending accumulated similarity.
    """
    if index_to_prod_id is None:
        # Invert prod_map once instead of scanning its values per recommendation
        index_to_prod_id = {idx: prod_id for prod_id, idx in prod_map.items()}

    # Step 1: items the user has interacted with; the set gives O(1) checks
    interacted_indices = interaction_matrix_sparse.getrow(user_index).nonzero()[1]
    interacted = set(interacted_indices.tolist())

    # Step 2: accumulate similarity scores of eligible, unseen items
    item_scores = {}
    for item_idx in interacted_indices:
        scores = item_similarity_matrix.getrow(item_idx).toarray().ravel()
        # Visit only positions with a positive score, in ascending index order
        for similar_idx in (scores > 0).nonzero()[0].tolist():
            if similar_idx in interacted or similar_idx not in klgs_item_indices:
                continue
            item_scores[similar_idx] = item_scores.get(similar_idx, 0) + scores[similar_idx]

    # Step 3: rank by accumulated score (stable for ties) and keep the top N
    ranked = sorted(item_scores.items(), key=lambda pair: pair[1], reverse=True)[:top_n]

    # Map item indices back to product ids
    return [index_to_prod_id[idx] for idx, _ in ranked]

In [None]:
# One entry per selected user: the top-5 item-based recommendations when the
# user has a row in the interaction matrix, otherwise an empty list.
# NOTE(review): `random_selected_users` is not defined in any visible cell -
# confirm where the user sample comes from.
recommendations = [
    {
        "cust_id": user_id,
        "recommended_items": (
            make_recommendations_sparse(
                cust_map[user_id],
                interaction_matrix_sparse,
                item_similarity_matrix,
                top_n=5,
                klgs_item_indices=klgs_item_indices,
            )
            if user_id in cust_map
            else []
        ),
    }
    for user_id in random_selected_users
]

In [None]:
# Expand the per-user recommendation lists into long format:
# one row per (customer, recommended product) pair.
recommendations_expanded = [
    {'cust_id': entry['cust_id'], 'top_5_recommendations': rec_item}
    for entry in recommendations
    for rec_item in entry['recommended_items']
]

# Name the columns explicitly so the frame still has 'cust_id' and
# 'top_5_recommendations' when no recommendation was produced; an empty list
# of dicts would otherwise yield a DataFrame without any columns.
CF_recommendations = pd.DataFrame(recommendations_expanded, columns=['cust_id', 'top_5_recommendations'])

# Model Evaluation

In [None]:
def _recommendations_by_customer(recs, top_n=5):
    """Collapse a recommendations table into ``{cust_id: [prod_id, ...]}``.

    Accepts both layouts produced in this notebook: one list of product ids
    per row, or long format with one product id per row. Items are
    de-duplicated in first-seen order and cut to ``top_n`` per customer.
    """
    by_customer = {}
    if recs.empty:
        return by_customer
    for cust_id, value in zip(recs['cust_id'], recs['top_5_recommendations']):
        bucket = by_customer.setdefault(cust_id, [])
        if isinstance(value, (list, tuple)):
            bucket.extend(value)
        elif pd.notna(value):
            bucket.append(value)
    return {cust_id: list(dict.fromkeys(items))[:top_n] for cust_id, items in by_customer.items()}


def _evaluate_recommendations(recs, test_rows, top_n=5):
    """Score one model's recommendations against the test transactions.

    Returns a tuple ``(scored_test, customer_accuracy, average_accuracy,
    hit_rate, customers_with_hits, total_customers, purchase_rate)``.
    Rates are NaN when their denominator is zero.
    """
    rec_lists = _recommendations_by_customer(recs, top_n)

    # Test transactions of customers that received recommendations, each row
    # carrying the customer's recommendation list (one list per customer, so
    # test rows are never duplicated).
    scored = test_rows[test_rows['cust_id'].isin(list(rec_lists))].copy().reset_index(drop=True)
    scored['top_5_recommendations'] = pd.Series(
        [rec_lists[cust_id] for cust_id in scored['cust_id']], index=scored.index, dtype=object
    )
    scored['is_purchased'] = pd.Series(
        [prod_id in rec for prod_id, rec in zip(scored['prod_id'], scored['top_5_recommendations'])],
        index=scored.index, dtype=bool,
    )
    scored['hit'] = scored['is_purchased'].astype(int)

    # Per-customer accuracy: distinct recommended products that were actually
    # purchased, divided by the list size. Repeat purchases of the same
    # product count once, so the value stays within [0, 1].
    first_hit = scored['is_purchased'] & ~scored.duplicated(['cust_id', 'prod_id'])
    customer_accuracy = first_hit.groupby(scored['cust_id']).sum().reset_index(name='purchases')
    customer_accuracy['accuracy'] = customer_accuracy['purchases'] / top_n
    average_accuracy = customer_accuracy['accuracy'].mean()

    # Share of test transactions whose product was recommended
    hit_rate = scored['hit'].sum() / len(scored) if len(scored) else float('nan')

    # Share of recommended-to customers who bought at least one recommendation
    customers_with_hits = scored.loc[scored['is_purchased'], 'cust_id'].nunique()
    total_customers = len(rec_lists)
    purchase_rate = customers_with_hits / total_customers if total_customers else float('nan')

    return (scored, customer_accuracy, average_accuracy, hit_rate,
            customers_with_hits, total_customers, purchase_rate)


# Load the testing table
test_df = df_test.copy()

# Evaluate each model with the same procedure; each model's metrics are
# computed from its own test rows only.
(NB_test, NB_customer_accuracy, NB_average_accuracy, NB_hit_rate,
 NB_customers_with_hits, NB_total_customers, NB_purchase_rate) = _evaluate_recommendations(NB_recommendations, test_df)
(CB_test, CB_customer_accuracy, CB_average_accuracy, CB_hit_rate,
 CB_customers_with_hits, CB_total_customers, CB_purchase_rate) = _evaluate_recommendations(CB_recommendations, test_df)
(CF_test, CF_customer_accuracy, CF_average_accuracy, CF_hit_rate,
 CF_customers_with_hits, CF_total_customers, CF_purchase_rate) = _evaluate_recommendations(CF_recommendations, test_df)

print(f"Naive Bayesian Average accuracy: {NB_average_accuracy}")
print(f"Content-Based Average accuracy: {CB_average_accuracy}")
print(f"Collaborative Filtering Average accuracy: {CF_average_accuracy}")

print('')
print(f"Naive Bayesian Transaction-based hit rate: {NB_hit_rate}")
print(f"Content-Based Transaction-based hit rate: {CB_hit_rate}")
print(f"Collaborative Filtering Transaction-based hit rate: {CF_hit_rate}")

print('')
print(f"Naive Bayesian Customer-based purchase rate: {NB_purchase_rate}")
print(f"Content-Based Customer-based purchase rate: {CB_purchase_rate}")
print(f"Collaborative Filtering Customer-based purchase rate: {CF_purchase_rate}")


# Profit Estimation and Growth

In [None]:
# Sum of sales_amt for 'GENM' products in the two modelled subcategories,
# for transactions dated 2017-01-01 through 2019-12-31
QUERY = """
SELECT sum(sales_amt)
FROM `machine_learning.transactions_clean` t JOIN `machine_learning.products` p ON t.prod_id = p.prod_id
WHERE prod_mfc_brand_cd = 'GENM' AND trans_dt between '2017-01-01' AND '2019-12-31' AND (prod_subcategory = 'Cereal Rte' OR prod_subcategory = 'Nutritional Portable')
"""

# API request, then load the single-row result into pandas
query_job = client.query(QUERY)
profit = query_job.to_dataframe()

# Show the result to confirm it looks as expected
profit.head()

In [None]:
# Work on a copy of the training data
data = df_train.copy()

# Subcategories to report on
subcategories = ['Cereal Rte', 'Nutritional Portable']

# subcategory -> share of its customers with all three flags equal to 1
proportion_of_all_features_one = {}

for subcat in subcategories:
    # Transactions belonging to this subcategory
    subcat_data = data[data['prod_subcategory'] == subcat]

    # Rows where the three engineered flags of this subcategory are all 1
    all_flags_set = (
        (subcat_data[f'{subcat}_high_spending_longtail'] == 1)
        & (subcat_data[f'{subcat}_high_frequency_longtail'] == 1)
        & (subcat_data[f'{subcat}_No_Full_value_customer'] == 1)
    )

    # Distinct customers among those rows, and in the subcategory overall
    count_all_features_one = subcat_data.loc[all_flags_set, 'cust_id'].nunique()
    total_customers = subcat_data['cust_id'].nunique()

    # Guard against an empty subcategory
    if total_customers > 0:
        proportion = count_all_features_one / total_customers
    else:
        proportion = 0
    proportion_of_all_features_one[subcat] = proportion

# Report the share for each subcategory
for subcat, proportion in proportion_of_all_features_one.items():
    print(f'Proportion of customers with all features set to 1 in {subcat}: {proportion:.2f}')


In [None]:
## Re-create the BigQuery client from the existing credentials
client = bigquery.Client(credentials=credentials)

# Sum of sales_amt over all transactions dated up to 2019-12-31 that have a
# matching row in the products table
QUERY = """
  SELECT sum(sales_amt)
  FROM `machine_learning.transactions_clean` t
  JOIN `machine_learning.products` p ON t.prod_id = p.prod_id
  WHERE t.trans_dt <= '2019-12-31'
"""

# Submit the query (API request)
query_job = client.query(QUERY)

# Block until the query finishes and load the result into a DataFrame
df = query_job.to_dataframe()

In [None]:
# NOTE(review): this hard-coded total replaces the `df` returned by the
# previous query - confirm that the literal matches the query result.
df = pd.DataFrame({'column': [3.270555e+07]})

# Scale the total by the two factors
# NOTE(review): the meaning of 0.55 and 0.0168 is not documented here - confirm.
result = df * 0.55 * 0.0168

# Plain two-decimal formatting instead of scientific notation, then show the
# computed estimate (not the unscaled input)
pd.options.display.float_format = '{:.2f}'.format
print(result)