In [1]:
import sqlite3
import pandas as pd
import numpy as np

In [2]:
def optimize_db(conn):
    """Run a fixed set of speed-oriented PRAGMA statements on a SQLite connection.

    The script sets ``synchronous = OFF``, ``journal_mode = MEMORY``,
    ``temp_store = MEMORY`` and ``cache_size = 1000000``, then commits.

    Args:
        conn: An open ``sqlite3.Connection``.

    Returns:
        None.
    """
    # NOTE(review): synchronous = OFF presumably trades crash-safety for
    # speed -- confirm that is acceptable for this database.
    pragma_script = '''
        PRAGMA synchronous = OFF;
        PRAGMA journal_mode = MEMORY;
        PRAGMA temp_store = MEMORY;
        PRAGMA cache_size = 1000000;
    '''
    # Connection.executescript creates a cursor and runs the script on it.
    conn.executescript(pragma_script)
    conn.commit()

In [None]:
# Connect to the SQLite database and make sure the `recommendations` table
# exists. `db_path` is defined here and reused by later cells.
db_path = './Retrieval.db'
conn = sqlite3.connect(db_path)
try:
    # try/finally guarantees the connection is released even if the PRAGMA
    # script or the CREATE TABLE statement raises.
    cursor = conn.cursor()
    optimize_db(conn)

    # One row per (model, user_id, business_id); the UNIQUE constraint
    # rejects duplicate triples.
    cursor.execute('''CREATE TABLE IF NOT EXISTS recommendations (
    model TEXT,
    user_id TEXT,
    business_id TEXT,
    real_label INTEGER,
    UNIQUE (model, user_id, business_id)
);
''')
    # Commit explicitly so table creation does not rely on implicit
    # transaction handling.
    conn.commit()
finally:
    # Close the connection when done
    conn.close()

<sqlite3.Cursor at 0x1f40ffafdc0>

In [5]:
conn = sqlite3.connect(db_path)
query = "SELECT * FROM recommendations"

# Read the table in chunks of 10,000 rows, but collect the chunks and
# concatenate ONCE: calling pd.concat inside the loop re-copies everything
# read so far on every iteration (quadratic) and seeds the result with an
# empty object-dtype frame.
try:
    chunks = list(pd.read_sql_query(query, conn, chunksize=10000))
finally:
    # Release the connection even if the read fails part-way through.
    conn.close()

if chunks:
    # ignore_index=True gives the combined frame one unique 0..n-1 index
    # (each chunk presumably carries its own 0-based index -- without this
    # the labels would repeat across chunks).
    df = pd.concat(chunks, ignore_index=True)
else:
    # No chunks were produced: fall back to an empty frame with the columns
    # "model", "user_id", "business_id", "real_label" so later cells still run.
    df = pd.DataFrame(columns=["model", "user_id", "business_id", "real_label"])

In [12]:
# ----------------------------
# 1. Filter Positive and Non-Positive Recommendations
# ----------------------------
# df is expected to carry the columns: model, user_id, business_id, real_label.
# Rows are split on real_label: exactly 1 -> positive, exactly 0 -> non-positive
# (rows with any other value land in neither frame).
df_positive = df.loc[df['real_label'].eq(1)]
df_non_positive = df.loc[df['real_label'].eq(0)]

In [13]:
# ----------------------------
# 2. Overlap: Same (User, Item) Pairs (for Positive Recommendations)
# ----------------------------
# Create sets of (user_id, business_id) pairs for each model.
# zip() over the two columns builds the same tuples as a row-wise
# .apply(tuple, axis=1) without constructing a Series per row.
positive_itemcf = df_positive[df_positive['model'] == 'ItemCF']
positive_dssm   = df_positive[df_positive['model'] == 'DSSM']

pairs_itemcf = set(zip(positive_itemcf['user_id'], positive_itemcf['business_id']))
pairs_dssm   = set(zip(positive_dssm['user_id'], positive_dssm['business_id']))

# Intersection of pairs and compute statistics
overlap_pairs = pairs_itemcf.intersection(pairs_dssm)
num_overlap_pairs = len(overlap_pairs)

total_pairs_itemcf = len(pairs_itemcf)
total_pairs_dssm   = len(pairs_dssm)

# Share of each model's positive pairs that the other model also produced;
# reported as 0 when a model has no positive pairs (avoids division by zero).
percent_overlap_itemcf = (num_overlap_pairs / total_pairs_itemcf * 100) if total_pairs_itemcf > 0 else 0
percent_overlap_dssm   = (num_overlap_pairs / total_pairs_dssm * 100) if total_pairs_dssm > 0 else 0

print("Overlap for same (user, item) pairs (Positive Recommendations):")
print(f"  Number of overlapping pairs: {num_overlap_pairs}")
print(f"  Percentage in ItemCF: {percent_overlap_itemcf:.2f}%")
print(f"  Percentage in DSSM:   {percent_overlap_dssm:.2f}%\n")

Overlap for same (user, item) pairs (Positive Recommendations):
  Number of overlapping pairs: 3
  Percentage in ItemCF: 1.13%
  Percentage in DSSM:   2.29%



In [14]:
# ----------------------------
# 3. Overlap: Items (Irrespective of User)
# ----------------------------
# The three sub-sections below run the same computation on different frames,
# so the shared steps live in two small private helpers.
def _model_items(frame, model_name):
    """Return the set of distinct business_id values `frame` holds for `model_name`."""
    return set(frame.loc[frame['model'] == model_name, 'business_id'])


def _percent_of(part, whole):
    """Return `part` as a percentage of `whole`, or 0 when `whole` is 0."""
    return (part / whole * 100) if whole > 0 else 0


# 3a. For Positive Recommendations:
items_itemcf_pos = _model_items(df_positive, 'ItemCF')
items_dssm_pos   = _model_items(df_positive, 'DSSM')

overlap_items_pos = items_itemcf_pos.intersection(items_dssm_pos)
num_overlap_items_pos = len(overlap_items_pos)

total_items_itemcf_pos = len(items_itemcf_pos)
total_items_dssm_pos   = len(items_dssm_pos)

percent_items_itemcf_pos = _percent_of(num_overlap_items_pos, total_items_itemcf_pos)
percent_items_dssm_pos   = _percent_of(num_overlap_items_pos, total_items_dssm_pos)

print("Overlap for items (Positive Recommendations, irrespective of user):")
print(f"  Number of overlapping items: {num_overlap_items_pos}")
print(f"  Percentage in ItemCF: {percent_items_itemcf_pos:.2f}%")
print(f"  Percentage in DSSM:   {percent_items_dssm_pos:.2f}%\n")

# 3b. For Non-Positive Recommendations:
items_itemcf_non = _model_items(df_non_positive, 'ItemCF')
items_dssm_non   = _model_items(df_non_positive, 'DSSM')

overlap_items_non = items_itemcf_non.intersection(items_dssm_non)
num_overlap_items_non = len(overlap_items_non)

total_items_itemcf_non = len(items_itemcf_non)
total_items_dssm_non   = len(items_dssm_non)

percent_items_itemcf_non = _percent_of(num_overlap_items_non, total_items_itemcf_non)
percent_items_dssm_non   = _percent_of(num_overlap_items_non, total_items_dssm_non)

print("Overlap for items (Non-Positive Recommendations, irrespective of user):")
print(f"  Number of overlapping non-positive items: {num_overlap_items_non}")
print(f"  Percentage in ItemCF: {percent_items_itemcf_non:.2f}%")
print(f"  Percentage in DSSM:   {percent_items_dssm_non:.2f}%\n")

# 3c. Overall Item Overlap (All Recommendations, regardless of real_label):
items_itemcf_all = _model_items(df, 'ItemCF')
items_dssm_all   = _model_items(df, 'DSSM')

overlap_items_all = items_itemcf_all.intersection(items_dssm_all)
num_overlap_items_all = len(overlap_items_all)

total_items_itemcf_all = len(items_itemcf_all)
total_items_dssm_all   = len(items_dssm_all)

percent_items_itemcf_all = _percent_of(num_overlap_items_all, total_items_itemcf_all)
percent_items_dssm_all   = _percent_of(num_overlap_items_all, total_items_dssm_all)

print("Overall overlap for items (All Recommendations, irrespective of label):")
print(f"  Number of overlapping items: {num_overlap_items_all}")
print(f"  Percentage in ItemCF: {percent_items_itemcf_all:.2f}%")
print(f"  Percentage in DSSM:   {percent_items_dssm_all:.2f}%\n")

Overlap for items (Positive Recommendations, irrespective of user):
  Number of overlapping items: 8
  Percentage in ItemCF: 3.10%
  Percentage in DSSM:   6.40%

Overlap for items (Non-Positive Recommendations, irrespective of user):
  Number of overlapping non-positive items: 17366
  Percentage in ItemCF: 59.73%
  Percentage in DSSM:   46.98%

Overall overlap for items (All Recommendations, irrespective of label):
  Number of overlapping items: 17374
  Percentage in ItemCF: 59.74%
  Percentage in DSSM:   47.00%



In [15]:
# ----------------------------
# 4. Unique User Overlap for All Recommendations
# ----------------------------
# Distinct user_id values appearing in each model's rows, and those in both.
users_itemcf_all = set(df.loc[df['model'] == 'ItemCF', 'user_id'])
users_dssm_all   = set(df.loc[df['model'] == 'DSSM', 'user_id'])
overlap_users_all = users_itemcf_all & users_dssm_all

# Shared users as a percentage of each model's own user set; 0 when that
# model has no users at all.
if len(users_itemcf_all) > 0:
    percent_overlap_itemcf_all = len(overlap_users_all) / len(users_itemcf_all) * 100
else:
    percent_overlap_itemcf_all = 0

if len(users_dssm_all) > 0:
    percent_overlap_dssm_all = len(overlap_users_all) / len(users_dssm_all) * 100
else:
    percent_overlap_dssm_all = 0

print(
    "Unique user overlap (All Recommendations):",
    f"  ItemCF unique users: {len(users_itemcf_all)}",
    f"  DSSM unique users:   {len(users_dssm_all)}",
    f"  Overlapping users:   {len(overlap_users_all)}",
    f"  Percentage overlap in ItemCF: {percent_overlap_itemcf_all:.2f}%",
    f"  Percentage overlap in DSSM:   {percent_overlap_dssm_all:.2f}%\n",
    sep="\n",
)

Unique user overlap (All Recommendations):
  ItemCF unique users: 848
  DSSM unique users:   1000
  Overlapping users:   546
  Percentage overlap in ItemCF: 64.39%
  Percentage overlap in DSSM:   54.60%



In [16]:
# ----------------------------
# 5. Additional Insights for Retrieval Quality
# ----------------------------
# Rows of each model, filtered once and reused by 5a and 5b.
itemcf_rows = df[df['model'] == 'ItemCF']
dssm_rows   = df[df['model'] == 'DSSM']

# 5a. Jaccard Similarity of Recommended Items per User (for users common to both models)
# Map user_id -> set of business_id values recommended by that model.
recs_itemcf = itemcf_rows.groupby('user_id')['business_id'].apply(set).to_dict()
recs_dssm   = dssm_rows.groupby('user_id')['business_id'].apply(set).to_dict()

common_users = set(recs_itemcf).intersection(recs_dssm)
# (ItemCF set, DSSM set) for every shared user; reused by 5c.
user_pairs = [(recs_itemcf[uid], recs_dssm[uid]) for uid in common_users]

# |A & B| / |A | B| per user, skipping users whose union is empty.
union_sizes = [len(cf_items | dssm_items) for cf_items, dssm_items in user_pairs]
jaccard_scores = [
    len(cf_items & dssm_items) / union_size
    for (cf_items, dssm_items), union_size in zip(user_pairs, union_sizes)
    if union_size > 0
]
if jaccard_scores:
    avg_jaccard = np.mean(jaccard_scores)
else:
    avg_jaccard = 0
print(f"Average Jaccard similarity of recommended items per user (all items): {avg_jaccard:.2f}")

# 5b. Distribution: Average Number of Recommended Items per User
items_per_user_itemcf = itemcf_rows.groupby('user_id').size()
items_per_user_dssm   = dssm_rows.groupby('user_id').size()
print("Average number of recommended items per user:")
print(f"  ItemCF: {items_per_user_itemcf.mean():.2f}")
print(f"  DSSM:   {items_per_user_dssm.mean():.2f}")

# 5c. Per-User Overlap Ratio of Recommended Items
# Fraction of a user's ItemCF items that DSSM also recommended to that user.
overlap_ratios = [
    len(cf_items & dssm_items) / len(cf_items)
    for cf_items, dssm_items in user_pairs
    if len(cf_items) > 0
]
if overlap_ratios:
    avg_overlap_ratio = np.mean(overlap_ratios)
else:
    avg_overlap_ratio = 0
print(f"Average per-user overlap ratio (based on ItemCF recommendations): {avg_overlap_ratio * 100:.2f}%\n")

Average Jaccard similarity of recommended items per user (all items): 0.01
Average number of recommended items per user:
  ItemCF: 201.77
  DSSM:   5000.00
Average per-user overlap ratio (based on ItemCF recommendations): 14.58%



In [17]:

# ----------------------------
# 6. Suggestions for Further Insights
# ----------------------------
# Static list of follow-up analyses, printed one per line; nothing is computed here.
further_insights = (
    "Additional Insights to Consider:",
    "1. Ranking Comparison: If ranking information is available, compare the average rank differences for overlapping items.",
    "2. Recall/Precision Analysis: Using any available ground truth interactions, evaluate recall, precision, and MRR for each model.",
    "3. Coverage Analysis: Calculate the overall coverage, i.e., what fraction of all available businesses each model retrieves.",
    "4. Diversity Measures: Analyze the diversity of recommendations per user (e.g., by category or location) to see if one model offers more varied results.",
    "5. User-Level Disagreement: Identify users for whom the models disagree significantly (e.g., very low overlap) and analyze common characteristics.",
)
for insight_line in further_insights:
    print(insight_line)


Additional Insights to Consider:
1. Ranking Comparison: If ranking information is available, compare the average rank differences for overlapping items.
2. Recall/Precision Analysis: Using any available ground truth interactions, evaluate recall, precision, and MRR for each model.
3. Coverage Analysis: Calculate the overall coverage, i.e., what fraction of all available businesses each model retrieves.
4. Diversity Measures: Analyze the diversity of recommendations per user (e.g., by category or location) to see if one model offers more varied results.
5. User-Level Disagreement: Identify users for whom the models disagree significantly (e.g., very low overlap) and analyze common characteristics.
