In [1]:
import argparse
import csv
import math
from collections import defaultdict

In [2]:
# Load the training interactions. Each line is split on whitespace: the first
# token is the user id and every following token is an item id.
user_items = {}
with open("train-1.txt", "r", encoding="utf-8") as handle:
    for raw_line in handle:
        tokens = raw_line.split()
        # Lines with fewer than two tokens carry no items (or are blank): skip them.
        if len(tokens) >= 2:
            # A set already removes repeated item ids within one line.
            user_items[tokens[0]] = set(tokens[1:])

In [3]:
# Item-item similarity: Pearson correlation between the binary "user interacted
# with item" vectors of every pair of items that co-occur for at least one user.
TOP_K_NEIGHBORS = 100  # neighbours kept per item after ranking by |similarity|

N = len(user_items)  # number of users
co_counts = defaultdict(lambda: defaultdict(int))  # co_counts[i][j] with i < j: users having both items
item_freq = defaultdict(int)  # item -> number of users who interacted with it
items_all = set()

for u, items in user_items.items():
    if not items:
        continue
    items_list = sorted(items)  # stable ordering so each pair is counted once, as i < j
    for idx, i in enumerate(items_list):
        item_freq[i] += 1
        items_all.add(i)
        for j in items_list[idx + 1:]:
            co_counts[i][j] += 1

# Build symmetric similarities
neighbors = defaultdict(list)  # item -> list[(other_item, sim)]
# Only pairs with a co-occurrence count > 0 are present in co_counts.
for i, row in co_counts.items():
    a = item_freq[i]
    for j, c in row.items():
        b = item_freq[j]
        # An item seen by no user or by every user has zero variance, so the
        # correlation is undefined: skip the pair instead of dividing by zero.
        if a == 0 or b == 0 or a == N or b == N:
            continue
        denom = math.sqrt(a * (1.0 - a / float(N)) * b * (1.0 - b / float(N)))
        if denom == 0.0:
            continue
        sim = (c - (a * b) / float(N)) / denom
        if sim != 0.0:
            neighbors[i].append((j, sim))
            neighbors[j].append((i, sim))

# Items that never co-occurred with others still get an (empty) entry.
for it in items_all:
    neighbors.setdefault(it, [])

# Keep only top-k by |similarity| for stability
for it, nbrs in neighbors.items():
    nbrs.sort(key=lambda x: abs(x[1]), reverse=True)
    neighbors[it] = nbrs[:TOP_K_NEIGHBORS]

In [None]:
# Score every unseen item for each user from the neighbours of the items the
# user already has, then keep the best TOP_N item ids per user.
TOP_N = 20  # recommendations kept per user

recs = dict()  # user -> list of recommended item ids, best first
for u, items_u in user_items.items():
    scores = defaultdict(float)  # candidate item -> sum of similarities
    norm = defaultdict(float)    # candidate item -> sum of |similarity|

    for i in items_u:
        for j, sim in neighbors.get(i, []):
            if j in items_u:
                continue  # never recommend something the user already has
            scores[j] += sim
            norm[j] += abs(sim)

    # Normalized score: signed similarity mass divided by absolute mass.
    ranked = []
    for j, s in scores.items():
        denom = norm[j] if norm[j] > 0 else 1.0
        ranked.append((j, s / denom))

    # Highest score first; ties are broken by item id so the result does not
    # depend on the iteration order of the set `items_u`.
    ranked.sort(key=lambda x: (-x[1], x[0]))

    # Store the result: without this assignment `recs` would stay empty.
    recs[u] = [j for j, _ in ranked[:TOP_N]]

In [9]:
def write_output_csv(recs, out_path):
    """Write recommendations to a two-column CSV file.

    Parameters
    ----------
    recs : mapping
        User id -> iterable of recommended items.
    out_path : str
        Destination path; the file is created or overwritten.

    The file starts with the header row ``user_id,recommendations``. Each
    following row holds one user id and that user's items, converted with
    ``str`` and joined by single spaces.
    """
    header = ["user_id", "recommendations"]
    with open(out_path, "w", newline="", encoding="utf-8") as handle:
        out = csv.writer(handle)
        out.writerow(header)
        out.writerows(
            [user, " ".join(str(item) for item in recommended)]
            for user, recommended in recs.items()
        )

In [None]:
def _read_user_items(path):
    """Parse "<user> <item> <item> ..." lines into {user: set(items)}; users without items are skipped."""
    user_items = {}
    with open(path, "r", encoding="utf-8") as handle:
        for line in handle:
            parts = line.split()
            if len(parts) >= 2:
                user_items[parts[0]] = set(parts[1:])
    return user_items


def _build_item_similarity(user_items, k_neighbors):
    """Return {item: [(other_item, sim), ...]} with at most k_neighbors entries, strongest |sim| first."""
    n_users = len(user_items)
    item_freq = defaultdict(int)                       # item -> number of users with it
    co_counts = defaultdict(lambda: defaultdict(int))  # co_counts[i][j], i < j -> users with both

    for items in user_items.values():
        ordered = sorted(items)  # fixed order so every pair is counted once
        for idx, i in enumerate(ordered):
            item_freq[i] += 1
            for j in ordered[idx + 1:]:
                co_counts[i][j] += 1

    neighbors = {item: [] for item in item_freq}
    for i, row in co_counts.items():
        a = item_freq[i]
        for j, c in row.items():
            b = item_freq[j]
            denom = math.sqrt(a * (1.0 - a / n_users) * b * (1.0 - b / n_users))
            if denom == 0.0:
                # An item held by every user has zero variance: correlation undefined.
                continue
            sim = (c - a * b / n_users) / denom
            if sim != 0.0:
                neighbors[i].append((j, sim))
                neighbors[j].append((i, sim))

    return {
        item: sorted(nbrs, key=lambda pair: abs(pair[1]), reverse=True)[:k_neighbors]
        for item, nbrs in neighbors.items()
    }


def _generate_recommendations(user_items, neighbors, topN):
    """Return {user: [item, ...]} holding up to topN unseen items ranked by normalized similarity score."""
    recs = {}
    for user, owned in user_items.items():
        scores = defaultdict(float)  # candidate -> sum of similarities
        norm = defaultdict(float)    # candidate -> sum of |similarity|
        for i in owned:
            for j, sim in neighbors.get(i, ()):
                if j in owned:
                    continue
                scores[j] += sim
                norm[j] += abs(sim)
        ranked = [(j, s / norm[j] if norm[j] > 0 else s) for j, s in scores.items()]
        # Best score first; item id breaks ties so the output is reproducible.
        ranked.sort(key=lambda pair: (-pair[1], pair[0]))
        recs[user] = [j for j, _ in ranked[:topN]]
    return recs


def main(argv=None):
    """Command-line entry point: read interactions, build item neighbours, write recommendations.

    Parameters
    ----------
    argv : list of str, optional
        Arguments to parse instead of ``sys.argv[1:]`` (the default when None).

    The whole pipeline is driven by the parsed arguments, so the result does
    not depend on notebook-level variables such as a pre-existing ``recs``.
    """
    parser = argparse.ArgumentParser(description="Item-Based kNN (Pearson) recommender for implicit positive interactions.")
    parser.add_argument("--input", type=str, required=True, help="Path to input interactions file (txt).")
    parser.add_argument("--output", type=str, required=True, help="Path to output CSV.")
    parser.add_argument("--k", type=int, default=50, help="Max neighbors per item (default: 50).")
    parser.add_argument("--topN", type=int, default=20, help="Recommendations per user (default: 20).")
    args = parser.parse_args(argv)

    # Negative values would silently slice from the wrong end of the lists.
    if args.k < 1:
        parser.error("--k must be a positive integer")
    if args.topN < 1:
        parser.error("--topN must be a positive integer")

    user_items = _read_user_items(args.input)
    neighbors = _build_item_similarity(user_items, k_neighbors=args.k)
    recs = _generate_recommendations(user_items, neighbors, topN=args.topN)
    write_output_csv(recs, args.output)

In [None]:
# Script entry point: call main() only when this file is executed as the main
# module, not when it is imported.
# NOTE(review): presumably this also fires when the cell is run inside a
# notebook kernel, where main() would parse the kernel's own command line and
# exit because --input/--output are missing; confirm how this is meant to run.
if __name__ == "__main__":
    main()