# Modeling

In [1]:
import pandas as pd
from implicit.als import AlternatingLeastSquares
from scipy.sparse import  coo_matrix
from implicit.nearest_neighbours import bm25_weight

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# File name of the ratings CSV; the load cell reads it from the dataset/ directory.
dataset_name = "dataset4.csv"

In [None]:
# Read the interaction table (user_id, film_id, rating, liked) into memory.
data = pd.read_csv(f"dataset/{dataset_name}")
# Bare expression so the notebook renders a preview of the frame.
data

Unnamed: 0,user_id,film_id,rating,liked
0,/kyleharriman/,/film/about-elly/,4.5,0
1,/drowranger060/,/film/raiders-of-the-lost-ark/,5.0,0
2,/drowranger060/,/film/the-empire-strikes-back/,5.0,1
3,/kyleharriman/,/film/saw-3d/,1.5,0
4,/kanemutiny/,/film/funny-girl/,3.5,1
...,...,...,...,...
22336937,/bleary/,/film/grey-gardens/,3.0,0
22336938,/bleary/,/film/eastern-promises/,3.5,0
22336939,/bleary/,/film/mississippi-burning/,3.0,0
22336940,/bleary/,/film/incredibles-2/,3.5,0


In [3]:
import pandas as pd

# Collect every row of `data` that has a missing value in at least one column.
has_missing_value = data.isna().any(axis=1)
df_with_na = data.loc[has_missing_value]
df_with_na

Unnamed: 0,user_id,film_id,rating,liked


In [4]:
import pandas as pd
import numpy as np


# Hold out 100 whole users (seeded, sampled without replacement) as the test
# set; every interaction of a held-out user goes to test_df, the rest to train_df.
rng = np.random.default_rng(42)

test_users = rng.choice(data["user_id"].unique(), size=100, replace=False)

is_test_row = data["user_id"].isin(test_users)
test_df, train_df = data[is_test_row], data[~is_test_row]

In [None]:
# train_df was produced by boolean-indexing `data`; take an explicit copy
# before adding columns so the assignments below do not raise
# SettingWithCopyWarning and can never write through to `data`.
train_df = train_df.copy()

# Per-user mean rating, broadcast back to one value per row.
# NOTE(review): the mean is taken over all rows, including rows whose rating
# is <= 0 (which the next step treats as "no rating") - confirm that this is
# intended rather than a mean over rated films only.
user_means = train_df.groupby('user_id')['rating'].transform('mean')

# Rows without a positive rating fall back to the user's mean rating.
train_df['rating'] = np.where(
    train_df['rating'] > 0,
    train_df['rating'],
    user_means
)

# Rating relative to the user's own mean; the epsilon avoids division by zero.
train_df['ratio_score'] = train_df['rating'] / (user_means + 1e-9)

alpha = 40

# A "like" adds a fixed 1.5 bonus on top of the relative rating.
train_df['raw_score'] = 1 + train_df['ratio_score'] + (train_df['liked'] * 1.5)
# NOTE(review): `confidence` is computed here, but the matrix handed to the
# ALS fit cell is built from `ratio_score`, not from this column.
train_df['confidence'] = 1 + alpha * train_df['raw_score']

In [6]:
# Show any rows of train_df that contain a NaN in at least one column.
train_df[train_df.isna().any(axis=1)]

Unnamed: 0,user_id,film_id,rating,liked,ratio_score,raw_score,confidence


In [7]:
# Assign dense, zero-based integer indices to users and films in order of
# first appearance in train_df.
unique_users = train_df["user_id"].unique()
unique_films = train_df["film_id"].unique()
user_map = dict(zip(unique_users, range(len(unique_users))))
item_map = dict(zip(unique_films, range(len(unique_films))))

# Attach the integer indices as columns "u" (user) and "i" (film) on a new frame.
train_df = train_df.copy()
train_df["u"] = train_df["user_id"].map(user_map)
train_df["i"] = train_df["film_id"].map(item_map)

In [None]:
# Sparse user x item matrix whose entries are the per-user relative ratings.
R = coo_matrix(
    (train_df.ratio_score, (train_df.u, train_df.i)),
    shape=(len(user_map), len(item_map))
)

model = AlternatingLeastSquares(
    factors=64,
    regularization=0.05,
    iterations=50,
    # Fixed seed so that re-running the notebook reproduces the same factors.
    random_state=42,
)
# BM25-reweight the entries, then convert to CSR explicitly so that fit()
# receives the format it works on instead of converting internally.
R_weighted = bm25_weight(R, K1=100, B=0.5).tocsr()
model.fit(R_weighted)

In [9]:
import pickle

# Persist the fitted model together with the id -> index maps it was trained
# with, so the factor rows can be decoded again after loading.
artifact = dict(model=model, user_map=user_map, item_map=item_map)

with open("model/colaborative/model_v6.pkl", "wb") as f:
    pickle.dump(artifact, f)


In [8]:
import pickle
# SECURITY: pickle.load executes arbitrary code embedded in the file; only
# load artifacts produced by this project.
# NOTE(review): this loads model_v5 while the save cell writes model_v6 -
# confirm which version is meant to be evaluated.
with open('model/colaborative/model_v5.pkl', 'rb') as f :
    obj = pickle.load(f)
model = obj['model']
# The factor rows of the loaded model are indexed by the maps it was trained
# with, so prefer the maps stored in the artifact over whatever is currently
# in memory; otherwise film ids could silently be decoded to the wrong rows.
if 'user_map' in obj:
    user_map = obj['user_map']
if 'item_map' in obj:
    item_map = obj['item_map']

In [9]:
# Inverse of item_map: matrix column index -> film id.
id_to_film = dict(zip(item_map.values(), item_map.keys()))

# Work on a copy of the held-out rows and attach each film's column index;
# films that are absent from item_map end up with NaN in "i".
test_df = test_df.copy()
film_index = test_df["film_id"].map(item_map)
test_df.loc[:, "i"] = film_index

In [None]:
import numpy as np
from scipy.linalg import solve

def infer_user_vector(item_ids, ratings, item_factors, regularization=0.05):
    """Fit a latent vector for one user by ridge regression on item factors.

    Solves ``(Y^T Y + regularization * I) x = Y^T r`` where ``Y`` holds the
    rows of ``item_factors`` selected by ``item_ids`` and ``r`` is ``ratings``.

    Parameters
    ----------
    item_ids : array-like
        Row indices into ``item_factors``; cast to ``int``.
    ratings : array-like
        One target value per entry of ``item_ids``; cast to ``float``.
    item_factors : numpy.ndarray
        Matrix with one factor row per item.
    regularization : float, default 0.05
        Weight of the identity term added to the Gram matrix.

    Returns
    -------
    numpy.ndarray
        The solved user vector, with one entry per factor column.
    """
    idx = np.asarray(item_ids, dtype=int)
    targets = np.asarray(ratings, dtype=float)

    rated_factors = item_factors[idx]
    gram = rated_factors.T.dot(rated_factors)
    rhs = rated_factors.T.dot(targets)

    ridge = regularization * np.eye(gram.shape[0])
    return solve(gram + ridge, rhs)

In [14]:
# For every held-out user: fit a user vector from their ratings, score every
# film, and record three views of the resulting ranking.
#   all_recommendations: best-scored films the user has NOT logged (up to 1000)
#   ranked_seen:         the user's logged films, ordered by model score
#   all_ranked:          every film, ordered by model score
# NOTE(review): the user vector is fitted on the same ratings that
# ranked_seen is later compared against, so metrics computed from ranked_seen
# measure fit quality rather than held-out prediction quality.
all_recommendations = {}
ranked_seen = {}
all_ranked = {}
for user_id, g in test_df.groupby("user_id"):
    # Only films known to the model have a factor row.
    g = g[g.film_id.isin(item_map)]
    if len(g) < 2:
        continue

    # "i" may be stored as float (NaN for unknown films); after the filter
    # above every value is a valid index, so cast back to int.
    item_ids = g["i"].values.astype(int)
    ratings = g["rating"].values

    u_vec = infer_user_vector(item_ids, ratings, model.item_factors)
    scores = model.item_factors @ u_vec
    seen = set(item_ids.tolist())

    # Sort once (best score first) and derive all three views from it.
    ranked = np.argsort(scores)[::-1].tolist()
    top_unseen = [i for i in ranked if i not in seen][:1000]
    list_ranked = [i for i in ranked if i in seen]

    all_recommendations[user_id] = [id_to_film[i] for i in top_unseen]
    ranked_seen[user_id] = [id_to_film[i] for i in list_ranked]
    # Previously this stored the seen-only list a second time.
    all_ranked[user_id] = [id_to_film[i] for i in ranked]

In [53]:
import numpy as np
from scipy.sparse import coo_matrix, csr_matrix

def get_live_recommendations(user_history_data, model, item_map, N=10):
    """Recommend films for an ad-hoc user described only by their history.

    Parameters
    ----------
    user_history_data : mapping or DataFrame
        Must provide equally long 'rating', 'liked' and 'film_id' sequences.
    model : object
        Fitted recommender exposing ``item_factors`` and
        ``recommend(userid, user_items, N, recalculate_user,
        filter_already_liked_items)``.
    item_map : dict
        Film id -> column index used when the model was trained.
    N : int, default 10
        Number of recommendations requested from the model.

    Returns
    -------
    list
        Recommended film ids in the order returned by the model. Empty when
        none of the films in the history are present in ``item_map``.
    """
    ratings = np.asarray(user_history_data['rating'], dtype=float)
    likes = np.asarray(user_history_data['liked'], dtype=float)
    film_ids_raw = np.asarray(user_history_data['film_id'])

    # Keep only films the model knows: (position in history, column index).
    known = [
        (pos, item_map[fid])
        for pos, fid in enumerate(film_ids_raw)
        if fid in item_map
    ]
    if not known:
        # Nothing to build a user vector from.
        return []
    positions, item_indices = (list(part) for part in zip(*known))

    ratings = ratings[positions]
    likes = likes[positions]

    # NOTE(review): the user mean is fixed at 1 here, whereas the training
    # cell divides by each user's actual mean rating - confirm this is intended.
    current_user_mean = 1

    # Films without a positive rating get a proxy slightly below the mean.
    unrated_proxy = current_user_mean * 0.9
    rating_proxies = np.where(ratings > 0, ratings, unrated_proxy)

    ratio_scores = rating_proxies / (current_user_mean + 1e-9)

    alpha = 40
    raw_scores = 1 + ratio_scores + (likes * 1.5)
    confidences = 1 + alpha * raw_scores

    # Sparse index arrays must be integers (np.zeros defaults to float64).
    row_indices = np.zeros(len(item_indices), dtype=np.int64)
    col_indices = np.asarray(item_indices, dtype=np.int64)

    # Single-row user x item matrix for the ad-hoc user.
    user_interactions = csr_matrix(
        (confidences, (row_indices, col_indices)),
        shape=(1, model.item_factors.shape[0])
    )

    ids, _scores = model.recommend(
        userid=0,
        user_items=user_interactions,
        N=N,
        recalculate_user=True,
        filter_already_liked_items=True
    )

    reverse_map = {v: k for k, v in item_map.items()}
    return [reverse_map[i] for i in ids]

# Evaluate

In [38]:
# (user_id, film_id) -> rating for every held-out interaction.
rating_lookup = test_df.set_index(["user_id", "film_id"]).loc[:, "rating"].to_dict()

In [17]:
# Per user: a table pairing each logged film's position in the model ranking
# with its true rating and the rank implied by that rating.
user_rank_tables = {}

for user_id, films in ranked_seen.items():
    rows = []
    for rank, film_id in enumerate(films, start=1):
        r = rating_lookup.get((user_id, film_id))
        if r is not None:
            rows.append(dict(film_id=film_id, model_rank=rank, true_rating=r))

    # At least two films are needed for a ranking comparison.
    if len(rows) >= 2:
        df = pd.DataFrame(rows)

        # Highest rating gets rank 1; ties are broken by order of appearance.
        true_rank = df["true_rating"].rank(ascending=False, method="first")
        df["true_rank"] = true_rank.astype(int)

        df = df.sort_values("true_rank").reset_index(drop=True)
        user_rank_tables[user_id] = df


In [18]:
def dcg(rels):
    """Discounted cumulative gain of relevances given in ranked order.

    Uses the exponential gain ``2**rel - 1`` with a ``log2(position + 1)``
    discount, where positions start at 1. Accepts any array-like; an empty
    input yields 0.0.
    """
    rels = np.asarray(rels, dtype=float)
    discounts = np.log2(np.arange(2, len(rels) + 2))
    return np.sum((2**rels - 1) / discounts)

def ndcg(df, k=10):
    """Normalised DCG@k for one user's rank table.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns "model_rank" and "true_rating".
    k : int, default 10
        Cut-off applied to both the model ordering and the ideal ordering.

    Returns
    -------
    float
        DCG of the model ordering divided by DCG of the ideal ordering, or
        0.0 when the ideal DCG is zero (e.g. every rating is 0), which would
        otherwise produce NaN and poison any mean taken over users.
    """
    df_k = df.sort_values("model_rank").head(k)
    ideal = df.sort_values("true_rating", ascending=False).head(k)
    ideal_dcg = dcg(ideal["true_rating"].values)
    if ideal_dcg == 0:
        return 0.0
    return dcg(df_k["true_rating"].values) / ideal_dcg

# NDCG@10 for every user whose table has at least two rows, then the mean.
ndcgs = []
for table in user_rank_tables.values():
    if len(table) >= 2:
        ndcgs.append(ndcg(table))

np.mean(ndcgs)


np.float64(0.7775349208753373)

## Recommend

In [19]:
# First 20 entries of the stored recommendation list for one held-out user.
all_recommendations['/damydc/'][:20]

['/film/a-silent-voice-the-movie/',
 '/film/creed/',
 '/film/a-charlie-brown-christmas/',
 '/film/stand-by-me/',
 '/film/hacksaw-ridge/',
 '/film/turning-red/',
 '/film/chef/',
 '/film/look-back-2024/',
 '/film/battle-royale/',
 '/film/straight-outta-compton/',
 '/film/the-tale-of-the-princess-kaguya/',
 '/film/its-the-great-pumpkin-charlie-brown/',
 '/film/train-to-busan/',
 '/film/rocky/',
 '/film/the-breakfast-club/',
 '/film/chainsaw-man-the-movie-reze-arc/',
 '/film/the-iron-giant/',
 '/film/whisper-of-the-heart/',
 '/film/the-godfather-part-ii/',
 '/film/godzilla-minus-one/']

In [20]:
# Same user, this time through get_live_recommendations (model.recommend with recalculate_user=True), top 20.
get_live_recommendations(test_df[test_df['user_id']=='/damydc/'], model, item_map, N=20)

['/film/terrifier-3/',
 '/film/terrifier-2016/',
 '/film/terrifier-2/',
 '/film/zombieland/',
 '/film/shaun-of-the-dead/',
 '/film/wolf-man-2025/',
 '/film/saw-x/',
 '/film/friday-the-13th/',
 '/film/whiplash-2014/',
 '/film/la-la-land/',
 '/film/saw-ii/',
 '/film/a-minecraft-movie/',
 '/film/the-conjuring/',
 '/film/superbad/',
 '/film/insidious/',
 '/film/the-blair-witch-project/',
 '/film/the-conjuring-2/',
 '/film/ferris-buellers-day-off/',
 '/film/shelby-oaks/',
 '/film/scott-pilgrim-vs-the-world/']

In [12]:
# Held-out rows of this user, for eyeballing the recommendations above against their history.
test_df[test_df['user_id'] == '/damydc/']

Unnamed: 0,user_id,film_id,rating,liked,i
175974,/damydc/,/film/past-lives/,4.0,1,1073.0
175975,/damydc/,/film/the-boy-and-the-heron/,4.0,1,3068.0
175976,/damydc/,/film/sing-sing-2023/,5.0,1,2800.0
175977,/damydc/,/film/the-substance/,3.0,0,507.0
175978,/damydc/,/film/princess-mononoke/,4.0,1,900.0
...,...,...,...,...,...
4219538,/damydc/,/film/evil-dead-rise/,3.5,1,1780.0
4219541,/damydc/,/film/evil-dead/,4.5,1,591.0
4219544,/damydc/,/film/army-of-darkness/,4.0,1,608.0
4219545,/damydc/,/film/the-amityville-horror-2005/,2.0,0,15421.0


In [None]:

# Hand-picked films used as a synthetic user's history.
seeder_films = [
    "/film/x-2022/",
    "/film/pearl-2022/",
    "/film/evil-dead/",
    "/film/inside-2007/",
    "/film/i-saw-the-devil/",
    "/film/creep-2014/",
    "/film/barbarian-2022/",
    "/film/the-lost-boys/",
    "/film/evil-dead-rise/",
    "/film/us-2019/",
    "/film/a-classic-horror-story/",
    "/film/longlegs/",
    "/film/terrifier-2/",
    "/film/hell-house-llc/",
    "/film/a-nightmare-on-elm-street/",
    "/film/the-dark-and-the-wicked/",
    "/film/climax-2018/",
    "/film/brightburn/",
    "/film/30-days-of-night/",
    "/film/a-bay-of-blood/",
    "/film/sick-2022/",
]

# Every seed film is given the top rating and marked as liked.
df_seeder = pd.DataFrame({'film_id': seeder_films}).assign(rating=5.0, liked=1.0)
# Column index of each seed film in the model (NaN when the film is not in item_map).
df_seeder.loc[:, 'i'] = df_seeder['film_id'].map(item_map)
df_seeder

Unnamed: 0,film_id,rating,liked,i
0,/film/stand-by-me/,5.0,1.0,3282


In [55]:
# Top 100 recommendations for the synthetic user described by df_seeder.
get_live_recommendations(df_seeder, model, item_map, N=100)

['/film/dead-poets-society/',
 '/film/good-will-hunting/',
 '/film/beautiful-boy-2018/',
 '/film/the-perks-of-being-a-wallflower/',
 '/film/the-breakfast-club/',
 '/film/little-miss-sunshine/',
 '/film/ferris-buellers-day-off/',
 '/film/forrest-gump/',
 '/film/the-outsiders/',
 '/film/500-days-of-summer/',
 '/film/the-shawshank-redemption/',
 '/film/mid90s/',
 '/film/almost-famous/',
 '/film/brokeback-mountain/',
 '/film/the-notebook/',
 '/film/donnie-darko/',
 '/film/mysterious-skin/',
 '/film/juno/',
 '/film/the-basketball-diaries/',
 '/film/manchester-by-the-sea/',
 '/film/submarine/',
 '/film/stranger-things-5-the-finale/',
 '/film/the-goonies/',
 '/film/the-truman-show/',
 '/film/edward-scissorhands/',
 '/film/empire-records/',
 '/film/dinner-in-america/',
 '/film/requiem-for-a-dream/',
 '/film/flipped/',
 '/film/back-to-the-future/',
 '/film/the-karate-kid/',
 '/film/diary-of-a-wimpy-kid-rodrick-rules/',
 '/film/october-sky/',
 '/film/back-to-the-future-part-ii/',
 '/film/the-cur