In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (e.g. the project's `src` package used below) are re-imported before each
# cell executes and edits to them are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Move the working directory one level up; the relative "data"/"results"
# folders and the `src` imports used below are resolved from there.
# NOTE(review): the path is relative, so re-running this cell in the same
# kernel moves up one more level each time -- run it once per kernel.
%cd ..

/home/iovcharenko/Documents/NotWork/UCU/liner-algebra/ucu-linear-algebra-final-project


In [3]:
import json

import numpy as np
import pandas as pd

from pathlib import Path
from IPython.display import Markdown

from src.metrics import rmse
from src.utils import movies_info_preproc


In [4]:
# Folders used by the notebook, all relative to the current working directory.
data_folder, results_folder, figs_folder = (
    Path(folder_name) for folder_name in ("data", "results", "figs")
)

In [5]:
# Predictions file of the "15-02-21" run (one column per algorithm is read
# from it further down in the notebook).
results_df = pd.read_csv(results_folder / "15-02-21" / "val_preds.csv")

subset_folder = data_folder / "subsets" / "low-sparsity"
meta_folder = data_folder / "meta"

# Rating records; the train/val frames are copied out *before* any metadata
# is joined onto `df`, so they keep only the original record columns.
df = pd.read_csv(subset_folder / "records.csv")
tr_df = df.loc[df["split"] == "train"].copy()
val_df = df.loc[df["split"] == "val"].copy()

# Join the subset's movie table (records.movie_id <-> movies.subset_id) and
# drop the now-redundant join key.
movies_df = pd.read_csv(subset_folder / "movies.csv")
df = (
    df
    .merge(movies_df, left_on="movie_id", right_on="subset_id")
    .drop(columns="subset_id")
)

# Join release date and title via netflix_id. The titles file is read without
# a header row; its own `movie_id` column clashes with the records' one, so it
# is suffixed with "_copy" and then dropped.
movies_titles_df = pd.read_csv(
    meta_folder / "movie_titles.csv",
    names=["movie_id", "release_date", "movie_name"],
    encoding="iso-8859-1",
)
df = (
    df
    .merge(
        movies_titles_df,
        left_on="netflix_id",
        right_on="movie_id",
        suffixes=("", "_copy"),
    )
    .drop(columns="movie_id_copy")
)

# Extra movie metadata, keyed by title. Attach the subset `movie_id` to every
# title that occurs in the merged records (inner join: other titles are dropped).
movies_info_df = movies_info_preproc(pd.read_csv(meta_folder / "movie_info.csv"))
name_id_pairs = (
    df.groupby(["movie_name", "movie_id"])
    .size()
    .reset_index()
    .drop(columns=0)
)
movies_info_df = movies_info_df.merge(name_id_pairs, on="movie_name")
# movies_info_df["quick_desc"] = movies_info_df.apply(lambda r: f"{r.movie_name} ({r.genre})", axis=1)

def collapse_momovie_name(mn):
    """Shorten a movie title for display.

    Titles of up to 23 characters are returned unchanged; longer ones are cut
    to their first 20 characters followed by "...", i.e. 23 characters in total.
    """
    return mn if len(mn) <= 23 else f"{mn[:20]}..."
def _quick_desc(row):
    """Build the short "<title> (<first listed genre>)" label for one movie row."""
    first_genre = row.genre.split(",")[0]
    return f"{collapse_momovie_name(row.movie_name)} ({first_genre})"


movies_info_df["quick_desc"] = movies_info_df.apply(_quick_desc, axis=1)

In [6]:
def df_with_meta(df):
    """Return the rows of ``df`` whose ``movie_id`` occurs in ``movies_info_df``.

    The result is re-indexed from 0; ``df`` itself is left untouched.
    """
    known_movie_ids = movies_info_df.movie_id.unique()
    has_meta = df.movie_id.isin(known_movie_ids)
    return df[has_meta].reset_index(drop=True)

def get_user_top_movies(user, k, df, rating_field="rating", movies_info=None):
    """Return the ``k`` lowest- and ``k`` highest-scored movies of one user.

    Parameters
    ----------
    user : object with a ``customer_id`` attribute
        E.g. one row of a per-user score table.
    k : int
        Number of movies to return on each side. ``k == 0`` yields two empty
        frames.
    df : pandas.DataFrame
        Must contain ``customer_id``, ``movie_id``, ``date`` and
        ``rating_field`` columns.
    rating_field : str, default "rating"
        Column to rank by (a true rating or an algorithm's prediction).
    movies_info : pandas.DataFrame, optional
        Lookup with ``movie_id`` and ``quick_desc`` columns. Defaults to the
        notebook-level ``movies_info_df``.

    Returns
    -------
    (low_movies, top_movies) : tuple of pandas.DataFrame
        Each has the columns ``movie`` (the ``quick_desc`` label, NaN when the
        movie is missing from the lookup) and ``rating``, ordered by
        descending rating. If the user has fewer than ``2 * k`` rows the two
        frames overlap.
    """
    if movies_info is None:
        movies_info = movies_info_df

    # Keep a single label per movie and only the columns that are needed, so
    # the left merge below can neither multiply rows nor clash on column names.
    labels = movies_info[["movie_id", "quick_desc"]].drop_duplicates(subset="movie_id")

    # Ascending by score, ties broken by date.
    user_df = df[df.customer_id == user.customer_id].sort_values([rating_field, "date"])

    def _labelled(part):
        """Attach labels to ``part`` and order it from highest to lowest score."""
        labelled = part[["movie_id", rating_field]].merge(
            labels, on="movie_id", how="left"
        )[["quick_desc", rating_field]]
        labelled.columns = ["movie", "rating"]
        # A stable sort keeps the date-based order among equal scores.
        return labelled.sort_values(
            "rating", ascending=False, kind="mergesort"
        ).reset_index(drop=True)

    # head/tail instead of iloc[:k] / iloc[-k:]: `iloc[-0:]` would return every
    # row for k == 0, whereas tail(0) is empty.
    return _labelled(user_df.head(k)), _labelled(user_df.tail(k))




In [7]:
algorithms = ['als-svd', 'funk-svd', 'nnmf', 'neural-svd', 'surprise-svd']
info = {algo: {} for algo in algorithms}

TOP_K = 3  # number of movies listed in every column of the summary tables
PANEL_KEYS = ["Worst rated", "Top rated", "Recommended"]

# Loop-invariant: restrict both frames to movies with metadata once, instead
# of recomputing the same filter several times per algorithm.
train_with_meta = df_with_meta(tr_df)
preds_with_meta = df_with_meta(results_df)


def _profile_user(user, algo_name):
    """Return (worst-rated history, best-rated history, top predictions) for ``user``.

    The history comes from the training ratings; the predictions are the
    ``TOP_K`` highest values of the ``algo_name`` column in the results frame.
    """
    hist_low, hist_top = get_user_top_movies(user, TOP_K, train_with_meta, "rating")
    _, recommended = get_user_top_movies(user, TOP_K, preds_with_meta, algo_name)
    return hist_low, hist_top, recommended


def _profile_to_record(user, profile):
    """Convert a user profile into the JSON-friendly entry stored in ``info``."""
    hist_low, hist_top, recommended = profile
    return {
        "rmse": user.rmse_score,
        "worst_rated_hist": hist_low.to_dict(orient="records"),
        "best_rated_hist": hist_top.to_dict(orient="records"),
        "recommended": recommended.to_dict(orient="records"),
    }


def _profile_to_panel(profile):
    """Lay the three frames of a profile side by side under ``PANEL_KEYS``."""
    return pd.concat(list(profile), axis=1, keys=PANEL_KEYS)


for algo_name in algorithms:
    # Per-user RMSE, ascending: first row = best predicted user, last = worst.
    # Only the two needed columns are handed to `apply`, so the grouping
    # column is not passed to the lambda.
    score_per_user = (
        results_df
        .groupby("customer_id")[["rating", algo_name]]
        .apply(lambda group: rmse(group["rating"], group[algo_name]))
        .rename("rmse_score")
        .sort_values()
        .reset_index()
    )

    best_user = score_per_user.iloc[0]
    worst_user = score_per_user.iloc[-1]
    best_profile = _profile_user(best_user, algo_name)
    worst_profile = _profile_user(worst_user, algo_name)

    info[algo_name] = {
        "best_user": _profile_to_record(best_user, best_profile),
        "worst_user": _profile_to_record(worst_user, worst_profile),
    }

    display(Markdown(f"### {algo_name}"))
    display(
        pd.concat(
            [_profile_to_panel(best_profile), _profile_to_panel(worst_profile)],
            axis=0,
            keys=["Best predicted user", "Worst predicted user"],
        ).round(3)
    )


### als-svd

Unnamed: 0_level_0,Unnamed: 1_level_0,Worst rated,Worst rated,Top rated,Top rated,Recommended,Recommended
Unnamed: 0_level_1,Unnamed: 1_level_1,movie,rating,movie,rating,movie,rating
Best predicted user,0,Ghost (Fantasy),5.0,Swimming Pool (Crime),5.0,The Sandlot (Comedy),5.0
Best predicted user,1,Thin Line Between Lo... (Comedy),5.0,Halloween II (Horror),5.0,The Family Man (Comedy),5.0
Best predicted user,2,Daredevil (Fantasy),4.0,Along Came Polly (Comedy),5.0,Don't Say a Word (Thriller),5.0
Worst predicted user,0,Robin and Marian (Action),1.0,Sixteen Candles (Comedy),5.0,Shrek 2 (Animation),4.746
Worst predicted user,1,Casualties of War (Drama),1.0,All of Me (Comedy),5.0,Lethal Weapon (Adventure),4.504
Worst predicted user,2,Bowling for Columbine (Documentary),1.0,Moonstruck (Comedy),5.0,Ghost (Fantasy),4.105


### funk-svd

Unnamed: 0_level_0,Unnamed: 1_level_0,Worst rated,Worst rated,Top rated,Top rated,Recommended,Recommended
Unnamed: 0_level_1,Unnamed: 1_level_1,movie,rating,movie,rating,movie,rating
Best predicted user,0,Ghost (Fantasy),5.0,Swimming Pool (Crime),5.0,Coach Carter (Drama),5.0
Best predicted user,1,Thin Line Between Lo... (Comedy),5.0,Halloween II (Horror),5.0,The Sandlot (Comedy),5.0
Best predicted user,2,Daredevil (Fantasy),4.0,Along Came Polly (Comedy),5.0,American Beauty (Drama),5.0
Worst predicted user,0,Robin and Marian (Action),1.0,Sixteen Candles (Comedy),5.0,Shrek 2 (Animation),4.043
Worst predicted user,1,Casualties of War (Drama),1.0,All of Me (Comedy),5.0,Lethal Weapon (Adventure),3.939
Worst predicted user,2,Bowling for Columbine (Documentary),1.0,Moonstruck (Comedy),5.0,Life as a House (Drama),3.902


### nnmf

Unnamed: 0_level_0,Unnamed: 1_level_0,Worst rated,Worst rated,Top rated,Top rated,Recommended,Recommended
Unnamed: 0_level_1,Unnamed: 1_level_1,movie,rating,movie,rating,movie,rating
Best predicted user,0,S.W.A.T. (Action),2.0,The Professional (Crime),5.0,The Godfather (Drama),4.088
Best predicted user,1,Lethal Weapon (Adventure),2.0,Harold and Kumar Go ... (Comedy),4.0,Reservoir Dogs (Crime),4.052
Best predicted user,2,Ghost Ship (Horror),2.0,Coffee and Cigarettes (Comedy),4.0,American Beauty (Drama),3.917
Worst predicted user,0,End of Days (Action),1.0,Planes (Animation),5.0,The Godfather (Drama),3.851
Worst predicted user,1,Dracula (Romance),1.0,The Talented Mr. Ripley (Thriller),5.0,Ghost Dog: The Way o... (Crime),3.66
Worst predicted user,2,One True Thing (Drama),1.0,The Missing (Thriller),5.0,Nothing to Lose (Action),3.613


### neural-svd

Unnamed: 0_level_0,Unnamed: 1_level_0,Worst rated,Worst rated,Top rated,Top rated,Recommended,Recommended
Unnamed: 0_level_1,Unnamed: 1_level_1,movie,rating,movie,rating,movie,rating
Best predicted user,0,Ghost (Fantasy),5.0,Swimming Pool (Crime),5.0,The Family Man (Comedy),5.0
Best predicted user,1,Thin Line Between Lo... (Comedy),5.0,Halloween II (Horror),5.0,Don't Say a Word (Thriller),5.0
Best predicted user,2,Daredevil (Fantasy),4.0,Along Came Polly (Comedy),5.0,Taking Lives (Action),5.0
Worst predicted user,0,Robin and Marian (Action),1.0,Sixteen Candles (Comedy),5.0,Love Actually (Comedy),3.682
Worst predicted user,1,Casualties of War (Drama),1.0,All of Me (Comedy),5.0,28 Days Later (Horror),3.682
Worst predicted user,2,Bowling for Columbine (Documentary),1.0,Moonstruck (Comedy),5.0,Presumed Innocent (Thriller),3.682


### surprise-svd

Unnamed: 0_level_0,Unnamed: 1_level_0,Worst rated,Worst rated,Top rated,Top rated,Recommended,Recommended
Unnamed: 0_level_1,Unnamed: 1_level_1,movie,rating,movie,rating,movie,rating
Best predicted user,0,Ghost (Fantasy),5.0,Swimming Pool (Crime),5.0,American Beauty (Drama),5.0
Best predicted user,1,Thin Line Between Lo... (Comedy),5.0,Halloween II (Horror),5.0,The Family Man (Comedy),5.0
Best predicted user,2,Daredevil (Fantasy),4.0,Along Came Polly (Comedy),5.0,Taking Lives (Action),5.0
Worst predicted user,0,What Women Want (Comedy),1.0,Elephant (Crime),5.0,The Third Man (Thriller),4.762
Worst predicted user,1,The Wedding Planner (Comedy),1.0,Outfoxed: Rupert Mur... (Documentary),5.0,Roger & Me (Documentary),4.488
Worst predicted user,2,The People vs. Larry... (Drama),1.0,Uncovered: The Whole... (Documentary),5.0,Batman Begins (Action),4.42


In [8]:
# with open(data_folder /  "info.json", "w") as f:
#     json.dump(info, f)