In [30]:
import pandas as pd
import numpy as np
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
from zipfile import ZipFile

In [31]:
# Extract targets.csv from the archive into the working directory, then load it.
with ZipFile("targets.zip", "r") as zip_targets:
    zip_targets.extract("targets.csv")

# (UserId, ItemId) pairs; later cells rank each user's listed items.
targets_df = pd.read_csv("targets.csv")
targets_df

Unnamed: 0,UserId,ItemId
0,0006246bee,01d2404d4c
1,0006246bee,03d43fdf92
2,0006246bee,0808a9666b
3,0006246bee,0a5d7dd6f6
4,0006246bee,0bab4a8104
...,...,...
616195,fffffe98d0,f6e4113a95
616196,fffffe98d0,f8cc22edf7
616197,fffffe98d0,fa71aa74e9
616198,fffffe98d0,fca8263961


In [32]:
# Extract ratings.jsonl from the archive into the working directory.
with ZipFile("ratings.zip", "r") as zip_ratings:
    zip_ratings.extract("ratings.jsonl")

# JSON Lines: every line of the file is parsed as one standalone JSON document.
with open("ratings.jsonl", "r") as f:
    ratings = [json.loads(line) for line in f]

# One row per rating record.
ratings_df = pd.DataFrame(ratings)
ratings_df

Unnamed: 0,UserId,ItemId,Timestamp,Rating
0,c4ca4238a0,91766eac45,1381010450,8
1,c81e728d9d,5c739554f7,1376756798,9
2,c81e728d9d,48f6d7ce7c,1376746107,8
3,c81e728d9d,e9318d627a,1371310689,1
4,a87ff679a2,17e6357973,1391210879,8
...,...,...,...,...
659715,3d7e93cbd0,450f41b5d3,1487376619,7
659716,3d7e93cbd0,80d1dae630,1497048389,7
659717,7804b284a3,0759d2567b,1419741829,4
659718,6648728db7,28fb7af42b,1370221047,1


In [33]:
# Count the distinct users and items that appear in the ratings.
n_users = len(ratings_df["UserId"].unique())
n_items = len(ratings_df["ItemId"].unique())
print(f"Number of users: {n_users}")
print(f"Number of items: {n_items}")

Number of users: 51671
Number of items: 29674


In [34]:
def _is_text_column(series):
    """Return True when the column holds text (object or string dtype)."""
    return pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(
        series
    )


def make_dataframe(contents):
    """
    Create a cleaned item-metadata DataFrame from a list of dictionaries.

    The cleaning steps are: drop unused columns, turn "N/A"/"None" into NaN,
    convert the numeric fields, fill missing values, normalise the "Rated"
    labels and country names, and strip punctuation from every text column.

    Args:
        contents (list): A list of dictionaries, one per item.

    Returns:
        pd.DataFrame: A DataFrame containing the cleaned data from the
            dictionaries, or None when `contents` is None.
    """
    if contents is None:
        return None

    contents_df = pd.DataFrame(contents)

    # Columns to drop (a column that is absent from the input is ignored)
    drop = [
        "Production",
        "Website",
        "Response",
        "totalSeasons",
        "Season",
        "Episode",
        "seriesID",
        "Poster",
        "Released",
        "Ratings",
        "Awards",
        "DVD",
        "BoxOffice",
    ]
    contents_df = contents_df.drop(columns=drop, errors="ignore")

    # Replacing incomplete values to NaN
    contents_df = contents_df.replace("N/A", np.nan).replace("None", np.nan)

    # Removing characters to convert to numeric values
    contents_df = contents_df.rename(columns={"Runtime": "Runtime (min)"})
    replace = {
        "Year": "-",
        "imdbVotes": ",",
        "imdbRating": "",
        "Runtime (min)": " min",
    }
    for key, value in replace.items():
        # `.str` is only available on text columns; a column that is already
        # numeric (or entirely NaN) has nothing to strip.
        if _is_text_column(contents_df[key]):
            contents_df[key] = contents_df[key].str.replace(value, "", regex=False)

    # Converting to numeric (anything unparsable becomes NaN)
    numeric_columns = [
        "Year",
        "imdbVotes",
        "imdbRating",
        "Metascore",
        "Runtime (min)",
    ]
    for col in numeric_columns:
        contents_df[col] = pd.to_numeric(contents_df[col], errors="coerce")

    # Filling NaN values
    fill_values = {
        "Rated": "Not_Rated",
        "Metascore": 0,
        "Runtime (min)": 0,
        "imdbRating": 0,
        "imdbVotes": 0,
        "Year": 0,
    }
    contents_df = contents_df.fillna(fill_values)

    contents_df["Rated"] = (
        contents_df["Rated"]
        .str.upper()
        .str.replace("NOT RATED", "NOT_RATED", regex=False)
    )

    int_columns = [
        "Year",
        "imdbVotes",
        "Metascore",
        "Runtime (min)",
    ]
    for col in int_columns:
        contents_df[col] = contents_df[col].astype(int)

    # Converting country names to a single name.
    # This must run BEFORE punctuation is stripped: once the apostrophe is
    # removed, "People's Republic of China" no longer matches its key.
    countries = {
        "United States": "USA",
        "United Kingdom": "UK",
        "United Arab Emirates": "UAE",
        "Republic of Ireland": "Ireland",
        "South Korea": "Korea",
        "People's Republic of China": "China",
    }
    if _is_text_column(contents_df["Country"]):
        for key, value in countries.items():
            contents_df["Country"] = contents_df["Country"].str.replace(
                key, value, regex=False
            )

    # Removing non-alphanumeric characters in a single pass per column.
    # astype(str) turns any remaining NaN into the literal string "nan".
    strip_table = str.maketrans("", "", ",;:-()[]{}'\"")
    for col in contents_df.columns:
        if _is_text_column(contents_df[col]):
            contents_df[col] = contents_df[col].astype(str).str.translate(strip_table)

    return contents_df

In [35]:
# Extract content.jsonl from the archive into the working directory.
with ZipFile("content.zip", "r") as zip_content:
    zip_content.extract("content.jsonl")

# JSON Lines: every line of the file is parsed as one standalone JSON document.
with open("content.jsonl", "r") as f:
    contents = [json.loads(line) for line in f]

# Clean the raw item metadata (see make_dataframe for the individual steps).
contents_df = make_dataframe(contents)
contents_df

Unnamed: 0,ItemId,Title,Year,Rated,Runtime (min),Genre,Director,Writer,Actors,Plot,Language,Country,Metascore,imdbRating,imdbVotes,Type
0,c9f0f895fb,Edison Kinetoscopic Record of a Sneeze,1894,NOT_RATED,1,Documentary Short,William K.L. Dickson,,Fred Ott,A man Edisons assistant takes a pinch of snuff...,,USA,0,5.5,1980,movie
1,d3d9446802,Leaving the Factory,1895,NOT_RATED,1,Documentary Short,Louis Lumière,,,A man opens the big gates to the Lumière facto...,,France,0,6.9,6633,movie
2,c20ad4d76f,The Arrival of a Train,1896,NOT_RATED,1,Documentary Short,Auguste Lumière Louis Lumière,,Madeleine Koehler Marcel Koehler Mrs. Auguste ...,A group of people are standing in a straight l...,,France,0,7.5,11407,movie
3,8e296a067a,The Oxford and Cambridge University Boat Race,1895,NOT_RATED,0,Short News Sport,Birt Acres,,,Although the content of this film is primitive...,,UK,0,4.2,39,movie
4,54229abfcf,The House of the Devil,1896,NOT_RATED,3,Short Horror,Georges Méliès,Georges Méliès,Jehanne dAlcy JulesEugène Legris Georges Méliès,A bat flies into an ancient castle and transfo...,,France,0,6.7,3268,movie
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38007,6c0ffc79d0,Yara,2021,TV14,96,Crime Drama Thriller,Marco Tullio Giordana,Graziano Diana Giacomo Martelli,Isabella Ragonese Alessio Boni Thomas Trabacchi,A determined prosecutor becomes consumed with ...,Italian,Italy,0,6.2,3202,movie
38008,e02f371f8c,Lords of Scam,2021,NOT_RATED,105,Documentary Crime,Guillaume Nicloux,Olivier Bouchara,,This documentary traces the rise and crash of ...,French,France,0,6.3,418,movie
38009,8c2a2a22b8,Cash,2021,NOT_RATED,118,Comedy Drama,Rishab Seth,Vishesh Bhatt Rishab Seth Aarsh Vora,Amol Parashar Smiriti Kalra Gulshan Grover,The government announces demonetization. The s...,Hindi,India,0,7.2,1779,movie
38010,ae74ba6bb7,Sompoy,2021,NOT_RATED,120,Comedy Romance,Anawat Phromchae Aroonakorn Pick,Anawat Phromchae Aroonakorn Pick,Pijakkana Wongsarattanasin Tanapol Jarujittran...,A love triangle story of a young woman named S...,Thai,Thailand,0,0.0,5,movie


In [36]:
def get_features(df, ItemId):
    """
    Get the text features of an item from a DataFrame.

    Only object-typed columns are treated as features; the "ItemId" column
    itself is never included, whatever its dtype. When several rows share the
    same ItemId, the first matching row is used.

    Args:
        df (pd.DataFrame): The DataFrame containing the item features.
        ItemId (str): The ID of the item.

    Returns:
        dict: A dictionary mapping column name to the item's value, with
            missing values (NaN or the literal string "nan") replaced by "".
            None when no row has the given ItemId.
    """
    item = df[df["ItemId"] == ItemId]
    if item.empty:
        return None

    features = {}
    for col in df.columns:
        # Excluding "ItemId" by name (instead of list.remove) avoids a
        # ValueError when the id column is not object-typed.
        if col == "ItemId" or df[col].dtype != "object":
            continue
        value = item[col].values[0]
        # Convert NaN values to empty strings
        features[col] = "" if (value == "nan" or pd.isna(value)) else value
    return features


def features_to_str(features):
    """
    Flatten a feature dictionary into one lowercase, space-separated string.

    Args:
        features (dict): A dictionary of features; only the values are used.

    Returns:
        str: The lowercased values joined by single spaces, with any run of
            whitespace collapsed. An empty string when `features` is None.
    """
    if features is None:
        return ""

    lowered = " ".join(str(value).lower() for value in features.values())
    # split()/join collapses repeated whitespace and trims both ends.
    return " ".join(lowered.split())

In [37]:
# Sanity check: show the feature dictionary and flattened text for one item.
features = get_features(contents_df, "7adfbaeadf")
print(features)
print(features_to_str(features))

{'Title': 'The Making of Rocky vs. Drago', 'Rated': 'NOT_RATED', 'Genre': 'Documentary', 'Director': 'John Herzfeld', 'Writer': '', 'Actors': 'Brigitte Nielsen Talia Shire Sylvester Stallone', 'Plot': '', 'Language': 'English', 'Country': 'USA', 'Type': 'movie'}
the making of rocky vs. drago not_rated documentary john herzfeld brigitte nielsen talia shire sylvester stallone english usa movie


In [38]:
def get_tfidf(df):
    """
    Calculates the Term Frequency-Inverse Document Frequency for the features of each item in the dataframe.

    Each row becomes one document built from its object-typed columns
    (excluding "ItemId"). Missing values (NaN or the literal string "nan")
    contribute no text. Row i of the result corresponds to row i of `df`.

    Args:
        df (pd.DataFrame): Dataframe containing item features.

    Returns:
        scipy.sparse.csr_matrix: TF-IDF matrix.
        list: Feature names.
    """
    vectorizer = TfidfVectorizer()

    # Resolve the text columns once instead of once per row.
    text_columns = [
        col for col in df.columns if col != "ItemId" and df[col].dtype == "object"
    ]

    # Walk the rows directly: looking every item up again by ItemId would scan
    # the whole frame per row and make this step quadratic.
    item_features_text = []
    for record in df[text_columns].to_dict("records"):
        features = {
            key: "" if (value == "nan" or pd.isna(value)) else value
            for key, value in record.items()
        }
        item_features_text.append(features_to_str(features))

    tfidf_matrix = vectorizer.fit_transform(item_features_text)
    feature_names = vectorizer.get_feature_names_out()

    return tfidf_matrix, feature_names

In [39]:
# Build the item-by-term TF-IDF matrix from the cleaned item metadata.
tfidf_matrix, feature_names = get_tfidf(contents_df)

# Print the shape of the tfidf matrix and the number of feature names
print("Shape of TF-IDF Matrix:", tfidf_matrix.shape)
print("Number of Feature Names:", len(feature_names))

Shape of TF-IDF Matrix: (38012, 149493)
Number of Feature Names: 149493


In [40]:
# Positional lookups: item id -> row of contents_df, vocabulary term -> position
# in feature_names, user id -> order of first appearance in ratings_df.
items_idx = {item_id: pos for pos, item_id in enumerate(contents_df["ItemId"])}
features_idx = {term: pos for pos, term in enumerate(feature_names)}
users_idx = {
    user_id: pos for pos, user_id in enumerate(ratings_df["UserId"].unique())
}

In [41]:
def get_item_vector(item_id, items_idx=items_idx, tfidf_matrix=tfidf_matrix):
    """
    Look up the dense TF-IDF vector of a single item.

    Note: the defaults are bound to the notebook-level `items_idx` and
    `tfidf_matrix` at definition time. A later cell of this notebook defines
    `get_item_vector` again without defaults.

    Args:
        item_id (str): The ID of the item.
        items_idx (dict): A dictionary mapping item IDs to their index in the tfidf matrix.
        tfidf_matrix (scipy.sparse.csr_matrix): The TF-IDF matrix.

    Returns:
        numpy.ndarray: The 1-D vector of the item, or None when the item ID
            is not present in `items_idx`.
    """
    if item_id in items_idx:
        # A single sparse row densifies to shape (1, n_features); take row 0.
        return tfidf_matrix[items_idx[item_id]].toarray()[0]
    return None


def get_user_vector(
    user_id, ratings_df, items_idx=items_idx, tfidf_matrix=tfidf_matrix
):
    """
    Get the vector representation of a user.

    The user vector is the sum of the TF-IDF vectors of the items the user
    rated, each weighted by its rating, divided by the user's number of
    ratings. Rated items that are missing from `items_idx` contribute nothing
    to the sum but still count in the divisor.

    Args:
        user_id (str): The ID of the user.
        ratings_df (pd.DataFrame): The ratings dataframe.
        items_idx (dict): A dictionary mapping item IDs to their index in the tfidf matrix.
        tfidf_matrix (scipy.sparse.csr_matrix): The TF-IDF matrix.

    Returns:
        numpy.ndarray: The vector representation of the user, or None when
            the user has no ratings.
    """
    user_ratings = ratings_df[ratings_df["UserId"] == user_id]
    if user_ratings.empty:
        return None

    user_vector = np.zeros(tfidf_matrix.shape[1])
    for item_id, rating in zip(user_ratings["ItemId"], user_ratings["Rating"]):
        # Use the mapping and matrix passed to THIS call; delegating to
        # get_item_vector(item_id) silently fell back to its own defaults.
        if item_id not in items_idx:
            continue
        user_vector += tfidf_matrix[items_idx[item_id]].toarray()[0] * rating

    user_vector /= len(user_ratings)
    return user_vector

In [42]:
# Sanity check: an item vector has one entry per TF-IDF feature.
print(get_item_vector("7adfbaeadf").shape)

(149493,)


In [43]:
# Sanity check: a user vector has the same length as an item vector.
print(get_user_vector("7804b284a3", ratings_df).shape)

(149493,)


In [44]:
def cos_sim(v1, v2):
    """
    Calculate the cosine similarity between two vectors.

    Args:
        v1 (numpy.ndarray): The first vector.
        v2 (numpy.ndarray): The second vector.

    Returns:
        float: The cosine similarity between the two vectors. When either
            vector has zero norm the similarity is undefined and 0.0 is
            returned, so callers never have to sort or compare NaN scores.
    """
    denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denominator == 0:
        return 0.0
    return np.dot(v1, v2) / denominator

In [45]:
def get_utility_matrix(ratings_df, contents_df):
    """
    Build the sparse user-by-item rating matrix.

    Rows follow the order in which users first appear in `ratings_df`;
    columns follow the row order of `contents_df`.

    Args:
        ratings_df (pd.DataFrame): The ratings dataframe.
        contents_df (pd.DataFrame): The contents dataframe.

    Returns:
        scipy.sparse.csr_matrix: The utility matrix, stored as int8.
    """
    # Position lookups for both axes of the matrix
    distinct_users = ratings_df["UserId"].unique()
    user_map = dict(zip(distinct_users, range(len(distinct_users))))
    item_map = dict(zip(contents_df["ItemId"], range(len(contents_df))))

    # Coordinates of every rating
    row_positions = [user_map[user] for user in ratings_df["UserId"]]
    col_positions = [item_map[item] for item in ratings_df["ItemId"]]

    return sparse.csr_matrix(
        (ratings_df["Rating"], (row_positions, col_positions)),
        shape=(len(user_map), len(item_map)),
        dtype=np.int8,
    )


def n_ratings(user_index, utility_matrix):
    """
    Count how many items a user has rated.

    Args:
        user_index (int): The index of the user.
        utility_matrix (scipy.sparse.csr_matrix): The utility matrix.

    Returns:
        int: The number of non-zero entries in the user's row.
    """
    # nonzero() yields (row indices, column indices); one column per rating.
    _, rated_columns = utility_matrix[user_index, :].nonzero()
    return len(rated_columns)

In [46]:
# Sparse user-by-item matrix of ratings (users as rows, items as columns).
utility_matrix = get_utility_matrix(ratings_df, contents_df)
utility_matrix

<51671x38012 sparse matrix of type '<class 'numpy.int8'>'
	with 659720 stored elements in Compressed Sparse Row format>

In [47]:
# Item id -> row position in contents_df.
items_idx = {item_id: pos for pos, item_id in enumerate(contents_df["ItemId"])}

In [48]:
def get_item_vector(item_id, items_idx, tfidf_matrix):
    """
    Look up the dense TF-IDF vector of a single item.

    Args:
        item_id (str): The ID of the item.
        items_idx (dict): A dictionary mapping item IDs to their index in the tfidf matrix.
        tfidf_matrix (scipy.sparse.csr_matrix): The TF-IDF matrix.

    Returns:
        numpy.ndarray: The 1-D vector of the item, or None when the item ID
            is not present in `items_idx`.
    """
    if item_id in items_idx:
        # A single sparse row densifies to shape (1, n_features); take row 0.
        return tfidf_matrix[items_idx[item_id]].toarray()[0]
    return None


# Sanity check: the redefined helper takes the mapping and matrix explicitly.
print(get_item_vector("7adfbaeadf", items_idx, tfidf_matrix).shape)

(149493,)


In [49]:
def get_user_vector(
    user_id,
    ratings_df,
    utility_matrix=utility_matrix,
    tfidf_matrix=tfidf_matrix,
    user_map=None,
):
    """
    Get the vector representation of a user.

    The vector is the user's row of the utility matrix multiplied by the
    TF-IDF matrix, i.e. the rating-weighted sum of the TF-IDF vectors of the
    items the user rated.

    Args:
        user_id (str): The ID of the user.
        ratings_df (pd.DataFrame): DataFrame containing user-item ratings.
        utility_matrix (scipy.sparse.csr_matrix): The utility matrix.
        tfidf_matrix (scipy.sparse.csr_matrix): The TF-IDF matrix.
        user_map (dict, optional): Mapping from user ID to row index of
            `utility_matrix`. When omitted it is rebuilt from `ratings_df`
            (users in order of first appearance), which costs a full pass
            over the ratings; pass a prebuilt mapping when calling in a loop.

    Returns:
        numpy.ndarray: The vector representation of the user, with shape
            (1, n_features).

    Raises:
        KeyError: If the user ID is not present in the mapping.
    """
    if user_map is None:
        user_map = {user: i for i, user in enumerate(ratings_df["UserId"].unique())}
    if user_id not in user_map:
        raise KeyError(f"User {user_id!r} has no ratings in ratings_df")

    u = utility_matrix[user_map[user_id], :].tocsr()
    # `@` is the sparse-aware matrix product; np.dot does not understand
    # scipy sparse containers.
    user_vector = (u @ tfidf_matrix).toarray()
    return user_vector


def get_item_ranking(
    user_id, targets_df, user_vector, tfidf_matrix, top=100, item_map=None
):
    """
    Get the ranking of items for a user.

    Every target item of the user is scored by the cosine similarity between
    the user vector and the item's TF-IDF row. Items that are missing from
    `item_map`, and items or users whose vector has zero norm, score 0.0.
    Ties keep the order in which the items appear in `targets_df`.

    Args:
        user_id (str): The ID of the user.
        targets_df (pd.DataFrame): DataFrame containing user-item targets.
        user_vector (numpy.ndarray): The vector representation of the user.
        tfidf_matrix (scipy.sparse.csr_matrix): The TF-IDF matrix.
        top (int): The number of top items to consider.
        item_map (dict, optional): Mapping from item ID to row index of
            `tfidf_matrix`. When omitted it is rebuilt from the notebook-level
            `contents_df` on every call; pass a prebuilt mapping when calling
            in a loop.

    Returns:
        dict: A dictionary of the top items for the user, mapping item ID to
            score, ordered from the highest score to the lowest.
    """
    if item_map is None:
        item_map = {item: i for i, item in enumerate(contents_df["ItemId"])}

    user_targets = targets_df.loc[targets_df["UserId"] == user_id, "ItemId"].tolist()

    # Every target starts at 0.0; only items with a known TF-IDF row are scored.
    r_ui = dict.fromkeys(user_targets, 0.0)
    known_items = [item_id for item_id in r_ui if item_id in item_map]

    user_vec = np.asarray(user_vector, dtype=float).ravel()
    user_norm = np.linalg.norm(user_vec)

    if known_items and user_norm > 0:
        # One sparse product scores all targets at once instead of building a
        # dense n_features-long vector per item.
        rows = tfidf_matrix[[item_map[item_id] for item_id in known_items]]
        dots = np.asarray(rows @ user_vec, dtype=float).ravel()
        item_norms = np.sqrt(
            np.asarray(rows.multiply(rows).sum(axis=1), dtype=float).ravel()
        )
        denominators = item_norms * user_norm
        # Rows with zero norm keep a similarity of 0.0 instead of NaN.
        similarities = np.divide(
            dots, denominators, out=np.zeros_like(dots), where=denominators > 0
        )
        for item_id, similarity in zip(known_items, similarities):
            r_ui[item_id] = float(similarity)

    item_ranking = sorted(r_ui.items(), key=lambda pair: pair[1], reverse=True)
    top_items = dict(item_ranking[:top])
    return top_items

In [50]:
# Sanity check: score one user against one item.
# Both vectors are flattened to 1-D before being passed to cos_sim.
user_vector = get_user_vector("fffffe98d0", ratings_df).flatten()
item_vector = get_item_vector("7adfbaeadf", items_idx, tfidf_matrix).flatten()
print(user_vector.shape)
print(item_vector.shape)
print(cos_sim(user_vector, item_vector))

(149493,)
(149493,)
0.03888967937161518


In [51]:
# Print iterations progress
def printProgressBar(
    iteration,
    total,
    prefix="",
    suffix="",
    decimals=1,
    length=100,
    fill="█",
    printEnd="\r",
):
    """
    Call in a loop to create terminal progress bar

    A non-positive `total` (nothing to process) is shown as a complete bar
    instead of raising ZeroDivisionError.

    Args:
        iteration (int): current iteration
        total (int): total iterations
        prefix (str): prefix string
        suffix (str): suffix string
        decimals (int): positive number of decimals in percent complete
        length (int): character length of bar
        fill (str): bar fill character
        printEnd (str): end character (e.g. "\r", "\r\n")
    """
    if total > 0:
        fraction = iteration / float(total)
        filledLength = int(length * iteration // total)
    else:
        fraction = 1.0
        filledLength = length
    # Keep the bar exactly `length` characters wide even when iteration
    # falls outside the range [0, total].
    filledLength = max(0, min(length, filledLength))

    percent = f"{100 * fraction:.{decimals}f}"
    bar = fill * filledLength + "-" * (length - filledLength)
    print(f"\r{prefix} |{bar}| {percent}% {suffix}", end=printEnd)
    # Print New Line on Complete
    if iteration == total:
        print()

In [58]:
# Rank the target items of every user that appears in targets_df.
user_rankings = {}  # user id -> {item id: cosine similarity}, best first
users_id = list(targets_df["UserId"].unique())
l = len(users_id)  # number of users, used as the progress-bar total
printProgressBar(0, l, prefix="Progress:", suffix="Complete", length=50)

for i, user_id in enumerate(users_id):
    # get_item_ranking keeps at most its default `top` items per user.
    user_vector = get_user_vector(user_id, ratings_df).flatten()
    user_rankings[user_id] = get_item_ranking(
        user_id, targets_df, user_vector, tfidf_matrix
    )
    printProgressBar(i + 1, l, prefix="Progress:", suffix="Complete", length=50)

Progress: |██████████████████████████████████████████████████| 100.0% Complete


In [60]:
# Flatten {user: {item: score}} into (user, item) pairs, keeping each user's
# ranking order.
user_item_pairs = [
    (user, item) for user, items in user_rankings.items() for item in items
]

# One row per recommended (user, item) pair
user_rankings_df = pd.DataFrame(user_item_pairs, columns=["UserId", "ItemId"])

user_rankings_df

Unnamed: 0,UserId,ItemId
0,0006246bee,dac5553444
1,0006246bee,c1ee6829f5
2,0006246bee,dcba99a1a6
3,0006246bee,9f7c438b9d
4,0006246bee,b804592040
...,...,...
616195,fffffe98d0,08e9114142
616196,fffffe98d0,7479705807
616197,fffffe98d0,4abac6ea8d
616198,fffffe98d0,476cda8d4e


In [61]:
# Write the ranked (UserId, ItemId) pairs to disk without the index column.
user_rankings_df.to_csv("user_rankings.csv", index=False)