In [3]:
import numpy as np
import pandas as pd

In [43]:
def map_data(data):
    if not isinstance(data, (np.ndarray, list)) or len(data) == 0:
        raise ValueError("Input to map_data must be a non-empty 1D array or list.")
    
    # Ensure data is a flattened numpy array
    data = np.array(data).flatten()
    
    uniq = list(set(data))
    id_dict = {old: new for new, old in enumerate(sorted(uniq))}
    mapped_data = np.array([id_dict[x] for x in data])
    return mapped_data, id_dict, len(uniq)


In [25]:
# dtypes = {
#     'u_nodes': np.int64, 'v_nodes': np.int64,
#     'ratings': np.float32, 'timestamp': np.float64}

# use engine='python' to ignore warning about switching to python backend when using regexp for sep
data = pd.read_csv('/home/vinmike/Documents/GitHub/LLM4Rec-Dataloader/data/ml-1m/ratings.dat', sep='::', header=None,
                    names=['u_nodes', 'v_nodes', 'ratings', 'timestamp'], engine='python')
                    



In [27]:
# Find rows where 'v_nodes' is not numeric
invalid_rows = data[~data['v_nodes'].str.isnumeric()]
print(invalid_rows)


     u_nodes                                            v_nodes  ratings  \
375        5  17tion (with the help of singular value decomp...        1   

     timestamp  
375  978245763  


In [28]:
# Keep only rows where 'v_nodes' contains valid integers
data = data[data['v_nodes'].str.isnumeric()]

In [29]:

data['v_nodes'] = data['v_nodes'].astype(np.int64)
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000208 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype
---  ------     --------------    -----
 0   u_nodes    1000208 non-null  int64
 1   v_nodes    1000208 non-null  int64
 2   ratings    1000208 non-null  int64
 3   timestamp  1000208 non-null  int64
dtypes: int64(4)
memory usage: 38.2 MB


In [30]:
data_array = data.values.tolist()

data_array



[[1, 1193, 5, 978300760],
 [1, 661, 3, 978302109],
 [1, 914, 3, 978301968],
 [1, 3408, 4, 978300275],
 [1, 2355, 5, 978824291],
 [1, 1197, 3, 978302268],
 [1, 1287, 5, 978302039],
 [1, 2804, 5, 978300719],
 [1, 594, 4, 978302268],
 [1, 919, 4, 978301368],
 [1, 595, 5, 978824268],
 [1, 938, 4, 978301752],
 [1, 2398, 4, 978302281],
 [1, 2918, 4, 978302124],
 [1, 1035, 5, 978301753],
 [1, 2791, 4, 978302188],
 [1, 2687, 3, 978824268],
 [1, 2018, 4, 978301777],
 [1, 3105, 5, 978301713],
 [1, 2797, 4, 978302039],
 [1, 2321, 3, 978302205],
 [1, 720, 3, 978300760],
 [1, 1270, 5, 978300055],
 [1, 527, 5, 978824195],
 [1, 2340, 3, 978300103],
 [1, 48, 5, 978824351],
 [1, 1097, 4, 978301953],
 [1, 1721, 4, 978300055],
 [1, 1545, 4, 978824139],
 [1, 745, 3, 978824268],
 [1, 2294, 4, 978824291],
 [1, 3186, 4, 978300019],
 [1, 1566, 4, 978824330],
 [1, 588, 4, 978824268],
 [1, 1907, 4, 978824330],
 [1, 783, 4, 978824291],
 [1, 1836, 5, 978300172],
 [1, 1022, 5, 978300055],
 [1, 2762, 4, 978302091],

In [31]:

data_array = np.array(data_array)
data_array

array([[        1,      1193,         5, 978300760],
       [        1,       661,         3, 978302109],
       [        1,       914,         3, 978301968],
       ...,
       [     6040,       562,         5, 956704746],
       [     6040,      1096,         4, 956715648],
       [     6040,      1097,         4, 956715569]])

In [41]:


u_nodes_ratings = data_array[:, 0].astype(np.int64)
v_nodes_ratings = data_array[:, 1].astype(int)
ratings = data_array[:, 2].astype(float)

u_nodes_ratings


array([   1,    1,    1, ..., 6040, 6040, 6040])

In [42]:
u_nodes_ratings

array([   1,    1,    1, ..., 6040, 6040, 6040])

In [44]:

u_nodes_ratings, u_dict, num_users = map_data(u_nodes_ratings)
v_nodes_ratings, v_dict, num_items = map_data(v_nodes_ratings)


In [51]:
num_items

3706

In [52]:
import dgl
import torch

# Example of creating a DGLGraph with node and edge features
g = dgl.graph(([0, 1, 2], [1, 2, 3]))  # A graph with edges 0->1, 1->2, 2->3
g.ndata['feat'] = torch.randn(4, 10)   # Node features (4 nodes, 10-dimensional features)
g.edata['feat'] = torch.randn(3, 5)    # Edge features (3 edges, 5-dimensional features)
label = torch.tensor(1)                # Graph-level label


ModuleNotFoundError: No module named 'dgl'

In [53]:
import os
import pickle
import re
import torch
import pandas as pd
from builder import PandasGraphBuilder
from data_utils import *
import dgl

def load_data(directory):
    """
    Load the MovieLens dataset, process it, and return the following:
    - users: A pandas DataFrame containing user information.
    - movies: A pandas DataFrame containing movie information.
    - ratings: A pandas DataFrame containing user-movie interactions.
    """
    # Load users data
    users = []
    with open(os.path.join(directory, "users.dat"), encoding="latin1") as f:
        for l in f:
            id_, gender, age, occupation, zip_ = l.strip().split("::")
            users.append(
                {
                    "user_id": int(id_),
                    "gender": gender,
                    "age": age,
                    "occupation": occupation,
                    "zip": zip_,
                }
            )
    users = pd.DataFrame(users).astype("category")

    # Load movies data
    movies = []
    with open(os.path.join(directory, "movies.dat"), encoding="latin1") as f:
        for l in f:
            id_, title, genres = l.strip().split("::")
            genres_set = set(genres.split("|"))

            # Extract year
            assert re.match(r".*\([0-9]{4}\)$", title)
            year = title[-5:-1]
            title = title[:-6].strip()

            data = {"movie_id": int(id_), "title": title, "year": year}
            for g in genres_set:
                data[g] = True
            movies.append(data)
    movies = pd.DataFrame(movies).astype({"year": "category"})

    # Load ratings data
    ratings = []
    with open(os.path.join(directory, "ratings.dat"), encoding="latin1") as f:
        for l in f:
            user_id, movie_id, rating, timestamp = [
                int(_) for _ in l.split("::")
            ]
            ratings.append(
                {
                    "user_id": user_id,
                    "movie_id": movie_id,
                    "rating": rating,
                    "timestamp": timestamp,
                }
            )
    ratings = pd.DataFrame(ratings)

    return users, movies, ratings

def build_graph(users, movies, ratings):
    """
    Build a heterogeneous graph using DGL and the data loaded from MovieLens.
    """
    # Group the movie features into genres (a vector), year (a category), title (a string)
    genre_columns = movies.columns.drop(["movie_id", "title", "year"])
    movies[genre_columns] = movies[genre_columns].fillna(False).astype("bool")
    movies_categorical = movies.drop("title", axis=1)

    # Build graph
    graph_builder = PandasGraphBuilder()
    graph_builder.add_entities(users, "user_id", "user")
    graph_builder.add_entities(movies_categorical, "movie_id", "movie")
    graph_builder.add_binary_relations(
        ratings, "user_id", "movie_id", "watched"
    )
    graph_builder.add_binary_relations(
        ratings, "movie_id", "user_id", "watched-by"
    )

    g = graph_builder.build()

    # Assign features to nodes and edges
    for data_type in ["gender", "age", "occupation", "zip"]:
        g.nodes["user"].data[data_type] = torch.LongTensor(
            users[data_type].cat.codes.values
        )

    g.nodes["movie"].data["year"] = torch.LongTensor(
        movies["year"].cat.codes.values
    )
    g.nodes["movie"].data["genre"] = torch.FloatTensor(
        movies[genre_columns].values
    )

    for edge_type in ["watched", "watched-by"]:
        for data_type in ["rating", "timestamp"]:
            g.edges[edge_type].data[data_type] = torch.LongTensor(
                ratings[data_type].values
            )

    return g

def save_graph(g, out_directory):
    """
    Save the graph and other data into pickle files.
    """
    # Save the graph to a binary file
    dgl.save_graphs(os.path.join(out_directory, "graph.bin"), g)

    # Save the dataset
    dataset = {
        "graph": g,
        "user-type": "user",
        "item-type": "movie",
        "user-to-item-type": "watched",
        "item-to-user-type": "watched-by",
    }
    with open(os.path.join(out_directory, "data.pkl"), "wb") as f:
        pickle.dump(dataset, f)



ModuleNotFoundError: No module named 'torch'