<a href="https://colab.research.google.com/github/yckamra/MovieLens25m-Recommender/blob/main/User_Dictionary_Creation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
print("Installing project dependencies...")
!pip install numpy pandas scipy scikit-learn # Basic libraries we want
!pip install implicit # Installs implicit library for matrix factorization
#!pip install torch
print("Dependencies installed successfully.")

Installing project dependencies...
Collecting implicit
  Downloading implicit-0.7.2-cp311-cp311-manylinux2014_x86_64.whl.metadata (6.1 kB)
Downloading implicit-0.7.2-cp311-cp311-manylinux2014_x86_64.whl (8.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m90.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: implicit
Successfully installed implicit-0.7.2
Dependencies installed successfully.


In [2]:
from google.colab import auth
from google.cloud import storage
import zipfile
import os
import pickle
import pandas as pd
import numpy as np
import implicit
import json

In [3]:
auth.authenticate_user()

!gcloud config set project movielens-recommender-451017

!gsutil cp gs://movielens-data/data.zip /content/

Updated property [core/project].
Copying gs://movielens-data/data.zip...
==> NOTE: You are downloading one or more large file(s), which would
run significantly faster if you enabled sliced object downloads. This
feature is enabled by default but requires that compiled crcmod be
installed (see "gsutil help crcmod").

| [1 files][262.6 MiB/262.6 MiB]                                                
Operation completed over 1 objects/262.6 MiB.                                    


TMDb dataset: https://www.kaggle.com/datasets/tmdb/tmdb-movie-metadata?select=tmdb_5000_movies.csv

Big TMDB dataset: https://www.kaggle.com/datasets/asaniczka/tmdb-movies-dataset-2023-930k-movies/data

MovieLens dataset: https://www.kaggle.com/datasets/garymk/movielens-25m-dataset

In [4]:
!ls -lh /content/

total 263M
-rw-r--r-- 1 root root 263M Jun 17 15:48 data.zip
drwxr-xr-x 1 root root 4.0K Jun 13 13:36 sample_data


In [5]:
zip_path = "/content/data.zip" # File name
extract_to = "/content/data/"  # Where to extract files

# Create directory if it doesn't exist
os.makedirs(extract_to, exist_ok=True)

# Unzip the file
with zipfile.ZipFile(zip_path, "r") as zip_ref:
    zip_ref.extractall(extract_to)

print("Extraction complete! Files are in:", extract_to)
!ls -lh /content/data/

Extraction complete! Files are in: /content/data/
total 8.0K
drwxr-xr-x 4 root root 4.0K Jun 17 15:48 data
drwxr-xr-x 3 root root 4.0K Jun 17 15:48 __MACOSX


In [6]:
!ls -lh /content/data/data/ml-25m

total 1.1G
-rw-r--r-- 1 root root 416M Jun 17 15:48 genome-scores.csv
-rw-r--r-- 1 root root  18K Jun 17 15:48 genome-tags.csv
-rw-r--r-- 1 root root 1.4M Jun 17 15:48 links.csv
-rw-r--r-- 1 root root 2.9M Jun 17 15:48 movies.csv
-rw-r--r-- 1 root root 647M Jun 17 15:48 ratings.csv
-rw-r--r-- 1 root root  11K Jun 17 15:48 README.txt
-rw-r--r-- 1 root root  38M Jun 17 15:48 tags.csv


In [7]:
genome_scores_CSV = "/content/data/data/ml-25m/genome-scores.csv"
genome_tags_CSV = "/content/data/data/ml-25m/genome-tags.csv"
links_CSV = "/content/data/data/ml-25m/links.csv"
movies_CSV = "/content/data/data/ml-25m/movies.csv"
ratings_CSV = "/content/data/data/ml-25m/ratings.csv"
tags_CSV = "/content/data/data/ml-25m/tags.csv"

In [16]:
# Load ratings data
ratings_path = ratings_CSV
ratings_df = pd.read_csv(ratings_path)

print(ratings_df)

          userId  movieId  rating   timestamp
0              1      296     5.0  1147880044
1              1      306     3.5  1147868817
2              1      307     5.0  1147868828
3              1      665     5.0  1147878820
4              1      899     3.5  1147868510
...          ...      ...     ...         ...
25000090  162541    50872     4.5  1240953372
25000091  162541    55768     2.5  1240951998
25000092  162541    56176     2.0  1240950697
25000093  162541    58559     4.0  1240953434
25000094  162541    63876     5.0  1240952515

[25000095 rows x 4 columns]


In [19]:
class User:
  # ---Items in self---
  # userId : ID in MovieLens 25m dataset
  # ratings_df : pandas dataframe containing the users rows within the MovieLens 25m ratings.csv

  def __init__(self, userID=None, ratings_df=None):
    self.userID = userID
    self.ratings_df = ratings_df if ratings_df is not None else pd.DataFrame()

  def get_userID(self):
    return self.userID

  def set_userID(self, ID : int):
    assert isinstance(ID, int), "ID must be an integer"
    self.userID = int(ID)

  def add_row_to_ratings_df(self, row):

    if row["movieId"] in self.ratings_df["movieId"].values:
      return False
    else:
      self.ratings_df = pd.concat([self.ratings_df, pd.DataFrame([row])], ignore_index=True)

    return

def create_all_users_dictionary(ratings_df):
  allUsers = {}
  grouped = ratings_df.groupby("userId")

  for userId, group in grouped:
    user = User(userID=userId, ratings_df=group.copy())  # Store the group slice directly
    allUsers[userId] = user

  return allUsers

allUsers = create_all_users_dictionary(ratings_df)

In [23]:
print(allUsers[1].ratings_df.head())

   userId  movieId  rating   timestamp
0       1      296     5.0  1147880044
1       1      306     3.5  1147868817
2       1      307     5.0  1147868828
3       1      665     5.0  1147878820
4       1      899     3.5  1147868510


In [25]:
# Saving dictionary to .plk pickle format
file_path = 'allUsers_data.pkl'
with open(file_path, 'wb') as f: # 'wb' means write in binary mode
    pickle.dump(allUsers, f)
print(f"User dictionary saved to {file_path}")

# Loading pickle file
with open(file_path, 'rb') as f: # 'rb' means read in binary mode
    allUsers = pickle.load(f)
print(f"\nUser dictionary loaded from {file_path}")

# Verify loaded data
print(f"Loaded movie '1' ratings_df: {allUsers[1].ratings_df.head()}")

User dictionary saved to allUsers_data.pkl

User dictionary loaded from allUsers_data.pkl
Loaded movie '1' title:    userId  movieId  rating   timestamp
0       1      296     5.0  1147880044
1       1      306     3.5  1147868817
2       1      307     5.0  1147868828
3       1      665     5.0  1147878820
4       1      899     3.5  1147868510


In [26]:
from google.cloud import storage
def save_dict_to_gcs(dictionary_to_save: dict, bucket_name: str, destination_blob_name: str):
    try:
        auth.authenticate_user()
        print("Authenticated to Google Cloud.")
    except Exception as e:
        print(f"Could not authenticate user (may be running outside Colab or already authenticated): {e}")

    client = storage.Client()
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    # Pickle the dictionary into a bytes object
    pickled_data = pickle.dumps(dictionary_to_save)

    # Upload the bytes object to GCS
    # 'upload_from_string' will overwrite if the blob exists, or create it if new.
    blob.upload_from_string(pickled_data, content_type='application/octet-stream')

    print(f"Dictionary successfully saved to gs://{bucket_name}/{destination_blob_name}")

In [27]:
bucket_name = "movielens-data"
blob_name = "allUsers_data.pkl"
save_dict_to_gcs(allUsers, bucket_name, blob_name)

Authenticated to Google Cloud.
Dictionary successfully saved to gs://movielens-data/allUsers_data.pkl
