<a href="https://colab.research.google.com/github/xular13/model_rs/blob/main/rec_sys_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q wordcloud  # For later visualization


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/547.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m542.7/547.9 kB[0m [31m16.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.9/547.9 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; later cells read their inputs from and
# write their artifacts to paths underneath this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd

# Folder on the mounted Drive that holds the input CSVs; adjust to your layout.
INPUT_DIR = '/content/drive/MyDrive'

# Column -> dtype for the ratings file. The keys double as the column
# selection, so only these three columns are parsed.
RATINGS_DTYPES = {'user_id': 'int32', 'anime_id': 'int32', 'score': 'float32'}

# Metadata columns kept from the anime details file.
ANIME_COLUMNS = ['id', 'title', 'synopsis', 'genres', 'mean', 'media_type', 'english_title']

# User/anime interactions.
ratings_df = pd.read_csv(
    f'{INPUT_DIR}/animelist_v2.csv',
    usecols=list(RATINGS_DTYPES),
    dtype=RATINGS_DTYPES,
)

# Per-title metadata, later used to turn ids back into titles/genres.
anime_df = pd.read_csv(
    f'{INPUT_DIR}/anime_details_v2.csv',
    usecols=ANIME_COLUMNS,
    dtype={'id': 'int32', 'mean': 'float32'},
)

# Quick check of which columns actually arrived.
for label, frame in (("Ratings columns:", ratings_df), ("Anime columns:", anime_df)):
    print(label, frame.columns.tolist())

Ratings columns: ['user_id', 'anime_id', 'score']
Anime columns: ['id', 'title', 'synopsis', 'mean', 'genres', 'media_type', 'english_title']


In [None]:
# Ratings of additional accounts. In this file `user_id` holds account names,
# so it is read as text and translated to a numeric id below.
user_r = pd.read_csv(f'{INPUT_DIR}/user_data.csv',
                     usecols=['user_id', 'anime_id', 'score'],
                     dtype={'user_id': 'string', 'anime_id': 'int32', 'score': 'float32'})

# Assign each extra account an id directly above the largest id already present
# in ratings_df so it cannot collide with an existing user.
mp = {
    'xular13': int(ratings_df['user_id'].max()) + 1
}
mapped_ids = user_r['user_id'].map(mp)

# Series.map produces a missing value for every name that is absent from `mp`.
# Fail loudly instead of letting rows without a user id reach the training data.
unmapped = user_r.loc[mapped_ids.isna(), 'user_id'].unique().tolist()
if unmapped:
    raise ValueError(f"No numeric id assigned for user(s): {unmapped}")

# Cast back to the dtype ratings_df uses so the concat below does not upcast
# the user_id column of the full ratings table.
user_r['user_id'] = mapped_ids.astype('int32')

# Append the extra ratings and report how many rows were added.
# NOTE: re-running this cell appends the same rows again under a new id.
bfr_shape = ratings_df.shape[0]
ratings_df = pd.concat([ratings_df, user_r], ignore_index=True)
aftr_shape = ratings_df.shape[0]
print(f'Before: {bfr_shape}, After: {aftr_shape}, Diff: {aftr_shape - bfr_shape}')

Before: 110685120, After: 110685201, Diff: 81


In [None]:
# Minimum number of ratings a user / an anime must have to be kept.
MIN_RATINGS_PER_USER = 40
MIN_RATINGS_PER_ANIME = 10

# Step 1: keep only users that reach the per-user threshold.
user_counts = ratings_df['user_id'].value_counts()
active_users = user_counts.index[user_counts >= MIN_RATINGS_PER_USER]
ratings_df = ratings_df.loc[ratings_df['user_id'].isin(active_users)]

# Step 2: keep only titles that reach the per-anime threshold. Counts are taken
# after step 1, and the per-user counts are not re-checked afterwards.
anime_counts = ratings_df['anime_id'].value_counts()
popular_anime = anime_counts.index[anime_counts >= MIN_RATINGS_PER_ANIME]
ratings_df = ratings_df.loc[ratings_df['anime_id'].isin(popular_anime)]

print("Filtered ratings shape:", ratings_df.shape)

Filtered ratings shape: (109982854, 3)


In [None]:
from sklearn.model_selection import train_test_split

# The split happens before any statistic is fitted, so nothing computed later
# can see the held-out rows.
TEST_FRACTION = 0.2
SPLIT_SEED = 42

split_kwargs = dict(
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
    stratify=ratings_df['user_id'],  # stratify on the user id column
)
train_df, test_df = train_test_split(ratings_df, **split_kwargs)

for split_label, split_frame in (("Train size:", train_df), ("Test size:", test_df)):
    print(split_label, len(split_frame))

Train size: 87986283
Test size: 21996571


In [None]:
# Display the extra account's ratings after `user_id` was remapped to its numeric id.
user_r

Unnamed: 0,user_id,anime_id,score
0,361908,32281,10.0
1,361908,41457,10.0
2,361908,47194,10.0
3,361908,48569,10.0
4,361908,49387,10.0
...,...,...,...
76,361908,52034,7.0
77,361908,52198,7.0
78,361908,38000,6.0
79,361908,40052,6.0


In [None]:
# Fit the scaling bounds on the TRAINING split only, so no statistic of the
# held-out ratings influences preprocessing.
train_min = train_df['score'].min()
train_max = train_df['score'].max()

# NOTE(review): if a score of 0 encodes "not scored" in the source data, those
# rows should be dropped before scaling rather than treated as the lowest
# rating - confirm against the dataset description.
score_range = train_max - train_min
if score_range == 0:
    # Every training score is identical; dividing would yield NaN/inf targets.
    raise ValueError("Cannot scale scores: all training scores are identical.")

# Scale ratings to [0, 1]. `assign` returns a new frame instead of writing a
# column into the objects returned by the split, which pandas may treat as
# slices of ratings_df (chained-assignment warning / ambiguous write).
# Held-out scores outside the training range would fall outside [0, 1].
train_df = train_df.assign(score_normalized=(train_df['score'] - train_min) / score_range)
test_df = test_df.assign(score_normalized=(test_df['score'] - train_min) / score_range)

print("Train min/max:", train_min, train_max)
print("Scaled train sample:", train_df['score_normalized'].head(3))

Train min/max: 0.0 10.0
Scaled train sample: 30270192    0.8
97973129    0.8
65198122    0.8
Name: score_normalized, dtype: float32


In [None]:
# Build dense, zero-based indices for the embedding tables from the TRAINING
# split only; the position of an id in `unique()` becomes its embedding row.
user_ids = train_df['user_id'].unique()
anime_ids = train_df['anime_id'].unique()

user2idx = {user: idx for idx, user in enumerate(user_ids)}
anime2idx = {anime: idx for idx, anime in enumerate(anime_ids)}

# Apply encoding. `assign` builds a new frame rather than inserting columns
# into a frame derived from another one.
train_df = train_df.assign(
    user=train_df['user_id'].map(user2idx),
    anime=train_df['anime_id'].map(anime2idx),
)

# Users/animes that never occur in training have no embedding row, so those
# test rows are dropped. The filter and the new columns are applied without
# writing into the filtered slice, which would raise pandas'
# SettingWithCopyWarning.
seen_in_training = test_df['user_id'].isin(user_ids) & test_df['anime_id'].isin(anime_ids)
test_df = test_df.loc[seen_in_training]
test_df = test_df.assign(
    user=test_df['user_id'].map(user2idx),
    anime=test_df['anime_id'].map(anime2idx),
)

print("Unique users:", len(user2idx))
print("Unique animes:", len(anime2idx))

Unique users: 292252
Unique animes: 18200


In [None]:
!pip install tensorflow==2.18.0  # Version known to work with Colab TPUs
!pip install tensorflow-tpu==2.18.0 --find-links=https://storage.googleapis.com/libtpu-tf-releases/index.html

Collecting tensorflow==2.18.0
  Downloading tensorflow-2.18.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting astunparse>=1.6.0 (from tensorflow==2.18.0)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow==2.18.0)
  Downloading flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting google-pasta>=0.1.1 (from tensorflow==2.18.0)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow==2.18.0)
  Downloading libclang-18.1.1-py2.py3-none-manylinux2010_x86_64.whl.metadata (5.2 kB)
Collecting tensorboard<2.19,>=2.18 (from tensorflow==2.18.0)
  Downloading tensorboard-2.18.0-py3-none-any.whl.metadata (1.6 kB)
Collecting ml-dtypes<0.5.0,>=0.4.0 (from tensorflow==2.18.0)
  Downloading ml_dtypes-0.4.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Collecting tensorflow-io-gcs-filesyste

In [None]:
import tensorflow as tf

# Connect to the TPU runtime and build a distribution strategy on top of it.
# Only ValueError is handled: on failure the message is printed and execution
# continues, leaving `resolver` and `strategy` undefined for the cells below.
try:
    # NOTE(review): tpu='local' presumably targets the TPU attached to this
    # VM - confirm against the TPUClusterResolver documentation.
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='local')
    print("TPU detected:", resolver.master())
    # Connect first, then initialize the TPU system, then create the strategy.
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.TPUStrategy(resolver)
    print("Replicas:", strategy.num_replicas_in_sync)
except ValueError as e:
    print("TPU initialization failed:", e)

 This a JAX bug; please report an issue at https://github.com/jax-ml/jax/issues
  _warn(f"cloud_tpu_init failed: {exc!r}\n This a JAX bug; please report "


TPU detected: 
Replicas: 1


In [None]:
# Sanity check: how many logical TPU devices TensorFlow reports.
print("Num TPUs:", len(tf.config.list_logical_devices('TPU')))

Num TPUs: 1


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Concatenate, Dense, Dropout, Flatten

# Distribution strategy built on the resolver created in the TPU setup cell.
tpu_strategy = tf.distribute.TPUStrategy(resolver)

# Width of each embedding vector.
EMBEDDING_DIM = 128
# (units, dropout rate) of each hidden stage of the MLP head, in order.
HIDDEN_STAGES = ((256, 0.3), (128, 0.2))

# Variables must be created inside the strategy scope to be placed by it.
with tpu_strategy.scope():
    # Each input carries one encoded id per example (shape (1,)).
    user_input = Input(shape=(1,), name='user_input')
    anime_input = Input(shape=(1,), name='anime_input')

    # One embedding row per encoded user / anime index.
    user_embedding = Embedding(
        input_dim=len(user2idx), output_dim=EMBEDDING_DIM, name='user_embedding'
    )(user_input)
    anime_embedding = Embedding(
        input_dim=len(anime2idx), output_dim=EMBEDDING_DIM, name='anime_embedding'
    )(anime_input)

    # Join both embeddings and flatten them into one feature vector per example.
    joined = Concatenate()([user_embedding, anime_embedding])
    merged = Flatten()(joined)

    # MLP head: Dense(relu) followed by Dropout for every configured stage.
    for units, drop_rate in HIDDEN_STAGES:
        merged = Dense(units, activation='relu')(merged)
        merged = Dropout(drop_rate)(merged)

    # Single linear unit: the model regresses the normalized score.
    output = Dense(1, activation='linear')(merged)

    model = tf.keras.Model(inputs=[user_input, anime_input], outputs=output)
    model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(0.001), metrics=['mae'])

model.summary()

In [None]:
# Convert DataFrames to TensorFlow datasets
def create_dataset(user_ids, anime_ids, ratings, batch_size=1024,
                   shuffle=True, shuffle_buffer=10000):
    """Build a batched, prefetched tf.data pipeline for the two-input model.

    Args:
        user_ids: Array of encoded user indices, fed to the "user_input" input.
        anime_ids: Array of encoded anime indices, fed to the "anime_input" input.
        ratings: Array of target values, aligned element-wise with the ids.
        batch_size: Number of examples per batch.
        shuffle: Whether to shuffle examples before batching. Keep it on for
            training data; evaluation data does not need it.
        shuffle_buffer: Size of the shuffle buffer. Only examples inside the
            buffer are mixed, so a small buffer gives a local shuffle only.

    Returns:
        A tf.data.Dataset yielding ({"user_input", "anime_input"}, ratings) batches.
    """
    dataset = tf.data.Dataset.from_tensor_slices((
        {"user_input": user_ids, "anime_input": anime_ids},
        ratings
    ))
    if shuffle:
        dataset = dataset.shuffle(shuffle_buffer)
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

# Use TPU-friendly batch size (multiples of 128)
batch_size = 4096 * tpu_strategy.num_replicas_in_sync  # Will auto-scale based on TPU cores

train_dataset = create_dataset(
    train_df['user'].values,
    train_df['anime'].values,
    train_df['score_normalized'].values,
    batch_size
)

# Evaluation metrics do not depend on example order, so skip the shuffle work.
test_dataset = create_dataset(
    test_df['user'].values,
    test_df['anime'].values,
    test_df['score_normalized'].values,
    batch_size,
    shuffle=False
)

In [None]:
# Train for up to 10 epochs, monitoring validation loss.
# - EarlyStopping halts after 3 epochs without improvement.
#   restore_best_weights=True rolls the model back to the best epoch; without
#   it the model keeps (and later saves) the weights of the last epoch, which
#   are worse than the best ones whenever val_loss has started to rise.
# - ReduceLROnPlateau multiplies the learning rate by 0.2 after 2 stagnant epochs.
# NOTE(review): test_dataset doubles as the validation set here, so the
# reported val metrics are also what model selection is tuned on - consider a
# separate validation split.
history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=10,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            patience=3,
            monitor='val_loss',
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2)
    ]
)

Epoch 1/10
[1m21482/21482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m256s[0m 12ms/step - loss: 0.0998 - mae: 0.2528 - val_loss: 0.0849 - val_mae: 0.2225 - learning_rate: 0.0010
Epoch 2/10
[1m21482/21482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m246s[0m 11ms/step - loss: 0.0840 - mae: 0.2192 - val_loss: 0.0811 - val_mae: 0.2144 - learning_rate: 0.0010
Epoch 3/10
[1m21482/21482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m251s[0m 12ms/step - loss: 0.0789 - mae: 0.2086 - val_loss: 0.0805 - val_mae: 0.2129 - learning_rate: 0.0010
Epoch 4/10
[1m21482/21482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m252s[0m 12ms/step - loss: 0.0745 - mae: 0.2000 - val_loss: 0.0808 - val_mae: 0.2101 - learning_rate: 0.0010
Epoch 5/10
[1m21482/21482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m250s[0m 12ms/step - loss: 0.0710 - mae: 0.1932 - val_loss: 0.0814 - val_mae: 0.2088 - learning_rate: 0.0010
Epoch 6/10
[1m21482/21482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m250s[0m 12

In [None]:
import os

# Define the save path: a `saved_model` folder under INPUT_DIR, the same
# folder the save cells below write into.
save_dir = f"{INPUT_DIR}/saved_model"

# Create the directory if it doesn't exist; exist_ok=True makes re-running
# this cell harmless.
os.makedirs(save_dir, exist_ok=True)

In [None]:
# Save the entire model (architecture + weights + optimizer state)
# as a single .keras file inside the saved_model folder under INPUT_DIR.
model.save(f"{INPUT_DIR}/saved_model/anime_rec_model_v2.keras")

In [None]:
import pickle

# Save mappings (user2idx, anime2idx): the raw-id -> embedding-index
# dictionaries, needed at inference time to address the right embedding rows.
# NOTE: pickle files can execute code when loaded; only load them back from a
# trusted location.
with open(f"{INPUT_DIR}/saved_model/user2idx_v2.pkl", "wb") as f:
    pickle.dump(user2idx, f)

with open(f"{INPUT_DIR}/saved_model/anime2idx_v2.pkl", "wb") as f:
    pickle.dump(anime2idx, f)

# Save min/max used for scaling, so predictions can be mapped back to the
# original score scale.
np.save(f"{INPUT_DIR}/saved_model/train_min_v2.npy", train_min)
np.save(f"{INPUT_DIR}/saved_model/train_max_v2.npy", train_max)

In [None]:
# Extract embeddings: the first weight array of each embedding layer is taken
# as its lookup table (one row per encoded index).
user_embeddings = model.get_layer("user_embedding").get_weights()[0]
anime_embeddings = model.get_layer("anime_embedding").get_weights()[0]

# Save as numpy arrays so similarity search can run without loading the model.
np.save(f"{INPUT_DIR}/saved_model/user_embeddings_v2.npy", user_embeddings)
np.save(f"{INPUT_DIR}/saved_model/anime_embeddings_v2.npy", anime_embeddings)

In [None]:
# Save the anime details table (for mapping IDs to titles/genres).
# NOTE(review): no cell visible here modifies anime_df after loading, so this
# appears to store the columns exactly as read from the CSV - confirm.
anime_df.to_parquet(f"{INPUT_DIR}/saved_model/anime_metadata.parquet")

In [None]:
import os
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics.pairwise import cosine_similarity

# Load the model, mappings, embeddings, scaling bounds and metadata
def load_artifacts(input_dir):
    """Read every saved inference artifact from ``<input_dir>/saved_model``.

    Args:
        input_dir: Folder that contains the ``saved_model`` directory.

    Returns:
        dict with the keys 'model', 'user2idx', 'anime2idx', 'user_embeddings',
        'anime_embeddings', 'train_min', 'train_max' and 'anime_df'.
    """
    # Trained Keras model.
    model = tf.keras.models.load_model(f"{input_dir}/saved_model/anime_rec_model_v2.keras")

    # Raw-id -> embedding-index dictionaries.
    # NOTE: pickle.load can execute code embedded in the file; only point this
    # at artifacts you produced yourself.
    with open(f"{input_dir}/saved_model/user2idx_v2.pkl", "rb") as user_file:
        user_mapping = pickle.load(user_file)
    with open(f"{input_dir}/saved_model/anime2idx_v2.pkl", "rb") as anime_file:
        anime_mapping = pickle.load(anime_file)

    # Embedding tables exported from the model.
    user_table = np.load(f"{input_dir}/saved_model/user_embeddings_v2.npy")
    anime_table = np.load(f"{input_dir}/saved_model/anime_embeddings_v2.npy")

    # Bounds used to normalize the scores.
    score_min = np.load(f"{input_dir}/saved_model/train_min_v2.npy")
    score_max = np.load(f"{input_dir}/saved_model/train_max_v2.npy")

    # Title/genre/synopsis lookup table.
    metadata = pd.read_parquet(f"{input_dir}/saved_model/anime_metadata.parquet")

    return {
        'model': model,
        'user2idx': user_mapping,
        'anime2idx': anime_mapping,
        'user_embeddings': user_table,
        'anime_embeddings': anime_table,
        'train_min': score_min,
        'train_max': score_max,
        'anime_df': metadata,
    }

artifacts = load_artifacts(INPUT_DIR)

In [None]:
def find_similar_animes(anime_input, n=10, artifacts=artifacts):
    """Return the ``n`` animes whose learned embeddings are closest to the query.

    Args:
        anime_input: Title (str) or anime id. A title is matched exactly
            against ``english_title`` first and, if that finds nothing and the
            metadata has a ``title`` column, against ``title``.
        n: Maximum number of similar animes to return.
        artifacts: Dict as produced by ``load_artifacts``; the keys 'anime_df',
            'anime2idx' and 'anime_embeddings' are used.

    Returns:
        DataFrame with columns 'title', 'score' (cosine similarity), 'genres'
        and 'synopsis', most similar first. An empty DataFrame is returned when
        the query cannot be resolved or an unexpected error occurs (the reason
        is printed).
    """
    try:
        anime_df = artifacts['anime_df']
        anime2idx = artifacts['anime2idx']
        has_title = 'title' in anime_df.columns

        # Resolve the query to an anime id (name or ID)
        if isinstance(anime_input, str):
            matches = anime_df.loc[anime_df['english_title'] == anime_input, 'id']
            if matches.empty and has_title:
                matches = anime_df.loc[anime_df['title'] == anime_input, 'id']
            if matches.empty:
                print(f"Anime not found: {anime_input!r}")
                return pd.DataFrame()
            anime_id = matches.iloc[0]
        else:
            anime_id = anime_input

        # Get encoded index (row of the embedding table)
        encoded_idx = anime2idx.get(anime_id)
        if encoded_idx is None:
            print(f"Anime id {anime_id} has no learned embedding.")
            return pd.DataFrame()

        # Calculate cosine similarities between the query row and every row
        anime_emb = artifacts['anime_embeddings']
        sim_scores = cosine_similarity([anime_emb[encoded_idx]], anime_emb)[0]

        # Rank best-first and drop the query itself explicitly, instead of
        # assuming it is always the single highest-scoring entry.
        ranked = sim_scores.argsort()[::-1]
        top_indices = ranked[ranked != encoded_idx][:max(n, 0)]

        # Build the reverse mapping and the metadata index once rather than
        # scanning the whole mapping / frame for every result.
        idx2anime = {idx: raw_id for raw_id, idx in anime2idx.items()}
        meta_by_id = anime_df.drop_duplicates(subset='id').set_index('id')

        # Build results
        results = []
        for idx in top_indices:
            similar_id = idx2anime.get(int(idx))
            if similar_id is None or similar_id not in meta_by_id.index:
                # No metadata for this embedding row: skip it instead of
                # discarding every other result.
                continue
            anime_data = meta_by_id.loc[similar_id]
            title = anime_data['english_title']
            if (pd.isna(title) or title == '') and has_title:
                # Some entries have no English title; show the original one.
                title = anime_data['title']
            results.append({
                'title': title,
                'score': sim_scores[idx],
                'genres': anime_data['genres'],
                'synopsis': anime_data['synopsis']
            })

        return pd.DataFrame(results)

    except Exception as e:
        # Best-effort contract: report the problem and return an empty frame.
        print(f"Error: {e}")
        return pd.DataFrame()

In [None]:
def get_user_recommendations(user_id, n=10, artifacts=artifacts):
    """Return the ``n`` animes with the highest predicted rating for a user.

    Every anime known to the model is scored for the user and the predictions
    are mapped back to the original score scale with the stored min/max.

    Args:
        user_id: Raw user id as used in the training data.
        n: Maximum number of recommendations; values <= 0 give an empty result.
        artifacts: Dict as produced by ``load_artifacts``; the keys 'model',
            'user2idx', 'anime2idx', 'train_min', 'train_max' and 'anime_df'
            are used.

    Returns:
        DataFrame with columns 'title', 'predicted_rating', 'genres' and
        'synopsis', best first. An empty DataFrame is returned when the user
        is unknown or an unexpected error occurs (the reason is printed).

    Note:
        Titles the user has already rated are not filtered out, because the
        artifacts contain no rating history.
    """
    try:
        print(f"Getting recommendations for user {user_id}...")
        # Convert user ID to encoded index
        encoded_user = artifacts['user2idx'].get(user_id)
        if encoded_user is None:
            print("User not found.")
            return pd.DataFrame()

        anime2idx = artifacts['anime2idx']
        anime_df = artifacts['anime_df']
        has_title = 'title' in anime_df.columns

        # Materialize raw ids and encoded indices once, in the same order, so
        # position i of the predictions always refers to candidate_ids[i].
        candidate_ids = list(anime2idx.keys())
        all_anime = np.array(list(anime2idx.values()))

        # Prepare inputs with the (batch_size, 1) shape the model expects
        user_array = np.full(len(all_anime), encoded_user).reshape(-1, 1)
        anime_array = all_anime.reshape(-1, 1)

        print("User array shape:", user_array.shape)
        print("Anime array shape:", anime_array.shape)

        # Predict ratings
        print("Predicting ratings...")
        preds = artifacts['model'].predict([user_array, anime_array], verbose=0).flatten()

        # Denormalize and get top N. Slicing the best-first order with [:n]
        # (instead of [-n:] on the ascending order) also yields an empty
        # selection for n == 0 rather than every item.
        preds = preds * (artifacts['train_max'] - artifacts['train_min']) + artifacts['train_min']
        top_indices = preds.argsort()[::-1][:max(n, 0)]

        # Index the metadata once instead of filtering the frame per result.
        meta_by_id = anime_df.drop_duplicates(subset='id').set_index('id')

        # Build results
        results = []
        for idx in top_indices:
            anime_id = candidate_ids[idx]
            if anime_id not in meta_by_id.index:
                # No metadata for this title: skip it instead of discarding
                # every other recommendation.
                continue
            anime_data = meta_by_id.loc[anime_id]
            title = anime_data['english_title']
            if (pd.isna(title) or title == '') and has_title:
                # Some entries have no English title; show the original one.
                title = anime_data['title']
            results.append({
                'title': title,
                'predicted_rating': preds[idx],
                'genres': anime_data['genres'],
                'synopsis': anime_data['synopsis']
            })

        return pd.DataFrame(results)

    except Exception as e:
        # Best-effort contract: report the problem and return an empty frame.
        print(f"Error: {e}")
        return pd.DataFrame()

In [None]:
def get_recommendations(input_item, n=10, mode='anime', artifacts=artifacts):
    """Single entry point that forwards to the mode-specific recommender.

    Args:
        input_item: Anime title/id for mode 'anime', user id for mode 'user'.
        n: Number of results requested.
        mode: 'anime' delegates to find_similar_animes (embedding similarity),
            'user' delegates to get_user_recommendations (predicted ratings).
        artifacts: Artifact dict handed through to the selected function.

    Returns:
        Whatever the selected function returns.

    Raises:
        ValueError: If ``mode`` is neither 'anime' nor 'user'.
    """
    # Guard clauses: return as soon as a known mode matches.
    if mode == 'anime':
        return find_similar_animes(input_item, n, artifacts)
    if mode == 'user':
        return get_user_recommendations(input_item, n, artifacts)
    raise ValueError("Invalid mode. Use 'anime' or 'user'")

In [None]:
# Example 1: Anime-based recommendations
# A string argument is treated as a title; any non-string as an anime id.
anime_recs = get_recommendations("Attack on Titan", mode='anime')
print("Similar to Attack on Titan:")
anime_recs.head()



Similar to Attack on Titan:


Unnamed: 0,title,score,genres,synopsis
0,Tokyo Ghoul,0.829606,"Action, Fantasy, Gore, Horror, Psychological, ...",A sinister threat is invading Tokyo: flesh-eat...
1,Noragami,0.719994,"Action, Mythology, Shounen, Supernatural","In times of need, if you look in the right pla..."
2,Tokyo Ghoul √A,0.717699,"Action, Fantasy, Gore, Horror, Psychological, ...",Ken Kaneki has finally come to accept the mons...
3,My Hero Academia,0.694723,"Action, School, Shounen, Super Power","The appearance of ""quirks,"" newly discovered s..."
4,One Punch Man,0.67745,"Action, Adult Cast, Comedy, Parody, Seinen, Su...",The seemingly unimpressive Saitama has a rathe...


In [None]:
# Example 2: User-based recommendations
# 361908 is the numeric id shown for the extra account in the `user_r` output;
# replace it with any raw user id that was present in the training split.
user_recs = get_recommendations(361908, n=20, mode='user')  # Replace with real user ID
print("\nRecommendations for User 361908:")
user_recs.head(20)

Getting recommendations for user 361908...
User array shape: (18200, 1)
Anime array shape: (18200, 1)
Predicting ratings...

Recommendations for User 361908:


Unnamed: 0,title,predicted_rating,genres,synopsis
0,Monster,9.205997,"Adult Cast, Drama, Mystery, Psychological, Sei...","Dr. Kenzou Tenma, an elite neurosurgeon recent..."
1,Hunter x Hunter,9.016944,"Action, Adventure, Fantasy, Shounen",Hunters devote themselves to accomplishing haz...
2,Tomorrow's Joe 2,8.94653,"Combat Sports, Drama, Shounen, Sports",Yabuki Joe is left downhearted and hopeless af...
3,Fullmetal Alchemist: Brotherhood,8.870222,"Action, Adventure, Drama, Fantasy, Military, S...",After a horrific alchemy experiment goes wrong...
4,Ping Pong the Animation,8.777449,"Award Winning, Drama, Seinen, Sports","Despite being polar opposites, Makoto ""Smile"" ..."
5,Steins;Gate,8.737535,"Drama, Psychological, Sci-Fi, Suspense, Time T...",Eccentric scientist Rintarou Okabe has a never...
6,,8.701612,"Award Winning, School, Shounen, Sports, Team S...","Shohoku's ""speedster"" and point guard, Ryouta ..."
7,Your Name.,8.699656,"Award Winning, Drama","Mitsuha Miyamizu, a high school girl, yearns t..."
8,,8.697916,"Action, Adventure, Drama, Gore, Historical, Se...",Young Thorfinn grew up listening to the storie...
9,Kaiji: Ultimate Survivor,8.684945,"Adult Cast, High Stakes Game, Psychological, S...",After one of his coworkers fails to repay a de...


In [None]:
# Look up the encoded index stored for raw user id 361908 (prints None if the
# id is not in the mapping).
print(artifacts['user2idx'].get(361908))

266669
