# Test notebook

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [18]:
import numpy as np
import pandas as pd


INPUT_DIR = '/content/drive/MyDrive'

# Load ratings data
ratings_df = pd.read_csv(
    f'{INPUT_DIR}/animelist_v2.csv',
    usecols=['user_id', 'anime_id', 'score'],
    dtype={'user_id': 'int32', 'anime_id': 'int32', 'score': 'float32'}
)

# Load anime details
anime_df = pd.read_csv(
    f'{INPUT_DIR}/anime_details_v2.csv',
    usecols=['id', 'title', 'synopsis', 'genres', 'mean', 'media_type', 'english_title'],
    dtype={'id': 'int32', 'mean': 'float32'}
)

print("Ratings columns:", ratings_df.columns.tolist())
print("Anime columns:", anime_df.columns.tolist())

Ratings columns: ['user_id', 'anime_id', 'score']
Anime columns: ['id', 'title', 'synopsis', 'mean', 'genres', 'media_type', 'english_title']


In [19]:
user_r = pd.read_csv(f'{INPUT_DIR}/user_data.csv',
                     usecols=['user_id', 'anime_id', 'score'],
                     dtype={'user_id': 'string', 'anime_id': 'int32', 'score': 'float32'})
user_r.head()

mp = {
    'xular13': ratings_df['user_id'].max() + 1
}
user_r['user_id'] = user_r['user_id'].map(mp)

bfr_shape = ratings_df.shape[0]
ratings_df = pd.concat([ratings_df, user_r], ignore_index=True)
aftr_shape = ratings_df.shape[0]
print(f'Before: {bfr_shape}, After: {aftr_shape}, Diff: {aftr_shape - bfr_shape}')

Before: 110685120, After: 110685201, Diff: 81


In [20]:
# Filter users/animes with sufficient interactions
MIN_RATINGS_PER_USER = 40
MIN_RATINGS_PER_ANIME = 10

# Filter users
user_counts = ratings_df['user_id'].value_counts()
ratings_df = ratings_df[ratings_df['user_id'].isin(user_counts[user_counts >= MIN_RATINGS_PER_USER].index)]

# Filter animes
anime_counts = ratings_df['anime_id'].value_counts()
ratings_df = ratings_df[ratings_df['anime_id'].isin(anime_counts[anime_counts >= MIN_RATINGS_PER_ANIME].index)]

print("Filtered ratings shape:", ratings_df.shape)

Filtered ratings shape: (109982854, 3)


In [21]:
from sklearn.model_selection import train_test_split

# Split first to prevent data leakage
train_df, test_df = train_test_split(
    ratings_df,
    test_size=0.2,
    random_state=42,
    stratify=ratings_df['user_id']  # Maintain user distribution
)

print("Train size:", len(train_df))
print("Test size:", len(test_df))

Train size: 87986283
Test size: 21996571


In [6]:
user_r

Unnamed: 0,user_id,anime_id,score
0,361908,32281,10.0
1,361908,41457,10.0
2,361908,47194,10.0
3,361908,48569,10.0
4,361908,49387,10.0
...,...,...,...
76,361908,52034,7.0
77,361908,52198,7.0
78,361908,38000,6.0
79,361908,40052,6.0


In [7]:
# Calculate min/max from TRAINING set only
train_min = train_df['score'].min()
train_max = train_df['score'].max()

# Scale ratings to [0, 1]
train_df['score_normalized'] = (train_df['score'] - train_min) / (train_max - train_min)
test_df['score_normalized'] = (test_df['score'] - train_min) / (train_max - train_min)

print("Train min/max:", train_min, train_max)
print("Scaled train sample:", train_df['score_normalized'].head(3))

Train min/max: 0.0 10.0
Scaled train sample: 30270192    0.8
97973129    0.8
65198122    0.8
Name: score_normalized, dtype: float32


In [8]:
# Create mappings
user_ids = train_df['user_id'].unique()
anime_ids = train_df['anime_id'].unique()

user2idx = {user: idx for idx, user in enumerate(user_ids)}
anime2idx = {anime: idx for idx, anime in enumerate(anime_ids)}

# Apply encoding
train_df['user'] = train_df['user_id'].map(user2idx)
train_df['anime'] = train_df['anime_id'].map(anime2idx)
test_df = test_df[test_df['user_id'].isin(user2idx.keys()) & test_df['anime_id'].isin(anime2idx.keys())]  # Filter unseen users/animes
test_df['user'] = test_df['user_id'].map(user2idx)
test_df['anime'] = test_df['anime_id'].map(anime2idx)

print("Unique users:", len(user2idx))
print("Unique animes:", len(anime2idx))

Unique users: 292252
Unique animes: 18200


In [2]:
!pip install tensorflow==2.18.0  # Version known to work with Colab TPUs
!pip install tensorflow-tpu==2.18.0 --find-links=https://storage.googleapis.com/libtpu-tf-releases/index.html

Looking in links: https://storage.googleapis.com/libtpu-tf-releases/index.html


In [10]:
import tensorflow as tf

try:
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='local')
    print("TPU detected:", resolver.master())
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.TPUStrategy(resolver)
    print("Replicas:", strategy.num_replicas_in_sync)
except ValueError as e:
    print("TPU initialization failed:", e)

 This a JAX bug; please report an issue at https://github.com/jax-ml/jax/issues
  _warn(f"cloud_tpu_init failed: {exc!r}\n This a JAX bug; please report "


TPU detected: 
Replicas: 1


In [11]:
print("Num TPUs:", len(tf.config.list_logical_devices('TPU')))

Num TPUs: 1


In [12]:
# Initialize TPU
#resolver = tf.distribute.cluster_resolver.TPUClusterResolver.connect(tpu='local')
tpu_strategy = tf.distribute.TPUStrategy(resolver)

# First model

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Concatenate, Dense, Dropout, Flatten



with tpu_strategy.scope():
    # Input layers
    user_input = Input(shape=(1,), name='user_input')
    anime_input = Input(shape=(1,), name='anime_input')

    # Embeddings
    user_embedding = Embedding(input_dim=len(user2idx), output_dim=128, name='user_embedding')(user_input)
    anime_embedding = Embedding(input_dim=len(anime2idx), output_dim=128, name='anime_embedding')(anime_input)

    # Concatenate + MLP
    merged = Concatenate()([user_embedding, anime_embedding])
    merged = Flatten()(merged)
    merged = Dense(256, activation='relu')(merged)
    merged = Dropout(0.3)(merged)
    merged = Dense(128, activation='relu')(merged)
    merged = Dropout(0.2)(merged)
    output = Dense(1, activation='linear')(merged)  # Linear for regression

    # Compile
    model = tf.keras.Model(inputs=[user_input, anime_input], outputs=output)
    model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(0.001), metrics=['mae'])

model.summary()

In [None]:
# Convert DataFrames to TensorFlow datasets
def create_dataset(user_ids, anime_ids, ratings, batch_size=1024):
    dataset = tf.data.Dataset.from_tensor_slices((
        {"user_input": user_ids, "anime_input": anime_ids},
        ratings
    ))
    return dataset.shuffle(10000).batch(batch_size).prefetch(tf.data.AUTOTUNE)

# Use TPU-friendly batch size (multiples of 128)
batch_size = 4096 * tpu_strategy.num_replicas_in_sync  # Will auto-scale based on TPU cores

train_dataset = create_dataset(
    train_df['user'].values,
    train_df['anime'].values,
    train_df['score_normalized'].values,
    batch_size
)

test_dataset = create_dataset(
    test_df['user'].values,
    test_df['anime'].values,
    test_df['score_normalized'].values,
    batch_size
)

In [None]:
history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=10,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(patience=3, monitor='val_loss'),
        tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2)
    ]
)

Epoch 1/10
[1m21482/21482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m256s[0m 12ms/step - loss: 0.0998 - mae: 0.2528 - val_loss: 0.0849 - val_mae: 0.2225 - learning_rate: 0.0010
Epoch 2/10
[1m21482/21482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m246s[0m 11ms/step - loss: 0.0840 - mae: 0.2192 - val_loss: 0.0811 - val_mae: 0.2144 - learning_rate: 0.0010
Epoch 3/10
[1m21482/21482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m251s[0m 12ms/step - loss: 0.0789 - mae: 0.2086 - val_loss: 0.0805 - val_mae: 0.2129 - learning_rate: 0.0010
Epoch 4/10
[1m21482/21482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m252s[0m 12ms/step - loss: 0.0745 - mae: 0.2000 - val_loss: 0.0808 - val_mae: 0.2101 - learning_rate: 0.0010
Epoch 5/10
[1m21482/21482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m250s[0m 12ms/step - loss: 0.0710 - mae: 0.1932 - val_loss: 0.0814 - val_mae: 0.2088 - learning_rate: 0.0010
Epoch 6/10
[1m21482/21482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m250s[0m 12

In [23]:
import os

# Define the save path
save_dir = f"{INPUT_DIR}/saved_model"

# Create the directory if it doesn’t exist
os.makedirs(save_dir, exist_ok=True)

In [None]:
# Save the entire model (architecture + weights + optimizer state)
model.save(f"{INPUT_DIR}/saved_model/anime_rec_model_v2.keras")

In [None]:
import pickle

# Save mappings (user2idx, anime2idx)
with open(f"{INPUT_DIR}/saved_model/user2idx_v2.pkl", "wb") as f:
    pickle.dump(user2idx, f)

with open(f"{INPUT_DIR}/saved_model/anime2idx_v2.pkl", "wb") as f:
    pickle.dump(anime2idx, f)

# Save min/max used for scaling
np.save(f"{INPUT_DIR}/saved_model/train_min_v2.npy", train_min)
np.save(f"{INPUT_DIR}/saved_model/train_max_v2.npy", train_max)

In [None]:
# Extract embeddings
user_embeddings = model.get_layer("user_embedding").get_weights()[0]
anime_embeddings = model.get_layer("anime_embedding").get_weights()[0]

# Save as numpy arrays
np.save(f"{INPUT_DIR}/saved_model/user_embeddings_v2.npy", user_embeddings)
np.save(f"{INPUT_DIR}/saved_model/anime_embeddings_v2.npy", anime_embeddings)

In [None]:
# Save processed anime details (for mapping IDs to titles/genres)
anime_df.to_parquet(f"{INPUT_DIR}/saved_model/anime_metadata.parquet")

In [None]:
import os
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics.pairwise import cosine_similarity

# Load mappings and metadata
def load_artifacts(input_dir):
    artifacts = {}

    # Load model
    artifacts['model'] = tf.keras.models.load_model(f"{input_dir}/saved_model/anime_rec_model_v2.keras")

    # Load encoders
    with open(f"{input_dir}/saved_model/user2idx_v2.pkl", "rb") as f:
        artifacts['user2idx'] = pickle.load(f)
    with open(f"{input_dir}/saved_model/anime2idx_v2.pkl", "rb") as f:
        artifacts['anime2idx'] = pickle.load(f)

    # Load embeddings
    artifacts['user_embeddings'] = np.load(f"{input_dir}/saved_model/user_embeddings_v2.npy")
    artifacts['anime_embeddings'] = np.load(f"{input_dir}/saved_model/anime_embeddings_v2.npy")

    # Load normalization params
    artifacts['train_min'] = np.load(f"{input_dir}/saved_model/train_min_v2.npy")
    artifacts['train_max'] = np.load(f"{input_dir}/saved_model/train_max_v2.npy")

    # Load anime metadata
    artifacts['anime_df'] = pd.read_parquet(f"{input_dir}/saved_model/anime_metadata.parquet")

    return artifacts

artifacts = load_artifacts(INPUT_DIR)

In [None]:
def find_similar_animes(anime_input, n=10, artifacts=artifacts):
    try:
        # Get anime ID from input (name or ID)
        if isinstance(anime_input, str):
            anime_id = artifacts['anime_df'][artifacts['anime_df']['english_title'] == anime_input]['id'].values[0]
        else:
            anime_id = anime_input

        # Get encoded index
        encoded_idx = artifacts['anime2idx'].get(anime_id, -1)
        if encoded_idx == -1:
            return pd.DataFrame()

        # Calculate cosine similarities
        anime_emb = artifacts['anime_embeddings']
        sim_scores = cosine_similarity([anime_emb[encoded_idx]], anime_emb)[0]

        # Get top N similar
        top_indices = sim_scores.argsort()[-n-1:-1][::-1]

        # Build results
        results = []
        for idx in top_indices:
            anime_id = list(artifacts['anime2idx'].keys())[list(artifacts['anime2idx'].values()).index(idx)]
            anime_data = artifacts['anime_df'][artifacts['anime_df']['id'] == anime_id].iloc[0]
            results.append({
                'title': anime_data['english_title'],
                'score': sim_scores[idx],
                'genres': anime_data['genres'],
                'synopsis': anime_data['synopsis']
            })

        return pd.DataFrame(results)

    except Exception as e:
        print(f"Error: {e}")
        return pd.DataFrame()

In [None]:
def get_user_recommendations(user_id, n=10, artifacts=artifacts):
    try:
        print(f"Getting recommendations for user {user_id}...")
        # Convert user ID to encoded index
        encoded_user = artifacts['user2idx'].get(user_id, -1)
        if encoded_user == -1:
            print("User not found.")
            return pd.DataFrame()

        # Get all anime indices as a NumPy array
        all_anime = np.array(list(artifacts['anime2idx'].values()))  # Fix 1: Convert to array

        # Prepare inputs with correct shapes (batch_size, 1)
        user_array = np.full(len(all_anime), encoded_user).reshape(-1, 1)  # Fix 2: Reshape
        anime_array = all_anime.reshape(-1, 1)  # Fix 2: Reshape

        print("User array shape:", user_array.shape)
        print("Anime array shape:", anime_array.shape)

        # Predict ratings
        print("Predicting ratings...")
        preds = artifacts['model'].predict([user_array, anime_array], verbose=0).flatten()  # Fix 3

        # Denormalize and get top N
        preds = preds * (artifacts['train_max'] - artifacts['train_min']) + artifacts['train_min']
        top_indices = preds.argsort()[-n:][::-1]

        # Build results
        results = []
        for idx in top_indices:
            anime_id = list(artifacts['anime2idx'].keys())[idx]
            anime_data = artifacts['anime_df'][artifacts['anime_df']['id'] == anime_id].iloc[0]
            results.append({
                'title': anime_data['english_title'],
                'predicted_rating': preds[idx],
                'genres': anime_data['genres'],
                'synopsis': anime_data['synopsis']
            })

        return pd.DataFrame(results)

    except Exception as e:
        print(f"Error: {e}")
        return pd.DataFrame()

In [None]:
def get_recommendations(input_item, n=10, mode='anime', artifacts=artifacts):
    """
    Unified recommendation function
    Modes: 'anime' (content-based), 'user' (collaborative)
    """
    if mode == 'anime':
        return find_similar_animes(input_item, n, artifacts)
    elif mode == 'user':
        return get_user_recommendations(input_item, n, artifacts)
    else:
        raise ValueError("Invalid mode. Use 'anime' or 'user'")

In [None]:
# Example 1: Anime-based recommendations
anime_recs = get_recommendations("Attack on Titan", mode='anime')
print("Similar to Attack on Titan:")
anime_recs.head()



Similar to Attack on Titan:


Unnamed: 0,title,score,genres,synopsis
0,Tokyo Ghoul,0.829606,"Action, Fantasy, Gore, Horror, Psychological, ...",A sinister threat is invading Tokyo: flesh-eat...
1,Noragami,0.719994,"Action, Mythology, Shounen, Supernatural","In times of need, if you look in the right pla..."
2,Tokyo Ghoul √A,0.717699,"Action, Fantasy, Gore, Horror, Psychological, ...",Ken Kaneki has finally come to accept the mons...
3,My Hero Academia,0.694723,"Action, School, Shounen, Super Power","The appearance of ""quirks,"" newly discovered s..."
4,One Punch Man,0.67745,"Action, Adult Cast, Comedy, Parody, Seinen, Su...",The seemingly unimpressive Saitama has a rathe...


In [None]:
# Example 2: User-based recommendations
user_recs = get_recommendations(361908, n=20, mode='user')
print("\nRecommendations for User 361908:")
user_recs.head(20)

Getting recommendations for user 361908...
User array shape: (18200, 1)
Anime array shape: (18200, 1)
Predicting ratings...

Recommendations for User 361908:


Unnamed: 0,title,predicted_rating,genres,synopsis
0,Monster,9.205997,"Adult Cast, Drama, Mystery, Psychological, Sei...","Dr. Kenzou Tenma, an elite neurosurgeon recent..."
1,Hunter x Hunter,9.016944,"Action, Adventure, Fantasy, Shounen",Hunters devote themselves to accomplishing haz...
2,Tomorrow's Joe 2,8.94653,"Combat Sports, Drama, Shounen, Sports",Yabuki Joe is left downhearted and hopeless af...
3,Fullmetal Alchemist: Brotherhood,8.870222,"Action, Adventure, Drama, Fantasy, Military, S...",After a horrific alchemy experiment goes wrong...
4,Ping Pong the Animation,8.777449,"Award Winning, Drama, Seinen, Sports","Despite being polar opposites, Makoto ""Smile"" ..."
5,Steins;Gate,8.737535,"Drama, Psychological, Sci-Fi, Suspense, Time T...",Eccentric scientist Rintarou Okabe has a never...
6,,8.701612,"Award Winning, School, Shounen, Sports, Team S...","Shohoku's ""speedster"" and point guard, Ryouta ..."
7,Your Name.,8.699656,"Award Winning, Drama","Mitsuha Miyamizu, a high school girl, yearns t..."
8,,8.697916,"Action, Adventure, Drama, Gore, Historical, Se...",Young Thorfinn grew up listening to the storie...
9,Kaiji: Ultimate Survivor,8.684945,"Adult Cast, High Stakes Game, Psychological, S...",After one of his coworkers fails to repay a de...


---

# Second model

In [None]:
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, Flatten, LayerNormalization, Concatenate, Dot, Add
with tpu_strategy.scope():
    # Inputs
    user_input = Input(shape=(1,), name='user_input')
    anime_input = Input(shape=(1,), name='anime_input')

    # Embeddings
    emb_dim = 64
    user_emb = Embedding(input_dim=len(user2idx), output_dim=emb_dim, name='user_embedding')(user_input)
    anime_emb = Embedding(input_dim=len(anime2idx), output_dim=emb_dim, name='anime_embedding')(anime_input)

    # Bias embeddings
    user_bias = Embedding(input_dim=len(user2idx), output_dim=1, name='user_bias')(user_input)
    anime_bias = Embedding(input_dim=len(anime2idx), output_dim=1, name='anime_bias')(anime_input)

    # Dot product (optional shortcut)
    dot_product = Dot(axes=-1)([user_emb, anime_emb])

    # Concatenate all features
    x = Concatenate()([user_emb, anime_emb])
    x = Flatten()(x)

    # MLP
    x = Dense(256, activation='relu', kernel_initializer='he_normal')(x)
    x = Dropout(0.3)(x)
    x = LayerNormalization()(x)

    x = Dense(128, activation='relu', kernel_initializer='he_normal')(x)
    x = Dropout(0.2)(x)
    x = LayerNormalization()(x)

    # Final linear output
    output_mlp = Dense(1, activation='linear')(x)

    # Add biases
    bias_sum = Add()([user_bias, anime_bias, dot_product])
    bias_sum = Flatten()(bias_sum)

    # Combine MLP and bias prediction
    final_output = Add()([output_mlp, bias_sum])

    model_2 = tf.keras.Model(inputs=[user_input, anime_input], outputs=final_output)
    model_2.compile(
        loss='mse',
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        metrics=['mae']
    )

model_2.summary()

In [None]:
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, Flatten, LayerNormalization, Concatenate, Dot, Add
with tpu_strategy.scope():
    # Inputs
    user_input = Input(shape=(1,), name='user_input')
    anime_input = Input(shape=(1,), name='anime_input')

    # Embeddings
    emb_dim = 32
    user_emb = Embedding(input_dim=len(user2idx), output_dim=emb_dim, name='user_embedding')(user_input)
    anime_emb = Embedding(input_dim=len(anime2idx), output_dim=emb_dim, name='anime_embedding')(anime_input)

    # Bias embeddings
    user_bias = Embedding(input_dim=len(user2idx), output_dim=1, name='user_bias')(user_input)
    anime_bias = Embedding(input_dim=len(anime2idx), output_dim=1, name='anime_bias')(anime_input)

    # Dot product (optional shortcut)
    dot_product = Dot(axes=-1)([user_emb, anime_emb])

    # Concatenate all features
    x = Concatenate()([user_emb, anime_emb])
    x = Flatten()(x)

    # MLP
    x = Dense(256, activation='relu', kernel_initializer='he_normal')(x)
    x = Dropout(0.3)(x)
    x = LayerNormalization()(x)

    x = Dense(128, activation='relu', kernel_initializer='he_normal')(x)
    x = Dropout(0.2)(x)
    x = LayerNormalization()(x)

    # Final linear output
    output_mlp = Dense(1, activation='linear')(x)

    # Add biases
    bias_sum = Add()([user_bias, anime_bias, dot_product])
    bias_sum = Flatten()(bias_sum)

    # Combine MLP and bias prediction
    final_output = Add()([output_mlp, bias_sum])

    model_2 = tf.keras.Model(inputs=[user_input, anime_input], outputs=final_output)
    model_2.compile(
        loss='mse',
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        metrics=['mae']
    )

model_2.summary()

In [None]:
history = model_2.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=10,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(patience=3, monitor='val_loss'),
        tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2)
    ]
)

Epoch 1/10
[1m21482/21482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m329s[0m 14ms/step - loss: 0.1126 - mae: 0.2694 - val_loss: 0.0837 - val_mae: 0.2178 - learning_rate: 0.0010
Epoch 2/10
[1m21482/21482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m270s[0m 13ms/step - loss: 0.0804 - mae: 0.2126 - val_loss: 0.0788 - val_mae: 0.2071 - learning_rate: 0.0010
Epoch 3/10
[1m21482/21482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m269s[0m 13ms/step - loss: 0.0744 - mae: 0.2015 - val_loss: 0.0789 - val_mae: 0.2056 - learning_rate: 0.0010
Epoch 4/10
[1m21482/21482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m270s[0m 13ms/step - loss: 0.0722 - mae: 0.1974 - val_loss: 0.0789 - val_mae: 0.2046 - learning_rate: 0.0010
Epoch 5/10
[1m21482/21482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m271s[0m 13ms/step - loss: 0.0685 - mae: 0.1904 - val_loss: 0.0783 - val_mae: 0.1983 - learning_rate: 2.0000e-04
Epoch 6/10
[1m21482/21482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m271s[0

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

def evaluate_model(model, test_df, user2idx, anime2idx, normalize=False, train_min=None, train_max=None):
    """
    Evaluate a model using RMSE, MAE, and R2.
    If normalize=True, rescales predictions to original range using train_min and train_max.
    """
    # Filter for warm users/items
    valid = test_df[test_df['user_id'].isin(user2idx) & test_df['anime_id'].isin(anime2idx)]
    if valid.empty:
        print("No valid samples for evaluation.")
        return None

    # Prepare inputs
    user_input = valid['user_id'].map(user2idx).values
    anime_input = valid['anime_id'].map(anime2idx).values
    y_true = valid['score'].values

    # Predict
    y_pred = model.predict([user_input, anime_input], verbose=0).flatten()

    # Rescale if needed
    if normalize and train_min is not None and train_max is not None:
        y_pred = y_pred * (train_max - train_min) + train_min

    # Metrics
    return {
        'RMSE': round(np.sqrt(mean_squared_error(y_true, y_pred)), 4),
        'MAE': round(mean_absolute_error(y_true, y_pred), 4),
        'R2': round(r2_score(y_true, y_pred), 4)
    }

In [None]:
test_df

Unnamed: 0,user_id,anime_id,score,score_normalized,user,anime
58256243,184275,2030,8.0,0.8,196257,1134
90108836,286479,780,7.0,0.7,129379,3693
10061567,28079,1604,8.0,0.8,159761,642
55166059,174139,34504,0.0,0.0,18922,4531
48458896,152507,132,7.0,0.7,94376,2653
...,...,...,...,...,...,...
107086903,341501,33080,7.0,0.7,251708,282
10275520,28788,376,8.0,0.8,155099,2299
34759090,107850,5039,0.0,0.0,34743,71
54172001,171008,25011,0.0,0.0,61241,3087


In [None]:
results = evaluate_model(
    model=artifacts['model'],
    test_df=test_df,
    user2idx=artifacts['user2idx'],
    anime2idx=artifacts['anime2idx'],
    normalize=True,
    train_min=artifacts['train_min'],
    train_max=artifacts['train_max']
)

print(results)

{'RMSE': np.float64(2.8135), 'MAE': 2.0294, 'R2': 0.4819}


In [None]:
results = evaluate_model(
    model=model_2,
    test_df=test_df,
    user2idx=artifacts['user2idx'],
    anime2idx=artifacts['anime2idx'],
    normalize=True,
    train_min=artifacts['train_min'],
    train_max=artifacts['train_max']
)

print(results)

{'RMSE': np.float64(2.8909), 'MAE': 2.0992, 'R2': 0.453}


In [None]:
results = evaluate_model(
    model=model_2,
    test_df=test_df,
    user2idx=artifacts['user2idx'],
    anime2idx=artifacts['anime2idx'],
    normalize=True,
    train_min=artifacts['train_min'],
    train_max=artifacts['train_max']
)

print(results)

{'RMSE': np.float64(2.7998), 'MAE': 1.9619, 'R2': 0.487}


In [None]:
import os

# Define the save path
save_dir = f"{INPUT_DIR}/saved_model"

# Create the directory if it doesn’t exist
os.makedirs(save_dir, exist_ok=True)

In [None]:
# Save the entire model (architecture + weights + optimizer state)
model_2.save(f"{INPUT_DIR}/saved_model/anime_rec_model_v5.keras")

In [None]:
from google.colab import runtime
runtime.unassign()

# Third model

In [13]:
anime_df.head()

Unnamed: 0,id,title,synopsis,mean,genres,media_type,english_title
0,38483,Ore wo Suki nano wa Omae dake ka yo,"Amatsuyu ""Jouro"" Kisaragi is a completely aver...",7.3,"Comedy, Harem, Romance, School",tv,ORESUKI Are you the only one who loves me?
1,38691,Dr. Stone,After five years of harboring unspoken feeling...,8.27,"Adventure, Comedy, Sci-Fi, Shounen",tv,Dr. Stone
2,38790,Itai no wa Iya nanode Bougyoryoku ni Kyokufuri...,After an enthusiastic invitation from her frie...,7.51,"Action, Adventure, Comedy, Fantasy, Video Game",tv,"BOFURI: I Don't Want to Get Hurt, so I'll Max ..."
3,38816,Hello World,"The year is 2027, and the city of Kyoto has un...",7.49,"Drama, Romance, Sci-Fi",movie,
4,35376,Himouto! Umaru-chan R,Umaru Doma is a model student who has a hidden...,7.32,"Comedy, Otaku Culture, School, Seinen",tv,"Himouto! Umaru-chan 2nd Season,My Two-Faced Li..."


---

In [14]:
# Preprocess genres
from sklearn.preprocessing import MultiLabelBinarizer
anime_df['genres_list'] = anime_df['genres'].fillna('').str.split(', ')
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(anime_df['genres_list'])


In [15]:
# Map genres to aligned array
genre_array = np.zeros((len(anime2idx), len(mlb.classes_)))
for _, row in anime_df.iterrows():
    idx = anime2idx.get(row['id'])
    if idx is not None:
        genre_array[idx] = mlb.transform([row['genres_list']])[0]

In [16]:
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, Flatten, LayerNormalization, Concatenate, Dot, Add
from tensorflow.keras.models import Model

def build_model_with_genres(num_users, num_items, genre_dim, emb_dim=32):
  with tpu_strategy.scope():
    # Inputs
    user_input = Input(shape=(1,), name='user_input')
    anime_input = Input(shape=(1,), name='anime_input')
    genre_input = Input(shape=(genre_dim,), name='genre_input')

    user_emb = Embedding(num_users, emb_dim, name='user_emb')(user_input)
    anime_emb = Embedding(num_items, emb_dim, name='anime_emb')(anime_input)

    user_bias = Embedding(num_users, 1, name='user_bias')(user_input)
    anime_bias = Embedding(num_items, 1, name='anime_bias')(anime_input)

    dot = Dot(axes=-1)([user_emb, anime_emb])

    # Genre MLP branch
    genre_x = Dense(emb_dim, activation='relu')(genre_input)
    genre_x = Dropout(0.3)(genre_x)

    # Combine anime + genre
    merged_anime = Add()([Flatten()(anime_emb), genre_x])
    x = Concatenate()([Flatten()(user_emb), merged_anime])

    # MLP
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.3)(x)
    x = LayerNormalization()(x)

    x = Dense(64, activation='relu')(x)
    x = Dropout(0.2)(x)
    x = LayerNormalization()(x)

    mlp_out = Dense(1)(x)
    bias_sum = Add()([Flatten()(user_bias), Flatten()(anime_bias), Flatten()(dot)])
    final_output = Add()([mlp_out, bias_sum])

    model = Model(inputs=[user_input, anime_input, genre_input], outputs=final_output)
    model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(1e-3), metrics=['mae'])

    return model

In [17]:
model = build_model_with_genres(len(user2idx), len(anime2idx), genre_dim=genre_array.shape[1])
model.summary()

In [None]:
train_df.head()

Unnamed: 0,user_id,anime_id,score,score_normalized,user,anime
30270192,93335,6746,8.0,0.8,0,0
97973129,311863,798,8.0,0.8,1,1
65198122,206532,2926,8.0,0.8,2,2
97396238,310028,26243,0.0,0.0,3,3
3492874,6895,34350,7.0,0.7,4,4


In [18]:
def make_dataset_with_genres(df, target_col, user2idx, anime2idx, genre_tensor, batch_size):
    user_ids = df['user_id'].map(user2idx).values.astype(np.int32)
    anime_ids = df['anime_id'].map(anime2idx).values.astype(np.int32)
    ratings = df[target_col].values.astype(np.float32)

    ds = tf.data.Dataset.from_tensor_slices((user_ids, anime_ids, ratings))

    def map_fn(user_id, anime_id, rating):
        genre_vec = tf.gather(genre_tensor, anime_id)
        return {
            'user_input': user_id,         # scalar
            'anime_input': anime_id,       # scalar
            'genre_input': genre_vec       # (num_genres,)
        }, rating                          # scalar

    ds = ds.map(map_fn, num_parallel_calls=tf.data.AUTOTUNE)
    ds = ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return ds


In [19]:
BATCH_SIZE = 8192
target_col = 'score_normalized'
genre_tensor = tf.constant(genre_array.astype(np.float32))  # move to Tensor
train_ds = make_dataset_with_genres(train_df, target_col, user2idx, anime2idx, genre_tensor, BATCH_SIZE)
val_ds   = make_dataset_with_genres(test_df, target_col, user2idx, anime2idx, genre_tensor, BATCH_SIZE)

In [None]:
# --- TRAIN ---
callbacks = [
    tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True),
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', patience=2, factor=0.5)
]
EPOCHS = 10

model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS, callbacks=callbacks)

Epoch 1/10
[1m10741/10741[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1739s[0m 160ms/step - loss: 0.0910 - mae: 0.2378 - val_loss: 0.0808 - val_mae: 0.2130 - learning_rate: 0.0010
Epoch 2/10
[1m10741/10741[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1760s[0m 164ms/step - loss: 0.0771 - mae: 0.2085 - val_loss: 0.0785 - val_mae: 0.2071 - learning_rate: 0.0010
Epoch 3/10
[1m10741/10741[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1763s[0m 164ms/step - loss: 0.0728 - mae: 0.2004 - val_loss: 0.0781 - val_mae: 0.2053 - learning_rate: 0.0010
Epoch 4/10
[1m10741/10741[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1736s[0m 162ms/step - loss: 0.0710 - mae: 0.1970 - val_loss: 0.0781 - val_mae: 0.2045 - learning_rate: 0.0010
Epoch 5/10
[1m10741/10741[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1710s[0m 159ms/step - loss: 0.0699 - mae: 0.1951 - val_loss: 0.0782 - val_mae: 0.2044 - learning_rate: 0.0010
Epoch 6/10
[1m10741/10741[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1

<keras.src.callbacks.history.History at 0x7ec198e2fed0>

In [None]:
# --- SAVE ---
model.save(f"{save_dir}/anime_rec_model_v6.keras")

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

def evaluate_genre_model(model, test_df, user2idx, anime2idx, genre_tensor, normalize=False, train_min=None, train_max=None):
    # Filter valid rows
    valid = test_df[test_df['user_id'].isin(user2idx) & test_df['anime_id'].isin(anime2idx)]
    if valid.empty:
        print("No valid samples for evaluation.")
        return None

    # Prepare input arrays
    user_ids = valid['user_id'].map(user2idx).values.astype(np.int32)
    anime_ids = valid['anime_id'].map(anime2idx).values.astype(np.int32)
    y_true = valid['score'].values.astype(np.float32)

    # Get genres from tensor
    genre_vectors = tf.gather(genre_tensor, anime_ids).numpy()

    # Build input dict
    inputs = {
        'user_input': user_ids,
        'anime_input': anime_ids,
        'genre_input': genre_vectors
    }

    # Predict
    y_pred = model.predict(inputs, verbose=0).flatten()

    # Rescale if needed
    if normalize and train_min is not None and train_max is not None:
        y_pred = y_pred * (train_max - train_min) + train_min

    # Evaluate
    return {
        'RMSE': round(np.sqrt(mean_squared_error(y_true, y_pred)), 4),
        'MAE': round(mean_absolute_error(y_true, y_pred), 4),
        'R2': round(r2_score(y_true, y_pred), 4)
    }


In [None]:
results = evaluate_genre_model(
    model=model,
    test_df=test_df,
    user2idx=user2idx,
    anime2idx=anime2idx,
    genre_tensor=genre_tensor,
    normalize=True,
    train_min=train_min,
    train_max=train_max
)

print("Evaluation:", results)

Evaluation: {'RMSE': np.float64(2.7927), 'MAE': 2.0191, 'R2': 0.4896}


In [27]:
import os
import tensorflow as tf
# Connect google drive
from google.colab import drive
import tensorflow as tf
import numpy as np
import pandas as pd
import pickle
drive.mount('/content/drive')

# Path to google drive
INPUT_DIR = '/content/drive/MyDrive'

# Define the save path
save_dir = f"{INPUT_DIR}/saved_model"
try:
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='local')
    print("TPU detected:", resolver.master())
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    tpu_strategy = tf.distribute.TPUStrategy(resolver)
    print("Replicas:", tpu_strategy.num_replicas_in_sync)
except ValueError as e:
    print("TPU initialization failed:", e)




Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
TPU detected: 
Replicas: 1


In [24]:
def recommend_anime_for_user_previous_model(user_id, top_k=10, model_path=f'{save_dir}/anime_rec_model_v6.keras', data_dir=save_dir):
    """
    Recommend Top-K anime for a given user based on the genre-aware rating prediction model.

    Parameters:
    - user_id: ID of the user to recommend for
    - top_k: number of recommendations to return
    - model_path: path to the trained model (Keras .keras file)
    - data_dir: directory containing saved mappings and genre data

    Returns:
    - DataFrame of recommended anime (anime_id, title)
    """


    # Load the trained model
    with tpu_strategy.scope():

        model = tf.keras.models.load_model(model_path, compile=False)

    # Load user2idx and anime2idx mappings
    with open(f"{data_dir}/user2idx_v2.pkl", "rb") as f:
        user2idx = pickle.load(f)
    with open(f"{data_dir}/anime2idx_v2.pkl", "rb") as f:
        anime2idx = pickle.load(f)
    anime_df = pd.read_parquet(f"{data_dir}/anime_metadata.parquet")

    # Load genre matrix (assuming it's stored as npy file)
    genre_tensor = np.load(f"{data_dir}/genre_tensor.npy").astype(np.float32)

    # Check if user_id is known
    if user_id not in user2idx:
        print(f"User {user_id} not in dataset.")
        return []

    user_idx = user2idx[user_id]
    all_anime_indices = np.arange(len(anime2idx))

    # Prepare inputs
    user_input = np.full_like(all_anime_indices, user_idx)
    genre_input = genre_tensor

    inputs = {
        "user_input": user_input,
        "anime_input": all_anime_indices,
        "genre_input": genre_input,
    }

    # Predict scores
    scores = model.predict(inputs, batch_size=4096, verbose=0).flatten()

    # Filter out items already seen by user
    try:
        train_df = pd.read_parquet(f"{data_dir}/train_ratings.parquet")
        seen_anime = set(train_df.loc[train_df['user_id'] == user_id, 'anime_id'])
        seen_indices = [anime2idx[aid] for aid in seen_anime if aid in anime2idx]
        scores[seen_indices] = -np.inf
    except Exception as e:
        print("Warning: Could not filter seen items:", e)

    # Get top-K indices
    top_indices = np.argpartition(-scores, top_k)[:top_k]
    top_indices = top_indices[np.argsort(-scores[top_indices])]

    # Map back to anime IDs and titles
    idx2anime = {v: k for k, v in anime2idx.items()}
    recommended_ids = [idx2anime[idx] for idx in top_indices]
    recommended_titles = anime_df.loc[anime_df['id'].isin(recommended_ids), ['id', 'english_title', 'title']]

    return recommended_titles.reset_index(drop=True)


In [28]:
recommendations = recommend_anime_for_user_previous_model(user_id=361908, top_k=20)
recommendations.head(20)

Unnamed: 0,id,english_title,title
0,21,One Piece,One Piece
1,4282,The Garden of Sinners Chapter 5: Paradox Spiral,Kara no Kyoukai Movie 5: Mujun Rasen
2,33050,Fate/stay night: Heaven's Feel - III. Spring Song,Fate/stay night Movie: Heaven's Feel - III. Sp...
3,40456,Demon Slayer: Kimetsu no Yaiba - The Movie: Mu...,Kimetsu no Yaiba Movie: Mugen Ressha-hen
4,19,Monster,Monster
5,9253,Steins;Gate,Steins;Gate
6,13125,From the New World,Shinsekai yori
7,820,Legend of the Galactic Heroes,Ginga Eiyuu Densetsu
8,39065,We're Still Underground,Bokura Mada Underground
9,13755,"Shan shui qing,Feeling from Mountain and Water...",Shanshui Qing


---

In [None]:
import os
# Connect google drive
from google.colab import drive
import tensorflow as tf
import numpy as np
import pandas as pd
import pickle
drive.mount('/content/drive')

# Path to google drive
INPUT_DIR = '/content/drive/MyDrive'

# Define the save path
save_dir = f"{INPUT_DIR}/saved_model"
try:
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='local')
    print("TPU detected:", resolver.master())
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    tpu_strategy = tf.distribute.TPUStrategy(resolver)
    print("Replicas:", tpu_strategy.num_replicas_in_sync)
except ValueError as e:
    print("TPU initialization failed:", e)

 This a JAX bug; please report an issue at https://github.com/jax-ml/jax/issues
  _warn(f"cloud_tpu_init failed: {exc!r}\n This a JAX bug; please report "


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
TPU detected: 
Replicas: 1


In [None]:
def recommend_similar_anime(anime_id, top_n=10, model_path=f'{save_dir}/anime_rec_model_v6.keras', data_dir=save_dir):




    # === Load required data ===
    anime_df = pd.read_parquet(f"{data_dir}/anime_metadata.parquet")
    with open(f"{data_dir}/anime2idx_v2.pkl", "rb") as f:
      anime2idx = pickle.load(f)
    idx2anime = {v: k for k, v in anime2idx.items()}
    genre_tensor = np.load(f"{data_dir}/genre_tensor.npy").astype(np.float32)

    # === Prepare index lookups ===
    if anime_id not in anime2idx:
        raise ValueError(f"anime_id {anime_id} not found in anime2idx mapping.")

    target_idx = anime2idx[anime_id]


    # === Load model ===

    if tpu_strategy:
        with tpu_strategy.scope():
            model = tf.keras.models.load_model(model_path, compile=False)
    else:
        print("Loading model without TPU strategy...")
        model = tf.keras.models.load_model(model_path, compile=False)

    # === Build inputs for all anime ===
    num_anime = len(anime2idx)
    anime_indices = np.arange(num_anime, dtype=np.int32)

    user_input = np.full(shape=(num_anime,), fill_value=0, dtype=np.int32)  # dummy user
    anime_input = anime_indices
    genre_input = genre_tensor.astype(np.float32)

    # === Target anime genre vector ===
    target_genre = genre_tensor[target_idx].astype(np.float32)

    # === Predict scores ===
    inputs = {
        "user_input": user_input,
        "anime_input": anime_input,
        "genre_input": genre_input
    }
    predictions = model.predict(inputs, batch_size=1024, verbose=0).flatten()

    # === Compute cosine similarity (optional enhancement) ===
    # Or just use model predictions
    similarity_scores = predictions

    # === Exclude self ===
    similarity_scores[target_idx] = -np.inf

    # === Get top-N ===
    top_indices = np.argsort(similarity_scores)[-top_n:][::-1]
    top_anime_ids = [idx2anime[idx] for idx in top_indices]
    top_anime_names = anime_df[anime_df["id"].isin(top_anime_ids)][['id', 'english_title', 'title']]

    return top_anime_names.reset_index(drop=True)

In [None]:
recommend_similar_anime(anime_id=5114, top_n=5) # Fullmetal alchemist)

Unnamed: 0,id,english_title,title
0,2904,Code Geass: Lelouch of the Rebellion R2,Code Geass: Hangyaku no Lelouch R2
1,11757,Sword Art Online,Sword Art Online
2,51632,In Another World With My Smartphone 2,Isekai wa Smartphone to Tomo ni. 2
3,10694,Unknown,Bloods: Inraku no Ketsuzoku 2
4,12375,Unknown,Maki-chan to Now.
