# Creating and training model

### If you want to train the model with your own user data, run the cells up to the horizontal rule.

In [None]:
import numpy as np
import pandas as pd

In [None]:
INPUT_DIR = 'PATH_TO_INPUT_DIRECTORY'  # Replace with your input directory path

# Ratings table: only the (user, anime, score) triple is read, with explicit 32-bit dtypes.
ratings_df = pd.read_csv(
    f'{INPUT_DIR}/animelist.csv',
    dtype={'user_id': 'int32', 'anime_id': 'int32', 'score': 'float32'},
    usecols=['user_id', 'anime_id', 'score'],
)

# Anime metadata table: only the columns listed below are read.
anime_df = pd.read_csv(
    f'{INPUT_DIR}/anime_details.csv',
    dtype={'id': 'int32', 'mean': 'float32'},
    usecols=['id', 'title', 'synopsis', 'genres', 'mean', 'media_type', 'english_title'],
)

# Quick sanity check of what was actually loaded.
print("Ratings columns:", list(ratings_df.columns))
print("Anime columns:", list(anime_df.columns))

In [None]:
# Load your own exported ratings. `user_id` holds a nickname in this file,
# so it is read as a string and converted to a numeric id below.
user_r = pd.read_csv(f'{INPUT_DIR}/your_animelist.csv',
                     usecols=['user_id', 'anime_id', 'score'],
                     dtype={'user_id': 'string', 'anime_id': 'int32', 'score': 'float32'})

# Nickname -> numeric user id. Ids start right after the largest id already
# present in ratings_df so they cannot collide with an existing user.
mp = {
    'your_nickname': ratings_df['user_id'].max() + 1 # Replace 'your_nickname' with the actual nickname
    # Add more mappings if needed
}

# Fail loudly on nicknames that are missing from `mp`: Series.map would turn
# them into NaN, which silently makes `user_id` a float column and injects
# rows that belong to no user.
mapped_ids = user_r['user_id'].map(mp)
unmapped = user_r.loc[mapped_ids.isna(), 'user_id'].unique().tolist()
if unmapped:
    raise ValueError(f"No numeric id defined in `mp` for nickname(s): {unmapped}")
user_r['user_id'] = mapped_ids.astype('int32')  # same dtype as ratings_df['user_id']

# NOTE: this cell appends to ratings_df, so re-running it without reloading
# ratings_df appends the same rows a second time.
bfr_shape = ratings_df.shape[0]
ratings_df = pd.concat([ratings_df, user_r], ignore_index=True)
aftr_shape = ratings_df.shape[0]
print(f'Before: {bfr_shape}, After: {aftr_shape}, Diff: {aftr_shape - bfr_shape}')

In [None]:
# Thresholds for dropping sparse users / titles (tune if needed)
MIN_RATINGS_PER_USER = 40
MIN_RATINGS_PER_ANIME = 10

# Users first: attach each row's per-user rating count and keep rows that reach the threshold
user_counts = ratings_df['user_id'].value_counts()
ratings_df = ratings_df.loc[
    ratings_df['user_id'].map(user_counts).ge(MIN_RATINGS_PER_USER)
]

# Then titles, counted on the already user-filtered ratings
anime_counts = ratings_df['anime_id'].value_counts()
ratings_df = ratings_df.loc[
    ratings_df['anime_id'].map(anime_counts).ge(MIN_RATINGS_PER_ANIME)
]

print("Filtered ratings shape:", ratings_df.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the ratings before any scaling / id encoding is fitted,
# so those later steps are computed from training rows only.
train_df, test_df = train_test_split(
    ratings_df,
    stratify=ratings_df['user_id'],  # keep each user's share equal in both splits
    random_state=42,
    test_size=0.2,
)

print(f"Train size: {len(train_df)}")
print(f"Test size: {len(test_df)}")

In [None]:
# Normalize scores to [0, 1] range
# Min/max come from the TRAINING set only, so nothing about the test scores
# leaks into preprocessing. Test scores outside the training range would map
# outside [0, 1].
train_min = train_df['score'].min()
train_max = train_df['score'].max()

score_range = train_max - train_min
if score_range == 0:
    # All training scores are identical: scaling would divide by zero.
    raise ValueError("Cannot normalize scores: all training scores are equal.")

# .assign builds new frames instead of writing a column into the objects
# returned by train_test_split, which avoids pandas' SettingWithCopyWarning.
train_df = train_df.assign(score_normalized=(train_df['score'] - train_min) / score_range)
test_df = test_df.assign(score_normalized=(test_df['score'] - train_min) / score_range)

print("Train min/max:", train_min, train_max)
print("Scaled train sample:", train_df['score_normalized'].head(3))

In [None]:
# Create mappings from raw ids to contiguous 0..N-1 indices.
# Only ids that occur in the TRAINING split get an index.
user_ids = train_df['user_id'].unique()
anime_ids = train_df['anime_id'].unique()

user2idx = {user: idx for idx, user in enumerate(user_ids)}
anime2idx = {anime: idx for idx, anime in enumerate(anime_ids)}

# Apply encoding (assign returns a new frame, so no write into a possible view)
train_df = train_df.assign(
    user=train_df['user_id'].map(user2idx),
    anime=train_df['anime_id'].map(anime2idx),
)

# Drop test rows whose user or anime never appears in training: they have no
# embedding index. The explicit .copy() makes the filtered frame independent,
# so the column assignments below do not raise SettingWithCopyWarning.
known_mask = test_df['user_id'].isin(user_ids) & test_df['anime_id'].isin(anime_ids)
test_df = test_df.loc[known_mask].copy()
test_df['user'] = test_df['user_id'].map(user2idx)
test_df['anime'] = test_df['anime_id'].map(anime2idx)

print("Unique users:", len(user2idx))
print("Unique animes:", len(anime2idx))

In [None]:
# Install a pinned TensorFlow (2.18.0) plus the matching tensorflow-tpu build.
# The second command additionally searches the libtpu release index given via --find-links.
# NOTE(review): if TensorFlow was already imported in this session, the runtime
# presumably has to be restarted for the newly installed version to be used - confirm.
!pip install tensorflow==2.18.0  # Version known to work with Colab TPUs
!pip install tensorflow-tpu==2.18.0 --find-links=https://storage.googleapis.com/libtpu-tf-releases/index.html

In [None]:
import tensorflow as tf

# Try to attach to a local TPU. If that fails, fall back to TensorFlow's
# default strategy so that `strategy` is always defined and the later
# `with strategy.scope():` in build_model_with_genres still works on CPU/GPU.
try:
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='local')
    print("TPU detected:", resolver.master())
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.TPUStrategy(resolver)
    print("Num TPUs:", len(tf.config.list_logical_devices('TPU')))
except (ValueError, tf.errors.NotFoundError) as e:
    # NotFoundError is what initialize_tpu_system raises when no TPU is present.
    print("TPU initialization failed:", e)
    strategy = tf.distribute.get_strategy()
    print("Falling back to default strategy:", type(strategy).__name__)

In [None]:
# Preprocess genres
from sklearn.preprocessing import MultiLabelBinarizer
# Split the ', '-separated genre string into a list of labels.
# Missing genres are filled with '' first; ''.split(', ') yields [''], so empty
# labels are filtered out. Otherwise MultiLabelBinarizer would learn a
# spurious '' genre and add a meaningless column to the genre matrix.
anime_df['genres_list'] = (
    anime_df['genres']
    .fillna('')
    .str.split(', ')
    .apply(lambda labels: [label for label in labels if label])
)

# One multi-hot row per row of anime_df, one column per distinct genre.
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(anime_df['genres_list'])

In [None]:
# Map genres to an array aligned with the model's anime indices:
# row i of genre_array is the multi-hot genre vector of the anime whose
# anime2idx value is i. Anime without a metadata row keep an all-zero vector.
genre_array = np.zeros((len(anime2idx), len(mlb.classes_)))

# genre_matrix already holds the binarized genres for every row of anime_df
# (same row order), so copy the needed rows in one vectorized step instead of
# calling mlb.transform once per row.
mapped_idx = anime_df['id'].map(anime2idx).to_numpy(dtype=float)
has_idx = ~np.isnan(mapped_idx)  # metadata rows whose anime has a model index
genre_array[mapped_idx[has_idx].astype(np.intp)] = genre_matrix[has_idx]

In [None]:
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, Flatten, LayerNormalization, Concatenate, Dot, Add
from tensorflow.keras.models import Model

def build_model_with_genres(num_users, num_items, genre_dim, emb_dim=32):
    """Build and compile the genre-aware rating-prediction model.

    The prediction is the sum of two parts:
      * user bias + anime bias + dot product of the user and anime embeddings;
      * the scalar output of an MLP fed with the user embedding concatenated
        with (anime embedding + a dense projection of the genre vector).

    Parameters:
    - num_users: number of rows in the user embedding and user bias tables
    - num_items: number of rows in the anime embedding and anime bias tables
    - genre_dim: length of the genre vector fed to 'genre_input'
    - emb_dim: embedding size, also the width of the genre projection

    Returns:
    - Keras Model with inputs 'user_input', 'anime_input', 'genre_input' and a
      single linear output, compiled with MSE loss, Adam(1e-3) and an MAE metric.

    Note: reads the module-level `strategy`; it must be defined before calling.
    """
    # Everything, including compile(), is created inside the strategy scope.
    with strategy.scope():
        # Inputs: one user index, one anime index, and a genre vector per example
        user_input = Input(shape=(1,), name='user_input')
        anime_input = Input(shape=(1,), name='anime_input')
        genre_input = Input(shape=(genre_dim,), name='genre_input')

        # Latent factors for users and anime
        user_emb = Embedding(num_users, emb_dim, name='user_emb')(user_input)
        anime_emb = Embedding(num_items, emb_dim, name='anime_emb')(anime_input)

        # Per-user and per-anime scalar offsets (embedding tables of width 1)
        user_bias = Embedding(num_users, 1, name='user_bias')(user_input)
        anime_bias = Embedding(num_items, 1, name='anime_bias')(anime_input)

        # Matrix-factorization term: dot product over the embedding axis
        dot = Dot(axes=-1)([user_emb, anime_emb])

        # Genre MLP branch: project the genre vector to emb_dim so it can be added to the anime embedding
        genre_x = Dense(emb_dim, activation='relu')(genre_input)
        genre_x = Dropout(0.3)(genre_x)

        # Combine anime + genre (element-wise sum), then concatenate with the user embedding
        merged_anime = Add()([Flatten()(anime_emb), genre_x])
        x = Concatenate()([Flatten()(user_emb), merged_anime])

        # MLP: two Dense -> Dropout -> LayerNormalization stages
        x = Dense(128, activation='relu')(x)
        x = Dropout(0.3)(x)
        x = LayerNormalization()(x)

        x = Dense(64, activation='relu')(x)
        x = Dropout(0.2)(x)
        x = LayerNormalization()(x)

        # Final score = MLP output + (user bias + anime bias + dot product); no output activation
        mlp_out = Dense(1)(x)
        bias_sum = Add()([Flatten()(user_bias), Flatten()(anime_bias), Flatten()(dot)])
        final_output = Add()([mlp_out, bias_sum])

        model = Model(inputs=[user_input, anime_input, genre_input], outputs=final_output)
        model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(1e-3), metrics=['mae'])

        return model

In [None]:
# Size the embedding tables from the id mappings and the genre input from the genre matrix width.
model = build_model_with_genres(
    num_users=len(user2idx),
    num_items=len(anime2idx),
    genre_dim=genre_array.shape[1],
)
model.summary()

In [None]:
def make_dataset_with_genres(df, target_col, user2idx, anime2idx, genre_tensor, batch_size):
    """Build a batched tf.data pipeline of model inputs and targets.

    Parameters:
    - df: DataFrame with 'user_id', 'anime_id' and `target_col` columns
    - target_col: name of the column used as the regression target
    - user2idx: mapping from raw user id to user index
    - anime2idx: mapping from raw anime id to anime index
    - genre_tensor: tensor whose row i is the genre vector of anime index i
    - batch_size: number of examples per batch

    Returns:
    - tf.data.Dataset yielding ({'user_input', 'anime_input', 'genre_input'}, rating)
      batches, in the row order of `df` (no shuffling is applied here).

    Raises:
    - ValueError: if any row has a user_id / anime_id that is missing from the
      mappings. Such ids would map to NaN and be cast to a meaningless int32
      index instead of failing.
    """
    user_idx = df['user_id'].map(user2idx)
    anime_idx = df['anime_id'].map(anime2idx)

    # Validate before casting: NaN -> int32 produces garbage indices silently.
    unmapped = user_idx.isna() | anime_idx.isna()
    if unmapped.any():
        raise ValueError(
            f"{int(unmapped.sum())} row(s) reference a user_id/anime_id that is missing "
            "from user2idx/anime2idx; filter those rows out before building the dataset."
        )

    user_ids = user_idx.values.astype(np.int32)
    anime_ids = anime_idx.values.astype(np.int32)
    ratings = df[target_col].values.astype(np.float32)

    ds = tf.data.Dataset.from_tensor_slices((user_ids, anime_ids, ratings))

    def map_fn(user_id, anime_id, rating):
        # Look up the genre vector by anime index inside the pipeline, so the
        # genre matrix is not duplicated once per rating row.
        genre_vec = tf.gather(genre_tensor, anime_id)
        return {
            'user_input': user_id,         # scalar
            'anime_input': anime_id,       # scalar
            'genre_input': genre_vec       # (num_genres,)
        }, rating                          # scalar

    ds = ds.map(map_fn, num_parallel_calls=tf.data.AUTOTUNE)
    ds = ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return ds

In [None]:
BATCH_SIZE = 8192
target_col = 'score_normalized'

# Genre matrix as a float32 constant tensor; the pipeline gathers rows from it by anime index.
genre_tensor = tf.constant(genre_array.astype(np.float32))

# Same pipeline for both splits: training rows first, then the held-out rows used for validation.
train_ds, val_ds = (
    make_dataset_with_genres(split_df, target_col, user2idx, anime2idx, genre_tensor, BATCH_SIZE)
    for split_df in (train_df, test_df)
)

In [None]:
# --- TRAIN ---
EPOCHS = 10

# Stop after 3 epochs without improvement (restoring the best weights) and
# halve the learning rate after 2 epochs without val_loss improvement.
callbacks = [
    tf.keras.callbacks.EarlyStopping(restore_best_weights=True, patience=3),
    tf.keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=2, monitor='val_loss'),
]

model.fit(
    train_ds,
    epochs=EPOCHS,
    validation_data=val_ds,
    callbacks=callbacks,
)

In [None]:
# --- SAVE ---
# `os` is not imported by any earlier cell, so import it here together with pickle.
import os
import pickle

save_dir = 'PATH_TO_SAVE_DIRECTORY'  # Replace with your save directory path
os.makedirs(save_dir, exist_ok=True)

# NOTE: recommend_anime_for_user_previous_model defaults to
# 'anime_rec_model_v6.keras'; pass model_path explicitly (or rename this file)
# when loading the model saved here.
model.save(f"{save_dir}/anime_rec_model_v.keras")

# Save mappings
with open(f"{save_dir}/user2idx.pkl", 'wb') as f:
    pickle.dump(user2idx, f)
with open(f"{save_dir}/anime2idx.pkl", 'wb') as f:
    pickle.dump(anime2idx, f)

# Save genre matrix under the file name the inference code loads (genre_tensor.npy)
np.save(f"{save_dir}/genre_tensor.npy", genre_tensor.numpy())

# Metadata and ratings go into the same directory as the other artifacts:
# the inference code reads everything from a single data_dir, and save_dir is
# the only directory guaranteed to exist at this point.
# Save metadata
anime_df.to_parquet(f"{save_dir}/anime_metadata.parquet")

# Save ratings
ratings_df.to_parquet(f"{save_dir}/train_ratings.parquet")

---

# If you want to use an already trained model, run the cells below.

**The following files should already be saved:**
- model.keras – the model, either downloaded from the link or saved in the previous step
- user2idx.pkl and anime2idx.pkl – mappings of our data
- genre_tensor.npy – the genre matrix
- anime_metadata.parquet – anime metadata, used for titles
- train_ratings.parquet – the ratings that were used to train the model `optional`




In [None]:
import os
import tensorflow as tf
import tensorflow as tf
import numpy as np
import pandas as pd
import pickle

# Connect google drive
from google.colab import drive
drive.mount('/content/drive')

# Path to google drive
INPUT_DIR = '/content/drive/MyDrive'

# Directory that holds the saved model, mappings, genre matrix and parquet files
save_dir = f"{INPUT_DIR}/saved_model"

# Try to attach to a local TPU. If that fails, fall back to TensorFlow's
# default strategy so that `tpu_strategy` is always defined for the
# recommendation function below.
try:
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='local')
    print("TPU detected:", resolver.master())
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    tpu_strategy = tf.distribute.TPUStrategy(resolver)
    print("Replicas:", tpu_strategy.num_replicas_in_sync)
except (ValueError, tf.errors.NotFoundError) as e:
    # NotFoundError is what initialize_tpu_system raises when no TPU is present.
    print("TPU initialization failed:", e)
    tpu_strategy = tf.distribute.get_strategy()
    print("Falling back to default strategy:", type(tpu_strategy).__name__)


In [None]:
def recommend_anime_for_user_previous_model(user_id, top_k=10, model_path=f'{save_dir}/anime_rec_model_v6.keras', data_dir=save_dir):
    """
    Recommend Top-K anime for a given user based on the genre-aware rating prediction model.

    Parameters:
    - user_id: ID of the user to recommend for (must be a key of the saved user2idx mapping)
    - top_k: maximum number of recommendations to return
    - model_path: path to the trained model (Keras .keras file)
    - data_dir: directory containing user2idx.pkl, anime2idx.pkl, genre_tensor.npy,
      anime_metadata.parquet and, optionally, train_ratings.parquet

    Returns:
    - DataFrame with columns (id, english_title, title), ordered from the highest
      to the lowest predicted score. Empty DataFrame with the same columns if the
      user is unknown or nothing can be recommended. Anime that have no row in the
      metadata file are omitted.
    """
    result_columns = ['id', 'english_title', 'title']

    # Load user2idx and anime2idx mappings
    # NOTE(security): pickle.load can execute arbitrary code - only load files you trust.
    with open(f"{data_dir}/user2idx.pkl", "rb") as f:
        user2idx = pickle.load(f)
    with open(f"{data_dir}/anime2idx.pkl", "rb") as f:
        anime2idx = pickle.load(f)

    # Check if user_id is known before loading the (much heavier) model
    if user_id not in user2idx:
        print(f"User {user_id} not in dataset.")
        return pd.DataFrame(columns=result_columns)

    anime_df = pd.read_parquet(f"{data_dir}/anime_metadata.parquet")

    # Load genre matrix (row i = genre vector of anime index i)
    genre_tensor = np.load(f"{data_dir}/genre_tensor.npy").astype(np.float32)

    # Load the trained model. Use the notebook-level `tpu_strategy` when it
    # exists, otherwise TensorFlow's default strategy, so the function also
    # works when no TPU could be initialised.
    active_strategy = globals().get('tpu_strategy')
    if active_strategy is None:
        active_strategy = tf.distribute.get_strategy()
    with active_strategy.scope():
        model = tf.keras.models.load_model(model_path, compile=False)

    user_idx = user2idx[user_id]
    all_anime_indices = np.arange(len(anime2idx))

    # Prepare inputs: score this one user against every known anime
    inputs = {
        "user_input": np.full_like(all_anime_indices, user_idx),
        "anime_input": all_anime_indices,
        "genre_input": genre_tensor,
    }

    # Predict scores
    scores = model.predict(inputs, batch_size=4096, verbose=0).flatten()

    # Filter out items already seen by user (best effort: the ratings file is optional)
    try:
        train_df = pd.read_parquet(f"{data_dir}/train_ratings.parquet")
        seen_anime = set(train_df.loc[train_df['user_id'] == user_id, 'anime_id'])
        seen_indices = [anime2idx[aid] for aid in seen_anime if aid in anime2idx]
        scores[seen_indices] = -np.inf
    except Exception as e:
        print("Warning: Could not filter seen items:", e)

    # Get top-K indices. Clamp k to the catalogue size: argpartition raises
    # when asked for a position outside the array.
    k = max(0, min(int(top_k), scores.size))
    if k == 0:
        return pd.DataFrame(columns=result_columns)
    if k < scores.size:
        top_indices = np.argpartition(-scores, k - 1)[:k]
    else:
        top_indices = np.arange(scores.size)
    top_indices = top_indices[np.argsort(-scores[top_indices])]
    # Never return already-seen items (masked to -inf above), even when fewer
    # than k unseen items exist.
    top_indices = top_indices[scores[top_indices] > -np.inf]
    if top_indices.size == 0:
        return pd.DataFrame(columns=result_columns)

    # Map back to anime IDs and titles
    idx2anime = {v: k for k, v in anime2idx.items()}
    recommended_ids = [idx2anime[idx] for idx in top_indices]

    # An inner merge keeps the order of the left keys, i.e. the ranking.
    # Selecting metadata rows with isin() would return them in metadata order
    # and lose the ranking.
    ranking = pd.DataFrame({'id': recommended_ids})
    recommended_titles = ranking.merge(anime_df[result_columns], on='id', how='inner')

    return recommended_titles.reset_index(drop=True)


In [None]:
# Replace 'id from data' with a user id that exists in the saved user2idx mapping.
recommendations = recommend_anime_for_user_previous_model(
    user_id='id from data',
    top_k=20,
)
recommendations.head(20)