<a href="https://colab.research.google.com/github/xular13/model_rs/blob/main/rs_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q tensorflow
!pip install -q scikit-learn pandas numpy

In [None]:
# Mount Google Drive at /content/drive; the data paths used below live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
# Report which GPUs TensorFlow can see, then enable XLA JIT compilation.
gpu_devices = tf.config.list_physical_devices('GPU')
print("GPU Available:", gpu_devices)
tf.config.optimizer.set_jit(True)

GPU Available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [None]:


# Base folder used to build the data, artifact and model paths in the cells below.
INPUT_DIR = '/content/drive/MyDrive'

In [None]:
# Anime metadata; only the listed columns are read, with a compact id dtype.
anime_details = pd.read_csv(
    f'{INPUT_DIR}/anime_details_v2.csv',
    dtype={'id': 'int32'},
    usecols=['id', 'genres', 'english_title', 'synopsis', 'title'],
)

In [None]:
# User/anime/score interactions, read with 32-bit dtypes to limit memory use.
ratings = pd.read_csv(
    f'{INPUT_DIR}/animelist_v2.csv',
    dtype={'user_id': 'int32', 'anime_id': 'int32', 'score': 'float32'},
    usecols=['user_id', 'anime_id', 'score'],
)



In [None]:
# Extra ratings whose user_id is a string name rather than an integer id.
user_r = pd.read_csv(
    f'{INPUT_DIR}/user_data.csv',
    dtype={'user_id': 'string', 'anime_id': 'int32', 'score': 'float32'},
    usecols=['user_id', 'anime_id', 'score'],
)
user_r.head()

Unnamed: 0,user_id,anime_id,score
0,xular13,32281,10.0
1,xular13,41457,10.0
2,xular13,47194,10.0
3,xular13,48569,10.0
4,xular13,49387,10.0


In [None]:
# Give every user name found in `user_r` a fresh integer id placed directly
# after the largest id already present in `ratings`, so ids cannot collide.
# Building the mapping from the data (instead of one hard-coded name) avoids
# silently turning any other user name into NaN.
next_user_id = int(ratings['user_id'].max()) + 1
mp = {
    name: next_user_id + offset
    for offset, name in enumerate(user_r['user_id'].unique())
}
# Cast to int32 to match the dtype `ratings['user_id']` was loaded with, which
# avoids a possible upcast of the whole column when the frames are concatenated.
user_r['user_id'] = user_r['user_id'].map(mp).astype('int32')

In [None]:
# Preview `user_r` after its user_id column was remapped in the previous cell.
user_r.head()

Unnamed: 0,user_id,anime_id,score
0,361908,32281,10.0
1,361908,41457,10.0
2,361908,47194,10.0
3,361908,48569,10.0
4,361908,49387,10.0


In [None]:
# Split the user ids (not the interactions): 10% become cold-start users,
# the remaining 90% are warm users.
all_users = ratings['user_id'].unique()
warm_users, cold_users = train_test_split(all_users, test_size=0.1, random_state=42)

In [None]:
# Treat the users from `user_r` as warm users. Their ids are taken from the
# data instead of a hard-coded literal, and union1d keeps the array free of
# duplicates if this cell is executed more than once.
warm_users = np.union1d(warm_users, np.asarray(user_r['user_id'].unique()))

In [None]:
# Preprocess genres: split the comma-separated genre string into a list
# (missing values become ['']) and multi-hot encode it, one column per genre.
anime_df = anime_details.copy()
anime_df['genres'] = anime_df['genres'].fillna('').str.split(', ')

mlb = MultiLabelBinarizer()
genre_vectors = mlb.fit_transform(anime_df['genres'])
genre_columns = pd.DataFrame(genre_vectors, columns=mlb.classes_)
anime_df = pd.concat([anime_df, genre_columns], axis=1)


In [None]:
# Append the extra user ratings to the main table and report the row counts.
bfr_shape = len(ratings)
ratings = pd.concat([ratings, user_r], ignore_index=True)
aftr_shape = len(ratings)
print(f'Before: {bfr_shape}, After: {aftr_shape}, Diff: {aftr_shape - bfr_shape}')

Before: 110685120, After: 110685201, Diff: 81


In [None]:
# Show the rows that came from `user_r`; the ids are looked up from the data
# rather than hard-coded, so this keeps working when the assigned id changes.
ratings[ratings['user_id'].isin(user_r['user_id'].unique())]

Unnamed: 0,user_id,anime_id,score
110685120,361908,32281,10.0
110685121,361908,41457,10.0
110685122,361908,47194,10.0
110685123,361908,48569,10.0
110685124,361908,49387,10.0
...,...,...,...
110685196,361908,52034,7.0
110685197,361908,52198,7.0
110685198,361908,38000,6.0
110685199,361908,40052,6.0


In [None]:
# %% [Step 3] Filter Interactions
# Minimum number of ratings a user / an anime needs to pass the filters below.
MIN_RATINGS_PER_USER = 40
MIN_RATINGS_PER_ANIME = 40

In [None]:
# Filter users: keep only interactions of users with enough ratings.
user_counts = ratings['user_id'].value_counts()
valid_users = user_counts.loc[user_counts.ge(MIN_RATINGS_PER_USER)].index
keep_user_rows = ratings['user_id'].isin(valid_users)
ratings = ratings[keep_user_rows]

In [None]:
# Filter animes: keep only interactions of anime with enough ratings
# (counted after the user filter above has been applied).
anime_counts = ratings['anime_id'].value_counts()
valid_animes = anime_counts.loc[anime_counts.ge(MIN_RATINGS_PER_ANIME)].index
keep_anime_rows = ratings['anime_id'].isin(valid_animes)
ratings = ratings[keep_anime_rows]

In [None]:
# Split warm users' interactions, stratified by user so each user's ratings
# are spread over the splits.
is_warm = ratings['user_id'].isin(warm_users)
warm_ratings = ratings[is_warm]

# First cut: 80% train+val, 20% warm test.
train_val, warm_test = train_test_split(
    warm_ratings, test_size=0.2, random_state=42, stratify=warm_ratings['user_id']
)

# Second cut: 25% of the remaining 80% becomes validation (60/20/20 overall).
train, val = train_test_split(
    train_val, test_size=0.25, random_state=42, stratify=train_val['user_id']
)

In [None]:
# Cold-start test set: the (filtered) interactions of the held-out cold users.
# pandas has no module-level `to_csv`; the frame has to be built first and then
# written through DataFrame.to_csv.
cold_test = ratings[ratings['user_id'].isin(cold_users)]
cold_test.to_csv(f'{INPUT_DIR}/cold_test.csv', index=False)

In [None]:
# %% [Step 4] Encoding & Normalization
# Build id -> contiguous index lookups from the training split only, in order
# of first appearance.
train_user_ids = train['user_id'].unique()
train_anime_ids = train['anime_id'].unique()
user2idx = dict(zip(train_user_ids, range(len(train_user_ids))))
anime2idx = dict(zip(train_anime_ids, range(len(train_anime_ids))))

In [None]:
def safe_encode(df):
    """Return a copy of ``df`` with encoded 'user' and 'anime' columns.

    The columns are looked up in the global ``user2idx`` / ``anime2idx``
    dictionaries. Rows whose user or anime has no entry in those dictionaries
    are dropped. The input frame is not modified.
    """
    encoded = df.assign(
        user=df['user_id'].map(user2idx),
        anime=df['anime_id'].map(anime2idx),
    )
    return encoded.dropna(subset=['user', 'anime'])

In [None]:
# Encode every split with the train-derived lookups.
train_enc, val_enc, warm_test_enc = (
    safe_encode(split) for split in (train, val, warm_test)
)

In [None]:
# Normalization: min-max scale the scores using statistics of the training
# split only, and apply the same scaling to every split.
train_min, train_max = train_enc['score'].min(), train_enc['score'].max()

for split in (train_enc, val_enc, warm_test_enc):
    split['score_norm'] = (split['score'] - train_min) / (train_max - train_min)


In [None]:
# %% [Step 6] Prepare Genre Matrix
# Row i of the matrix holds the multi-hot genre vector of the anime encoded as
# index i. One indexed lookup replaces a full DataFrame scan per anime.
genre_lookup = (
    anime_df.drop_duplicates('id')  # keep the first row per id, as the per-anime scan did
    .set_index('id')[list(mlb.classes_)]
)
encoded_ids = np.fromiter(anime2idx.keys(), dtype=np.int64, count=len(anime2idx))
encoded_rows = np.fromiter(anime2idx.values(), dtype=np.int64, count=len(anime2idx))

aligned_genres = genre_lookup.reindex(encoded_ids)
n_missing = int((~np.isin(encoded_ids, genre_lookup.index.to_numpy())).sum())
if n_missing:
    # Anime that were rated but have no metadata row get an all-zero vector
    # instead of aborting the cell with an IndexError.
    print(f'{n_missing} anime have no metadata; using all-zero genre vectors')

genre_matrix = np.zeros((len(anime2idx), len(mlb.classes_)), dtype=np.float32)
genre_matrix[encoded_rows] = aligned_genres.fillna(0).to_numpy(dtype=np.float32)

In [None]:
import pickle

# Everything needed later to encode inputs and de-normalize predictions.
artifacts = {
    'user2idx': user2idx,
    'anime2idx': anime2idx,
    'train_min': train_min,
    'train_max': train_max,
    'mlb' : mlb
}

with open(f'{INPUT_DIR}/artifacts_v2.pkl', 'wb') as artifacts_file:
    pickle.dump(artifacts, artifacts_file)

# The genre matrix is stored separately as a plain .npy array.
np.save(f'{INPUT_DIR}/genre_matrix.npy', genre_matrix)

In [None]:
# %% [Step 7] Build GPU-Optimized Model
def build_gpu_model():
    """Build the (uncompiled) rating-prediction network.

    Named inputs:
        'user'   - int32 user index, shape (1,)
        'anime'  - int32 anime index, shape (1,)
        'genres' - float32 vector with one entry per class of the global ``mlb``

    The embedding table sizes are taken from the global ``user2idx`` and
    ``anime2idx`` dictionaries.

    Returns:
        A ``tf.keras.Model`` mapping the three inputs to one linear output.
    """
    user_in = tf.keras.Input(shape=(1,), name='user', dtype='int32')
    anime_in = tf.keras.Input(shape=(1,), name='anime', dtype='int32')
    genre_in = tf.keras.Input(shape=(len(mlb.classes_),), name='genres', dtype='float32')

    # Embeddings
    u_emb = tf.keras.layers.Embedding(len(user2idx), 128)(user_in)
    a_emb = tf.keras.layers.Embedding(len(anime2idx), 128)(anime_in)

    # Concatenate with genre features
    merged = tf.keras.layers.Concatenate()([
        tf.keras.layers.Flatten()(u_emb),
        tf.keras.layers.Flatten()(a_emb),
        genre_in
    ])

    # Deep layers
    x = tf.keras.layers.Dense(512, activation='relu', kernel_initializer='he_normal')(merged)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Dropout(0.3)(x)
    x = tf.keras.layers.Dense(256, activation='relu')(x)
    x = tf.keras.layers.Dropout(0.2)(x)

    # Output. The layer is pinned to float32 so that, when a mixed_float16
    # policy is active, predictions and the loss are still computed in full
    # precision. With a float32 policy this is a no-op.
    output = tf.keras.layers.Dense(1, activation='linear', dtype='float32')(x)

    return tf.keras.Model(inputs=[user_in, anime_in, genre_in], outputs=output)

In [None]:
# Enable mixed precision before the model is built, so its layers pick up
# the float16 compute policy.
tf.keras.mixed_precision.set_global_policy('mixed_float16')

model = build_gpu_model()

# Adam with element-wise gradient clipping; MSE loss, MAE reported as a metric.
adam = tf.keras.optimizers.Adam(learning_rate=0.001, clipvalue=1.0)
model.compile(loss='mse', optimizer=adam, metrics=['mae'])
model.summary()

In [None]:
def save_as_memmap(df, path):
    """Write one encoded split to three .npy files sharing the prefix ``path``.

    ``{path}_users.npy`` and ``{path}_animes.npy`` hold the 'user' and 'anime'
    columns as int32. ``{path}_scores.npy`` holds the 'score' column min-max
    scaled with the global ``train_min`` / ``train_max``, as float32.
    """
    user_ids = df['user'].to_numpy().astype(np.int32)
    np.save(f'{path}_users.npy', user_ids)

    anime_ids = df['anime'].to_numpy().astype(np.int32)
    np.save(f'{path}_animes.npy', anime_ids)

    scaled_scores = (df['score'] - train_min) / (train_max - train_min)
    np.save(f'{path}_scores.npy', scaled_scores.to_numpy().astype(np.float32))

# Save genre matrix
np.save(f'{INPUT_DIR}/genre_matrix.npy', genre_matrix)

# Save splits: one set of .npy files per encoded split.
for _frame, _prefix in (
    (train_enc, f'{INPUT_DIR}/train'),
    (val_enc, f'{INPUT_DIR}/val'),
    (warm_test_enc, f'{INPUT_DIR}/warm_test'),
):
    save_as_memmap(_frame, _prefix)

In [None]:
def create_memory_efficient_dataset(df_path, genre_matrix_path, batch_size=2048):
    """Build a batched ``tf.data.Dataset`` over one split saved as .npy files.

    Args:
        df_path: Path prefix of the split; ``{df_path}_users.npy``,
            ``{df_path}_animes.npy`` and ``{df_path}_scores.npy`` are opened
            memory-mapped and read-only.
        genre_matrix_path: Path of the genre matrix .npy file, indexed by
            anime index.
        batch_size: Number of rows per yielded batch.

    Returns:
        A dataset of ``({'user', 'anime', 'genres'}, scores)`` batches. It
        makes a single in-order pass over the data (no shuffling, no repeat),
        and the last batch may be smaller than ``batch_size``.
    """
    # Load memory-mapped arrays
    users = np.load(f'{df_path}_users.npy', mmap_mode='r')
    animes = np.load(f'{df_path}_animes.npy', mmap_mode='r')
    scores = np.load(f'{df_path}_scores.npy', mmap_mode='r')
    genres_mm = np.load(genre_matrix_path, mmap_mode='r')

    # Take the genre width from the matrix that is actually served, so the
    # declared signature always matches the data and the function does not
    # depend on a global encoder being defined.
    n_genres = genres_mm.shape[1]

    def generator():
        for start in range(0, len(users), batch_size):
            stop = start + batch_size
            batch_animes = animes[start:stop]
            features = {
                'user': users[start:stop],
                'anime': batch_animes,
                # Row lookup: one genre vector per anime index in the batch.
                'genres': genres_mm[batch_animes],
            }
            yield (features, scores[start:stop])

    return tf.data.Dataset.from_generator(
        generator,
        output_signature=(
            {
                'user': tf.TensorSpec(shape=(None,), dtype=tf.int32),
                'anime': tf.TensorSpec(shape=(None,), dtype=tf.int32),
                'genres': tf.TensorSpec(shape=(None, n_genres), dtype=tf.float32)
            },
            tf.TensorSpec(shape=(None,), dtype=tf.float32))
    ).prefetch(tf.data.AUTOTUNE)

In [None]:
# Streaming datasets for training and validation; both read the same genre matrix.
genre_matrix_file = f'{INPUT_DIR}/genre_matrix.npy'

train_ds = create_memory_efficient_dataset(f'{INPUT_DIR}/train', genre_matrix_file, batch_size=2048)
val_ds = create_memory_efficient_dataset(f'{INPUT_DIR}/val', genre_matrix_file, batch_size=2048)

In [None]:
# %% [Step 9] Train Model
# steps_per_epoch / validation_steps are deliberately not passed. The datasets
# are finite and not repeated, so Keras makes one full pass per epoch and
# restarts the generator for the next one. With `len(...) // 2048` steps, the
# final partial batch was left over after each epoch, and the following epoch
# consumed only that remainder before running out of data (every second epoch
# in the training log finished almost immediately).
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=20,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(patience=3, monitor='val_loss'),
        tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2)
    ]
)



Epoch 1/20
[1m29035/29035[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m147s[0m 5ms/step - loss: 1.7483 - mae: 1.0393 - val_loss: 0.9780 - val_mae: 0.7835 - learning_rate: 0.0010
Epoch 2/20
[1m    1/29035[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m21:09:48[0m 3s/step - loss: 1.7838 - mae: 1.0457



[1m29035/29035[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 734us/step - loss: 1.7838 - mae: 1.0457 - val_loss: 0.9781 - val_mae: 0.7835 - learning_rate: 0.0010
Epoch 3/20
[1m29035/29035[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 5ms/step - loss: 1.7474 - mae: 1.0391 - val_loss: 0.9780 - val_mae: 0.7835 - learning_rate: 0.0010
Epoch 4/20
[1m29035/29035[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 724us/step - loss: 1.7148 - mae: 1.0408 - val_loss: 0.9781 - val_mae: 0.7835 - learning_rate: 5.0000e-04


ValueError: Invalid filepath extension for saving. Please add either a `.keras` extension for the native Keras format (recommended) or a `.h5` extension. Use `model.export(filepath)` if you want to export a SavedModel for use with TFLite/TFServing/etc. Received: filepath=/content/drive/MyDrive/gpu_model.

In [None]:
# %% [Step 10] Save Model
import os

# Make sure the target folder exists so saving does not depend on it having
# been created by hand beforehand.
os.makedirs(f'{INPUT_DIR}/saved_model', exist_ok=True)
model.save(f'{INPUT_DIR}/saved_model/gpu_model.keras')

In [None]:
import pickle

# Restore the encoders and normalization constants written during training.
with open(f'{INPUT_DIR}/artifacts_v2.pkl', 'rb') as artifacts_file:
    artifacts = pickle.load(artifacts_file)

user2idx, anime2idx = artifacts['user2idx'], artifacts['anime2idx']
mlb = artifacts['mlb']
train_min, train_max = artifacts['train_min'], artifacts['train_max']

# Load genre matrix
genre_matrix = np.load(f'{INPUT_DIR}/genre_matrix.npy')

# Load model
model = tf.keras.models.load_model(f'{INPUT_DIR}/saved_model/gpu_model.keras')

  saveable.load_own_variables(weights_store.get(inner_path))


In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import pickle
from sklearn.metrics import mean_absolute_error

In [None]:
# %% [Step 3] Warm User Evaluation
def evaluate_warm(test_path):
    """Compute the MAE (on the original score scale) for a saved split.

    Reads ``{test_path}_users.npy``, ``{test_path}_animes.npy`` and
    ``{test_path}_scores.npy`` memory-mapped, and uses the globals ``model``,
    ``genre_matrix``, ``train_min`` and ``train_max``.

    Args:
        test_path: Path prefix of the split written by ``save_as_memmap``.

    Returns:
        ``{'mae': float or None, 'samples': int}``; ``mae`` is None when the
        split is empty.
    """
    # Memory-efficient data loading
    users = np.load(f'{test_path}_users.npy', mmap_mode='r')
    animes = np.load(f'{test_path}_animes.npy', mmap_mode='r')
    scores = np.load(f'{test_path}_scores.npy', mmap_mode='r')

    batch_size = 2048
    score_range = train_max - train_min
    n_samples = len(users)
    abs_error_sum = 0.0  # running sum, so no per-sample Python list is kept

    for start in range(0, n_samples, batch_size):
        stop = start + batch_size
        batch_animes = np.asarray(animes[start:stop])
        batch = {
            # (batch, 1) matches the shape=(1,) inputs the model declares.
            'user': np.asarray(users[start:stop]).reshape(-1, 1),
            'anime': batch_animes.reshape(-1, 1),
            'genres': genre_matrix[batch_animes],
        }
        # predict_on_batch runs exactly one batch; model.predict is meant for
        # whole datasets and adds per-call overhead when used inside a loop.
        preds = np.asarray(model.predict_on_batch(batch), dtype=np.float64).reshape(-1)

        # Denormalize predictions and targets back to the original score scale.
        preds = preds * score_range + train_min
        true = np.asarray(scores[start:stop], dtype=np.float64) * score_range + train_min
        abs_error_sum += float(np.abs(preds - true).sum())

    return {
        'mae': abs_error_sum / n_samples if n_samples else None,
        'samples': n_samples
    }


In [None]:
# %% [Step 4] Cold-Start Evaluation
def evaluate_cold(cold_data_path):
    """Compute the MAE for cold-start interactions stored in a CSV file.

    The CSV must provide 'anime_id' and 'score' columns. Uses the globals
    ``model``, ``anime2idx``, ``genre_matrix``, ``train_min`` and
    ``train_max``. Cold users have no embedding of their own, so every row is
    scored with user index 0 as a stand-in.

    Args:
        cold_data_path: Path of the cold-start CSV.

    Returns:
        ``{'mae': float or None, 'samples': int}``; ``mae`` is None when no
        row refers to an anime known to ``anime2idx``.
    """
    # Load cold-start data
    cold_df = pd.read_csv(cold_data_path)

    # Select rows by a mask so predictions and true scores stay aligned.
    # Slicing the scores with [:len(preds)] pairs predictions with the wrong
    # rows whenever an unknown anime is skipped in between.
    anime_idx = cold_df['anime_id'].map(anime2idx)
    known = anime_idx.notna().to_numpy()
    anime_indices = anime_idx.to_numpy()[known].astype(np.int32)
    true = cold_df['score'].to_numpy(dtype=np.float64)[known]

    batch_size = 2048
    score_range = train_max - train_min
    n_samples = len(anime_indices)
    abs_error_sum = 0.0

    for start in range(0, n_samples, batch_size):
        stop = start + batch_size
        batch_idx = anime_indices[start:stop]
        preds = model.predict_on_batch({
            # Pseudo-input: user index 0 for every row.
            'user': np.zeros((len(batch_idx), 1), dtype=np.int32),
            'anime': batch_idx.reshape(-1, 1),
            'genres': genre_matrix[batch_idx],
        })

        # Denormalize; the CSV scores are already on the original scale.
        preds = np.asarray(preds, dtype=np.float64).reshape(-1) * score_range + train_min
        abs_error_sum += float(np.abs(preds - true[start:stop]).sum())

    return {
        'mae': abs_error_sum / n_samples if n_samples else None,
        'samples': n_samples
    }

In [None]:
# %% [Step 5] Run Evaluations
# Score the warm-user test split saved under the `warm_test` prefix and print its MAE.
print("Evaluating Warm Users:")
warm_metrics = evaluate_warm(f'{INPUT_DIR}/warm_test')
print(f"Warm MAE: {warm_metrics['mae']:.3f} (n={warm_metrics['samples']})")

Evaluating Warm Users:


In [None]:
# Reload the trained model from the saved .keras file.
model = tf.keras.models.load_model(f'{INPUT_DIR}/saved_model/gpu_model.keras')


  saveable.load_own_variables(weights_store.get(inner_path))


In [None]:
def generate_recommendations(user_ratings_df, anime_metadata_df, model, n=10,
                             artifacts_dir='/content/drive/MyDrive'):
    """Recommend the top-``n`` unrated anime for every user in ``user_ratings_df``.

    Args:
        user_ratings_df: Frame with 'user_id' and 'anime_id' columns; the
            anime a user has already rated are excluded from that user's
            candidates.
        anime_metadata_df: Frame with 'id', 'title', 'genres' and 'synopsis'
            columns; only anime present here are recommended.
        model: Object whose ``predict(inputs, verbose=0)`` accepts a dict with
            'user', 'anime' and 'genres' arrays and returns normalized scores.
        n: Number of recommendations per user.
        artifacts_dir: Folder holding ``artifacts_v2.pkl`` and
            ``genre_matrix.npy``.

    Returns:
        One frame with the columns user_id, anime_id, predicted_score, title,
        genres and synopsis, or an empty frame when nothing could be
        recommended. A user that raises an error is reported and skipped.
        Users missing from the saved ``user2idx`` are scored with user index 0.
    """
    # Load artifacts
    import pickle
    # NOTE: pickle.load can execute arbitrary code; only point artifacts_dir
    # at files written by this notebook.
    with open(f'{artifacts_dir}/artifacts_v2.pkl', 'rb') as f:
        artifacts = pickle.load(f)

    user2idx = artifacts['user2idx']
    anime2idx = artifacts['anime2idx']
    train_min = artifacts['train_min']
    train_max = artifacts['train_max']
    genre_matrix = np.load(f'{artifacts_dir}/genre_matrix.npy').astype(np.float32)

    # The candidate pool does not depend on the user, so it is built once:
    # anime the model knows, that have metadata, and whose index is a valid
    # row of the genre matrix. Ids and indices are filtered together so they
    # stay aligned.
    pool_ids = np.fromiter(anime2idx.keys(), dtype=np.int64, count=len(anime2idx))
    pool_idx = np.fromiter(anime2idx.values(), dtype=np.int64, count=len(anime2idx))
    in_pool = np.isin(pool_ids, anime_metadata_df['id'].to_numpy()) & (pool_idx < genre_matrix.shape[0])
    pool_ids, pool_idx = pool_ids[in_pool], pool_idx[in_pool]

    results = []

    for user_id, group in user_ratings_df.groupby('user_id'):
        try:
            # Drop what the user has already rated (vectorized membership test).
            unrated = ~np.isin(pool_ids, group['anime_id'].to_numpy())
            if not unrated.any():
                continue
            candidate_ids = pool_ids[unrated]
            anime_indices = pool_idx[unrated]

            # Unknown users fall back to index 0.
            user_idx = user2idx.get(user_id, 0)

            inputs = {
                'user': np.full(len(anime_indices), user_idx, dtype=np.int32).reshape(-1, 1),
                'anime': anime_indices.astype(np.int32).reshape(-1, 1),
                'genres': genre_matrix[anime_indices].astype(np.float32)
            }

            # Predict, clamp to the normalized range, and denormalize
            preds = np.asarray(model.predict(inputs, verbose=0), dtype=np.float64).flatten()
            preds = np.clip(preds, 0., 1.)
            preds_denorm = preds * (train_max - train_min) + train_min

            rec_df = pd.DataFrame({
                'user_id': user_id,
                'anime_id': candidate_ids,
                'predicted_score': preds_denorm.astype(np.float64)
            })

            # Get top N recommendations
            rec_df = rec_df.nlargest(n, 'predicted_score', keep='first')

            # Attach the descriptive columns
            rec_df = rec_df.merge(
                anime_metadata_df[['id', 'title', 'genres', 'synopsis']],
                left_on='anime_id',
                right_on='id',
                how='left'
            ).drop(columns=['id'])

            results.append(rec_df)

        except Exception as e:
            # Best effort: report the failing user and continue with the rest.
            print(f"Skipping user {user_id} due to error: {str(e)}")
            continue

    return pd.concat(results, ignore_index=True) if results else pd.DataFrame()

In [None]:
# Top-100 recommendations for the user(s) contained in `user_r`.
df = generate_recommendations(user_r, anime_details, model, n=100)

In [None]:
# prompt: sort df with predicted score

# Sorted from highest to lowest predicted score, so .tail(20) shows the 20
# lowest-scored rows of the recommendation list.
df_sorted = df.sort_values(by='predicted_score', ascending=False)
df_sorted.tail(20)


Unnamed: 0,user_id,anime_id,predicted_score,title,genres,synopsis
80,361908,8675,10.0,Seitokai Yakuindomo,"Comedy, Gag Humor, School, Shounen",On his first day of high school at the formerl...
81,361908,227,10.0,FLCL,"Action, Avant Garde, Comedy, Mecha, Parody, Ps...",Naota Nandaba is an ordinary sixth grader livi...
82,361908,379,10.0,Heppoko Jikken Animation Excel♥Saga,"Adult Cast, Comedy, Gag Humor, Parody, Sci-Fi,...","It is hard to take over the world, and the eni..."
83,361908,22377,10.0,Wonder Momo,"Action, Martial Arts, School, Strategy Game","One day, a group of aliens planning to take ov..."
84,361908,8487,10.0,Onigamiden,"Action, Award Winning, Mythology, Supernatural...","A 15-year-old boy travels back in time—over 1,..."
85,361908,22535,10.0,Kiseijuu: Sei no Kakuritsu,"Action, Gore, Horror, Psychological, Sci-Fi, S...","All of a sudden, they arrived: parasitic alien..."
86,361908,1078,10.0,Cardcaptor Sakura: Kero-chan ni Omakase!,"Comedy, Gourmet, Shoujo",Following the events of Cardcaptor Sakura Movi...
87,361908,27775,10.0,Plastic Memories,"Drama, Romance, Sci-Fi",Eighteen-year-old Tsukasa Mizugaki has failed ...
88,361908,150,10.0,Blood+,"Action, Drama, Gore, Horror, Military, Mystery...",Saya Otonashi is a seemingly ordinary girl liv...
89,361908,20045,10.0,IS: Infinite Stratos 2 - Hitonatsu no Omoide,"Action, Comedy, Harem, Sci-Fi",An extended version of the second season's fir...
