# 1. Import the libraries and dataset

In [3]:
%pip install open_clip_torch -i https://pypi.tuna.tsinghua.edu.cn/simple

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting open_clip_torch
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/32/f9/0458745c1d299411161ee3b6c32228a3de0be1d8497d779fd7f17a8e96aa/open_clip_torch-2.32.0-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
Collecting ftfy (from open_clip_torch)
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/ab/6e/81d47999aebc1b155f81eca4477a616a70f238a2549848c38983f3c22a82/ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.9.0->open_clip_torch)
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/9f/fd/713452cd72343f682b1c7b9321e23829f00b842ceaedcda96e742ea0b0b3/nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl (664.8 MB)
[2K     

In [4]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
import torch
import open_clip
from PIL import Image
from tqdm import tqdm
from sklearn.linear_model import SGDClassifier
from sklearn.utils import shuffle
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import json
import random
import joblib

In [3]:
anime_csv = '/kaggle/input/anime-recommendations-database/anime.csv'
rating_csv = '/kaggle/input/anime-recommendations-database/rating.csv'

anime_df = pd.read_csv(anime_csv)
rating_df = pd.read_csv(rating_csv)

# 2. Data Exploration

## 2.1 Data Visualization

In [None]:
anime_df

In [None]:
# Check the missing values for each column
print("Missing Values Summary")
print(anime_df.isnull().sum())

# Check the duplicated rows
print("The number of duplicated rows: ", anime_df.duplicated().sum())

In [None]:
rating_df

In [None]:
# Check the missing values for each column
print("Missing Values Summary")
print(rating_df.isnull().sum())

# Check the duplicated rows
print("The number of duplicated rows: ", rating_df.duplicated().sum())

## 2.2 Clean the dataset

In [None]:
# Replace the missing rating values in anime_df with the average value
anime_df['rating'].fillna(anime_df['rating'].mean(), inplace=True)

# Missing values in Genre and Type column will be replaced with "Unknown" 
anime_df['genre'].fillna("Unknown", inplace=True)
anime_df['type'].fillna("Unknown", inplace=True)
anime_df['episodes'] = anime_df['episodes'].replace('Unknown', 0).astype(int)

In [None]:
# Drop the duplicated row in rating_df
rating_df.drop_duplicates(inplace=True)
rating_df = rating_df.reset_index(drop=True)

# 3. Data Pre-processing

## 3.1 Encode the categorical features(genre & type)

In [None]:
# --- Step 1: Encode the Genre column ---

# Split the Genre column value to a list of categories
genre_split = anime_df['genre'].fillna("").apply(lambda x: [g.strip() for g in x.split(',') if g.strip()])

# Apply MultiLabelBinarizer to do multi-hot encoding
mlb = MultiLabelBinarizer()
genre_encoded = pd.DataFrame(mlb.fit_transform(genre_split), columns=[f"genre_{g}" for g in mlb.classes_])

In [None]:
# --- Step 2: Encode the Type column ---
type_encoder = LabelEncoder()
type_encoded = pd.Series(type_encoder.fit_transform(anime_df['type']), name='type_encoded')

## 3.2 Encode the name by CLIP

In [None]:
# --- Step 3: Encode the Name column ---
# Initialize the model
model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion2b_s34b_b79k')
tokenizer = open_clip.get_tokenizer('ViT-B-32')
model.eval().cuda()

# Convert anime name to a list
texts = anime_df['name'].astype(str).tolist()

# A list for encoding features
all_text_features = []

# Deal them with 32 batch
batch_size = 32
for i in tqdm(range(0, len(texts), batch_size)):
    batch_texts = texts[i:i+batch_size]
    with torch.no_grad():
        tokens = tokenizer(batch_texts).cuda()
        features = model.encode_text(tokens)
        features = features / features.norm(dim=1, keepdim=True)
        # Move features to cpu to avoid of running out of memory
        all_text_features.append(features.cpu())
    del tokens, features
    # Release the memory
    torch.cuda.empty_cache()

# Contact all the features
text_features_tensor = torch.cat(all_text_features, dim=0)

## 3.3 Build a new dataframe for model fit

In [None]:
anime_data_df = anime_df.drop(columns=['genre', 'type', 'name'])
text_feature_df = pd.DataFrame(text_features_tensor.numpy())
anime_data_df = pd.concat([anime_data_df, genre_encoded, text_feature_df], axis=1)

In [None]:
anime_data_df

## 3.4 Save the well-preprocessed data to local

In [None]:
anime_data_df.to_csv('anime_data.csv', index=False)
rating_df.to_csv('ratings.csv', index=False)

## 3.5 Read data from local

In [5]:
anime_data_df = pd.read_csv('/kaggle/input/processeddata/anime_data.csv')
rating_df = pd.read_csv('/kaggle/input/processeddata/ratings.csv')

# 4. Model build & training

## 4.1 Prepare the dataset

In [6]:
# Set label --- Rate >= 6 stands for Like
rating_df['liked'] = rating_df['rating'].apply(lambda x: 1 if x >= 6 else 0)

In [7]:
def batch_generator(indexes, rating_df, anime_data_df, batch_size=128):
    num_samples = len(indexes)
    indexes = np.array(indexes)
    for i in range(0, num_samples, batch_size):
        batch_idx = indexes[i:i+batch_size]
        X_batch = []
        y_batch = []
        for idx in batch_idx:
            row = rating_df.iloc[idx]
            anime_id = row['anime_id']
            rating = row['rating']
            if anime_id in anime_data_df.index:
                features = anime_data_df.loc[anime_id].values.astype(np.float32)
                label = 1 if rating >= 6 else 0
                X_batch.append(features)
                y_batch.append(label)
        yield np.array(X_batch), np.array(y_batch)

In [8]:
# Split the dataset
train_idx, test_idx = train_test_split(rating_df.index, test_size=0.2, random_state=42)
train_idx, val_idx = train_test_split(train_idx, test_size=0.1, random_state=42)

## 4.2 Train a SVM model for classification

In [None]:
# Initialize the model
model = SGDClassifier(loss="hinge", max_iter=1, warm_start=True)

# SVM Training
for epoch in range(10):
    print(f"Epoch {epoch+1}")
    for X_batch, y_batch in tqdm(batch_generator(train_idx, rating_df, anime_data_df, 4096)):
        if len(X_batch) > 0:
            model.partial_fit(X_batch, y_batch, classes=np.array([0, 1]))

## 4.3 Save the model

In [None]:
# Fetch all the parameters of the model
params = model.get_params() 

# Get the weights and bias
coef = model.coef_.tolist()
intercept = model.intercept_.tolist()

# Save to a dict
model_data = {
    "params": params,
    "coef": coef,
    "intercept": intercept
}

# Save as json
with open('sgd_model_params.json', 'w') as f:
    json.dump(model_data, f)

## 4.4 Evaluate the model

In [None]:
# Build the tester generator
test_generator = batch_generator(test_idx, rating_df, anime_data_df, 4096)

all_preds = []
all_true = []

# Test the model
for X_batch, y_batch in tqdm(test_generator, desc="Testing Batches", unit="batch"):
    if len(X_batch) > 0:
        preds = model.predict(X_batch)
        all_preds.extend(preds)
        all_true.extend(y_batch)

# Output the results of the test
print("\nEvaluation Results:")
print(classification_report(all_true, all_preds, digits=4))

# 5. Model Boosting

## 5.1 Refine the generator

In [9]:
def balanced_batch_generator(indexes, rating_df, anime_data_df, batch_size=128, neg_pos_ratio=1.0):
    # Shuffle the indexes
    indexes = list(indexes)
    random.shuffle(indexes)
    num_samples = len(indexes)
    indexes = np.array(indexes)
    random.shuffle(indexes)
    num_samples = len(indexes)
    indexes = np.array(indexes)

    for i in range(0, num_samples, batch_size * 2):
        batch_idx = indexes[i:i + batch_size * 2]
        pos_samples = []
        neg_samples = []

        for idx in batch_idx:
            row = rating_df.iloc[idx]
            anime_id = row['anime_id']
            rating = row['rating']
            if anime_id not in anime_data_df.index:
                continue

            features = anime_data_df.loc[anime_id].values.astype(np.float32)
            label = 1 if rating >= 6 else 0

            if label == 1:
                pos_samples.append((features, label))
            else:
                neg_samples.append((features, label))

        # Resampling by the given ratio
        num_pos = int(min(len(pos_samples), batch_size // (1 + neg_pos_ratio)))
        num_neg = int(num_pos * neg_pos_ratio)

        pos_samples = random.sample(pos_samples, min(num_pos, len(pos_samples)))
        neg_samples = random.choices(neg_samples, k=min(num_neg, len(neg_samples)))

        # Combine and Shuffle
        batch = pos_samples + neg_samples
        random.shuffle(batch)
        X_batch, y_batch = zip(*batch)

        yield np.array(X_batch), np.array(y_batch)

## 5.2 Balance Training

In [11]:
# Initialize the model
model = SGDClassifier(
    loss='log_loss',
    random_state=42,
    max_iter=1000,
    tol=1e-3,
    n_jobs=-1
)

# Start the training
for epoch in range(10):
    print(f"Epoch {epoch+1}")
    train_generator = balanced_batch_generator(
        train_idx, 
        rating_df, 
        anime_data_df,
        batch_size=4096,
        neg_pos_ratio=2.0
    )
    for X_batch, y_batch in tqdm(train_generator, desc="Training"):
        model.partial_fit(X_batch, y_batch, classes=np.array([0, 1]))
    # Save the model
    joblib.dump(model, f'animeSVCBoost_{epoch+1}.pkl')

Epoch 1


Training: 687it [27:30,  2.40s/it]


Epoch 2


Training: 687it [27:35,  2.41s/it]


Epoch 3


Training: 687it [27:26,  2.40s/it]


Epoch 4


Training: 687it [27:24,  2.39s/it]


Epoch 5


Training: 687it [27:35,  2.41s/it]


Epoch 6


Training: 687it [27:41,  2.42s/it]


Epoch 7


Training: 687it [27:38,  2.41s/it]


Epoch 8


Training: 687it [27:45,  2.42s/it]


Epoch 9


Training: 687it [27:42,  2.42s/it]


Epoch 10


Training: 687it [27:35,  2.41s/it]


In [13]:
# Build the tester generator
test_generator = batch_generator(test_idx, rating_df, anime_data_df, 4096)

all_preds = []
all_true = []

# Test the model
for X_batch, y_batch in tqdm(test_generator, desc="Testing Batches", unit="batch"):
    if len(X_batch) > 0:
        preds = model.predict(X_batch)
        all_preds.extend(preds)
        all_true.extend(y_batch)

# Output the results of the test
print("\nEvaluation Results:")
print(classification_report(all_true, all_preds, digits=4))

Testing Batches: 382batch [08:19,  1.31s/batch]



Evaluation Results:
              precision    recall  f1-score   support

           0     0.2441    0.3291    0.2803    281430
           1     0.7474    0.6608    0.7014    845415

    accuracy                         0.5779   1126845
   macro avg     0.4957    0.4949    0.4908   1126845
weighted avg     0.6217    0.5779    0.5962   1126845

