# 0. Install OpenCLIP (`open_clip_torch`) for text feature extraction

In [None]:
%pip install open_clip_torch -i https://pypi.tuna.tsinghua.edu.cn/simple

# 1. Import the libraries and load the datasets

In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
import torch
import open_clip
from PIL import Image
from tqdm import tqdm
from sklearn.linear_model import SGDClassifier
from sklearn.utils import shuffle
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import json
import random
import joblib

In [None]:
# Load the two CSV tables from the working directory: the anime table first,
# then the user ratings table.
# NOTE(review): later cells look rows up via `anime_data_df.index` / `.loc[anime_id]`,
# but no `index_col` is set here -- confirm the index really holds anime ids.
anime_data_df, rating_df = map(pd.read_csv, ('./anime_data.csv', './ratings.csv'))

# 2. Prepare for training

In [None]:
# Binary target: a rating of 6 or higher counts as "liked" (1), anything else as 0.
# A vectorized comparison avoids calling a Python lambda once per row; missing
# ratings compare False and therefore map to 0.
rating_df['liked'] = (rating_df['rating'] >= 6).astype('int64')

In [None]:
def batch_generator(indexes, rating_df, anime_data_df, batch_size=128):
    """Yield ``(X, y)`` mini-batches for the given rating rows, in order.

    Args:
        indexes: Row positions into ``rating_df`` (rows are fetched with
            ``.iloc``).
        rating_df: Ratings table providing ``anime_id`` and ``rating``.
        anime_data_df: Table whose index is matched against ``anime_id``;
            the matching row's values, cast to float32, form the features.
        batch_size: Number of rating rows consumed per batch.

    Yields:
        A pair of NumPy arrays: the feature rows and their binary labels
        (1 when ``rating >= 6``, otherwise 0). Ratings whose ``anime_id`` is
        absent from ``anime_data_df.index`` are dropped, so a batch may hold
        fewer than ``batch_size`` rows or be empty.
    """
    total = len(indexes)
    positions = np.array(indexes)
    known_ids = anime_data_df.index

    for start in range(0, total, batch_size):
        feature_rows, labels = [], []
        for pos in positions[start:start + batch_size]:
            record = rating_df.iloc[pos]
            anime_id, rating = record['anime_id'], record['rating']
            if anime_id not in known_ids:
                # No feature row for this anime: skip the rating entirely.
                continue
            feature_rows.append(
                anime_data_df.loc[anime_id].values.astype(np.float32)
            )
            labels.append(1 if rating >= 6 else 0)
        yield np.array(feature_rows), np.array(labels)

In [None]:
# Hold out 20% of the rating rows as the test set, then carve 10% of what is
# left off as a validation set (roughly 72% / 8% / 20% train / val / test).
# The fixed random_state makes both splits reproducible.
train_idx, test_idx = train_test_split(
    rating_df.index,
    random_state=42,
    test_size=0.2,
)
train_idx, val_idx = train_test_split(
    train_idx,
    random_state=42,
    test_size=0.1,
)

# 3. Train & Evaluate

## 3.1 Model Training

In [None]:
def balanced_batch_generator(indexes, rating_df, anime_data_df, batch_size=128, neg_pos_ratio=1.0):
    """Yield shuffled, class-rebalanced ``(X, y)`` training mini-batches.

    The row positions are shuffled once, then consumed in windows of
    ``2 * batch_size`` rows. Inside each window the rows are split into
    positives (``rating >= 6``) and negatives and resampled so that the batch
    holds about ``neg_pos_ratio`` negatives per positive.

    Args:
        indexes: Row positions into ``rating_df`` (rows are fetched with
            ``.iloc``). The caller's sequence is not modified.
        rating_df: Ratings table providing ``anime_id`` and ``rating``.
        anime_data_df: Table whose index is matched against ``anime_id``;
            the matching row's values, cast to float32, form the features.
        batch_size: Target batch size; also sets the window size.
        neg_pos_ratio: Desired number of negatives per positive.

    Yields:
        A pair of NumPy arrays: the feature rows and their binary labels.
        Windows that end up with no samples (for example, no usable
        positives) are skipped instead of raising on an empty ``zip``.
    """
    # Shuffle a private copy exactly once.
    shuffled = list(indexes)
    random.shuffle(shuffled)

    window = batch_size * 2
    # Upper bound on positives so that positives + negatives fit batch_size.
    pos_quota = int(batch_size // (1 + neg_pos_ratio))
    known_ids = anime_data_df.index

    for start in range(0, len(shuffled), window):
        pos_samples = []
        neg_samples = []

        for idx in shuffled[start:start + window]:
            row = rating_df.iloc[idx]
            anime_id = row['anime_id']
            rating = row['rating']
            if anime_id not in known_ids:
                # No feature row for this anime: skip the rating entirely.
                continue

            features = anime_data_df.loc[anime_id].values.astype(np.float32)
            if rating >= 6:
                pos_samples.append((features, 1))
            else:
                neg_samples.append((features, 0))

        # Resample to the requested ratio. Positives are drawn without
        # replacement; negatives are drawn with replacement, capped at the
        # number of negatives available in this window.
        num_pos = min(len(pos_samples), pos_quota)
        num_neg = min(int(num_pos * neg_pos_ratio), len(neg_samples))

        batch = random.sample(pos_samples, num_pos)
        batch += random.choices(neg_samples, k=num_neg)
        if not batch:
            # Unpacking zip(*[]) into two names raises ValueError, so an
            # empty window must not reach the unpacking below.
            continue

        random.shuffle(batch)
        X_batch, y_batch = zip(*batch)

        yield np.array(X_batch), np.array(y_batch)

In [None]:
# Linear classifier fitted with SGD on the logistic loss.
# NOTE(review): training below goes through partial_fit, one batch per call --
# confirm whether max_iter / tol have any effect in that mode.
model = SGDClassifier(
    loss='log_loss',
    tol=1e-3,
    max_iter=1000,
    n_jobs=-1,
    random_state=42,
)

# Ten passes over the training split.
for epoch in range(10):
    print(f"Epoch {epoch+1}")
    # A fresh generator per epoch: it reshuffles the rows and rebalances each
    # batch towards two negatives per positive.
    train_generator = balanced_batch_generator(
        indexes=train_idx,
        rating_df=rating_df,
        anime_data_df=anime_data_df,
        batch_size=4096,
        neg_pos_ratio=2.0,
    )
    for X_batch, y_batch in tqdm(train_generator, desc="Training"):
        # `classes` is passed on every call so that a batch containing a
        # single label is still accepted.
        model.partial_fit(X_batch, y_batch, classes=np.array([0, 1]))
    # Checkpoint the model after every epoch.
    joblib.dump(model, f'animeSVCBoost_{epoch+1}.pkl')

## 3.2 Evaluation

In [None]:
# Stream the held-out test split through the plain (unbalanced) generator so
# the class distribution of the ratings is preserved.
test_generator = batch_generator(test_idx, rating_df, anime_data_df, 4096)

all_preds = []
all_true = []

# Collect predictions batch by batch.
for X_batch, y_batch in tqdm(test_generator, desc="Testing Batches", unit="batch"):
    # A batch is empty when none of its anime_ids is present in
    # anime_data_df.index; predict() cannot be called on it.
    if len(X_batch) > 0:
        # Use the estimator trained in section 3.1 (`model`); `model1` is not
        # defined anywhere in this notebook and raised NameError.
        preds = model.predict(X_batch)
        all_preds.extend(preds)
        all_true.extend(y_batch)

# Output the per-class precision / recall / F1 for the test split.
print("\nEvaluation Results:")
print(classification_report(all_true, all_preds, digits=4))