In [1]:
import pandas as pd
import numpy as np
import pickle
import itertools
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from multiprocessing import Pool, cpu_count
from tqdm import tqdm

In [2]:
def load_models_and_data(model_path='../streamlit-app/models/random_forest.pkl',
                         data_path='../streamlit-app/data/prepared_random_forest_data.pkl'):
    """Load the pickled model and its prepared-data bundle from disk.

    Args:
        model_path: Path of the pickled model file.
        data_path: Path of the pickled prepared-data file.

    Returns:
        ``(model, prepared_data)`` on success. If either file cannot be
        opened or unpickled, a message is printed and ``(None, None)`` is
        returned, so callers must check for ``None`` before using the result.
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only point these paths at artefacts produced by a trusted pipeline.
    try:
        with open(model_path, 'rb') as f:
            model = pickle.load(f)
        with open(data_path, 'rb') as f:
            prepared_data = pickle.load(f)
    except Exception as e:
        # Report the paths involved; the previous message interpolated the
        # (still empty) model object, which said nothing about what failed.
        print(f"Failed to load model or data from {model_path!r} / {data_path!r}: {e}")
        return None, None

    return model, prepared_data

# Load the pickled model and prepared data once; evaluate_combination reads
# both as notebook-level globals. Both names are None if loading failed.
model, prepared_data = load_models_and_data()

# Candidate values for every input dimension explored by the search.
genres = [
    "Action", "Adventure", "Comedy", "Drama", "Sci-Fi", "Fantasy",
    "Romance", "Slice of Life", "Mystery", "Supernatural", "Sports", "Historical",
    "Horror", "Psychological", "Thriller", "Ecchi", "Mecha", "Music",
    "Harem", "Gourmet", "Parody", "Dementia", "Super Power", "School",
    "Josei", "Vampire", "Hentai", "Police", "Space", "Demons",
    "Martial Arts", "Military", "Cars", "Samurai", "Magic", "Kids",
    "Game", "Shoujo Ai", "Shounen Ai", "Yaoi", "Yuri", "Isekai",
    "Seinen", "Shounen",
]

types = [
    "TV", "Movie", "OVA", "ONA", "Special",
    "Music", "PV", "CM", "TV Special",
]

sources = [
    "Original", "Manga", "Light novel", "Visual novel", "Game",
    "Novel", "4-koma manga", "Book", "Card game", "Music",
    "Mixed media", "Picture book", "Web manga", "Other",
]

demographics = ["Shounen", "Seinen", "Shoujo", "Josei", "Kids"]

# Search grid: one entry per input dimension, in the order the combination
# tuples are built. The two count dimensions span 1..20 and 1..10 inclusive.
all_features = dict(
    type=types,
    source=sources,
    demographic=demographics,
    genre=genres,
    producers_count=range(1, 21),
    platform_count=range(1, 11),
)

In [3]:
def _split_genres(value):
    """Return the set of genre names held in one cell of the ``genre`` column."""
    if isinstance(value, str):
        return {part.strip() for part in value.split(',') if part.strip()}
    if isinstance(value, (list, tuple, set)):
        return {str(item).strip() for item in value}
    return set()  # NaN / None: no genres selected


def preprocess_features(features, model, prepared_data):
    """Turn raw feature rows into the exact column layout the model expects.

    Args:
        features: DataFrame that may contain the columns ``genre`` (a genre
            name, a comma-separated string of names, or a list of names),
            ``type``, ``source``, ``demographic``, ``producers_count`` and
            ``platform_count``. Columns that are absent are skipped.
        model: Fitted estimator exposing ``feature_names_in_``.
        prepared_data: Mapping that may hold a fitted ``'scaler'``; ``None``
            or a mapping without that key means no scaling is applied.

    Returns:
        DataFrame indexed like ``features`` whose columns are exactly
        ``model.feature_names_in_`` in that order. Expected columns that no
        input populated are filled with 0.

    Raises:
        KeyError: if a scaler is present but ``Producer_Count`` or
            ``Platform_Count`` could not be built from ``features``.

    NOTE(review): genre indicator columns are assumed to be named either
    with the bare genre (``"Action"``) or with a ``Genre_`` prefix
    (``"Genre_Action"``) -- confirm against the training pipeline.
    """
    expected_features = model.feature_names_in_.tolist()
    columns = {}

    if 'genre' in features.columns:
        # Match whole genre names. The previous implementation iterated over
        # the individual characters of the expected feature names, which
        # produced single-character columns that were dropped by the final
        # column selection, leaving every genre indicator at 0.
        genre_sets = features['genre'].apply(_split_genres)
        selected = set().union(*genre_sets)
        for expected_feature in expected_features:
            if expected_feature.startswith('Genre_'):
                genre_name = expected_feature[len('Genre_'):]
            else:
                genre_name = expected_feature
            if genre_name in selected:
                columns[expected_feature] = genre_sets.apply(
                    lambda names, wanted=genre_name: int(wanted in names)
                )

    # One-hot columns are expected to be named "<Feature>_<category>",
    # e.g. "Type_TV" for features['type'] == "TV".
    for feature in ('type', 'source', 'demographic'):
        if feature not in features.columns:
            continue
        prefix = f"{feature.capitalize()}_"
        for expected_feature in expected_features:
            if expected_feature.startswith(prefix):
                category = expected_feature.split('_', 1)[1]
                columns[expected_feature] = (features[feature] == category).astype(int)

    if 'producers_count' in features.columns:
        columns['Producer_Count'] = features['producers_count']
    if 'platform_count' in features.columns:
        columns['Platform_Count'] = features['platform_count']

    processed_features = pd.DataFrame(columns, index=features.index)

    # Compare against None explicitly: truthiness of an arbitrary estimator
    # object is not a reliable "is present" test.
    scaler = prepared_data.get('scaler') if prepared_data else None
    if scaler is not None:
        # NOTE(review): assumes the scaler was fitted on exactly these two
        # columns in this order -- confirm against the training pipeline.
        numerical_features = ['Producer_Count', 'Platform_Count']
        processed_features[numerical_features] = scaler.transform(processed_features[numerical_features])

    # Zero-fill missing expected columns, drop extras and fix the column
    # order in a single step.
    return processed_features.reindex(columns=expected_features, fill_value=0)

In [4]:
def evaluate_combination(combination):
    """Predict the rank for one combination of feature values.

    ``combination`` is a tuple whose entries line up with the keys of the
    notebook-level ``all_features`` dict. The notebook-level ``model`` and
    ``prepared_data`` are used for preprocessing and prediction.

    Returns:
        ``(combination, rank)`` where ``rank`` is the model's prediction for
        the single row built from ``combination``.
    """
    row = {name: value for name, value in zip(all_features.keys(), combination)}
    single_row_frame = pd.DataFrame([row])
    model_input = preprocess_features(single_row_frame, model, prepared_data)
    predictions = model.predict(model_input)
    return combination, predictions[0]

def find_optimal_combination(fixed_features=None, chunksize=256):
    """Exhaustively search the feature grid for the lowest predicted rank.

    Args:
        fixed_features: Optional ``{feature_name: value}`` mapping. Each
            listed feature is pinned to that single value for this search
            only; the notebook-level ``all_features`` grid is left untouched.
        chunksize: Number of combinations handed to a worker per task.

    Returns:
        ``(best_combination, best_rank)`` where ``best_combination`` is a
        dict keyed by feature name, or ``None`` if no combination produced a
        rank below infinity.

    Raises:
        ValueError: if ``fixed_features`` names a feature that is not a key
            of ``all_features``.
    """
    # Work on a shallow copy. Writing the pinned value back into the shared
    # all_features dict would silently restrict every later search as well.
    search_space = dict(all_features)
    if fixed_features:
        unknown = set(fixed_features) - set(search_space)
        if unknown:
            raise ValueError(f"Unknown feature(s) in fixed_features: {sorted(unknown)}")
        for feature, value in fixed_features.items():
            search_space[feature] = [value]

    # Size of the grid for the progress bar, computed without materialising
    # the (potentially multi-million element) product.
    total = 1
    for values in search_space.values():
        total *= len(values)

    feature_names = list(search_space.keys())
    best_combination = None
    best_rank = float('inf')

    # NOTE(review): under the 'spawn' start method, worker processes may be
    # unable to import functions defined in a notebook cell -- confirm this
    # runs under 'fork', or move evaluate_combination into a module.
    with Pool(cpu_count()) as pool:
        results = pool.imap_unordered(
            evaluate_combination,
            itertools.product(*search_space.values()),  # streamed lazily
            chunksize=chunksize,
        )
        for combination, rank in tqdm(results, total=total):
            if rank < best_rank:
                best_rank = rank
                best_combination = dict(zip(feature_names, combination))

    return best_combination, best_rank

In [5]:
# Unconstrained search over the whole all_features grid (5,544,000
# combinations per the progress bar below); expect a long runtime.
overall_best_combo, overall_best_rank = find_optimal_combination()

  0%|          | 0/5544000 [00:00<?, ?it/s]

In [None]:
# Persist the best overall combination together with its predicted rank as a
# single (combination, rank) tuple.
with open('../streamlit-app/models/overall_best_combination.pkl', mode='wb') as fh:
    pickle.dump((overall_best_combo, overall_best_rank), fh)

In [None]:
# For every value of each categorical feature, find the best combination with
# that one feature pinned. Results are stored as
# specific_best_combos[feature][value] = (best_combo, best_rank).
#
# The search helper may narrow entries of the shared all_features grid to the
# pinned value. Iterate over a snapshot and restore the grid after every
# call so each search pins exactly one feature and the grid is intact
# afterwards (even if a search is interrupted).
full_search_space = dict(all_features)

specific_best_combos = {}
for feature in ['type', 'source', 'demographic', 'genre']:
    specific_best_combos[feature] = {}
    for value in full_search_space[feature]:
        try:
            best_combo, best_rank = find_optimal_combination({feature: value})
        finally:
            all_features.update(full_search_space)
        specific_best_combos[feature][value] = (best_combo, best_rank)

In [None]:
# Persist the nested {feature: {value: (best_combo, best_rank)}} results.
with open('../streamlit-app/models/specific_best_combinations.pkl', mode='wb') as fh:
    pickle.dump(specific_best_combos, fh)